PPNet

下述流程，暂不考虑连续特征
- 假设所有连续特征都做了离散化处理
- 实现细节diff参考模型文件注释

In [1]:
%cd /playground/sgd_deep_learning/sgd_rec_sys/
import sys
sys.path.append('./python')

/playground/sgd_deep_learning/sgd_rec_sys


In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.optim as optim

import pickle
import numpy as np

from sgd_rec_sys.cross import PPNet
from sgd_rec_sys.data import FakeCtrDataFactory, CtrDataset011, ctr_collate_fn_011 # 只处理离散特征

## CTR fake data生成
```
 multi-hot的最大采样数可以在FakeCtrDataFactory中设置，默认为4
    def make_multi_hot_fea(self, n_samples, fea_list, max_len=4):
        ...
```

In [3]:
# input
n_samples = 1000 # 总样本数
n_dense_fea = 0 # 0 不生成连续数据
K = 16

# uid、pid、aid + 3个其他特征, 字典大小分别为100,100,100, 60,34,42
one_hot_fea_list = [100, 100, 100, 60, 34, 42]
one_hot_fea_emb_dim = [K] * len(one_hot_fea_list) # fea可设置不同编码维度

multi_hot_fea_list = [10, 20, 30] # 3个fea，字典大小分别为10，20，30
multi_hot_fea_emb_dim = [K] * len(multi_hot_fea_list) # fea可设置不同编码维度

left_emb_dim = sum(one_hot_fea_emb_dim[3:]) + sum(multi_hot_fea_emb_dim)

## 生成伪CTR数据
dump_file = './data/fake/tmp.pkl'
fake_data_factory = FakeCtrDataFactory(n_samples,
                                    n_dense_fea,
                                    one_hot_fea_list,
                                    multi_hot_fea_list,
                                    dtype=np.float32)
fake_data_factory.presist(dump_file)

one-hot feas success, shape: (1000, 6)
multi-hot feas success
label success, shape: (1000,)


## 参数设置

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device:', device)

# 训练参数
train_batch_size = 64
epochs = 5

with open(dump_file,'rb') as f:
    fake_data = pickle.load(f)
train_ds = CtrDataset011(fake_data)
train_dl = DataLoader(train_ds, batch_size=train_batch_size, shuffle=True, collate_fn=ctr_collate_fn_011(device))

# 网络结构
tower_hidden_dims = [left_emb_dim*2, 1024, 256] # deep net 不包括分类层
reduction_ratio = 2 # senet中，类似autoencoder中间表示维度 = input_dim/reduction_ratio

# 定义模型
model = PPNet(tower_hidden_dims, # 保证左右塔相同，右塔单独做condition操作
                 one_hot_fea_list,
                 one_hot_fea_emb_dim,
                 multi_hot_fea_list,    # 每个特征的字典容量
                 multi_hot_fea_emb_dim, # 每个特征的编码维度
                ).to(device)

criterion = nn.BCELoss()  # 二分类交叉熵损失函数
optimizer = optim.SGD(model.parameters(), lr=0.01)  # SGD 优化器
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-1, weight_decay=0.3)


device: cuda


  from .autonotebook import tqdm as notebook_tqdm


## training

In [5]:
def train(dataloader, model, epochs=1,):
    for epoch in range(epochs):
        for x in dataloader:
            y, one_hot_x, multi_hot_x = x
            # 前向传播
            outputs = model((one_hot_x, multi_hot_x))
            loss = criterion(outputs, y.reshape(-1,1))
        
        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # log
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')
            
train(train_dl, model, epochs)

Epoch [1/5], Loss: 0.6937
Epoch [2/5], Loss: 0.6934
Epoch [3/5], Loss: 0.6944
Epoch [4/5], Loss: 0.6952
Epoch [5/5], Loss: 0.6951
