# 双塔模型
1) Pointwise 训练
2) Pairwise 训练
3) Listwise 训练

In [2]:
%cd /playground/sgd_deep_learning/sgd_rec_sys/
import sys 
sys.path.append('./python')

/playground/sgd_deep_learning/sgd_rec_sys


In [3]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.optim as optim

import pickle
import numpy as np

from sgd_rec_sys.data import FakeDssmDataFactory, DssmDataset
from sgd_rec_sys.retrieval import DSSM, DefaultItemTower, DefaultUserTower, \
TripletHingeLoss, TripletLogisticLoss, CrossEntropyLoss


## 1 Pointwise训练
把召回看做⼆元分类任务。
* 对于正样本，⿎励cos 𝐚, 𝐛 接近+1。
* 对于负样本，⿎励cos 𝐚, 𝐛 接近−1。
* 控制正负样本数量为1: 2或者1: 3。

In [4]:
# 现有框架下，fake数据需要重新构造， 比较简单，暂时pass
# 使用交叉熵预测正负样本(1，0)。 

## 数据准备
## 参数设置

## 2 Pairwise训练

```
    针对pairwise自定义了TripletHingeLoss
        input: 
            cos(a,b+), cos(a,b-)
            两个cos值已在dssm里计算好了

        超参数：
            triplet_hinge_loss_m，需要进行网格搜索
            
            默认设置为1，由于cos的取值在[-1,1], 
            极限条件下正样本为1，负样本为-1，间隔为2，这里取中间间隔1

    from chatgpt:
        在 Triplet Hinge Loss 中，参数 m 是一个超参数，用于控制正例和负例之间的间隔。
        通常情况下，m 的选择会影响模型的性能和训练稳定性。
        
        选择合适的 m 值通常需要根据具体的任务和数据集进行调整和优化。
        一般来说，较小的 m 值会使得模型更加关注于难以区分的样本，从而更好地推动模型向着更好的方向训练。
        但是，如果选择过小的 m 值，可能会导致模型过拟合或训练不稳定。

        相反，较大的 m 值会使得模型更加关注于易于区分的样本，
        从而可能导致模型过度简化或者忽视难以区分的样本。
        但是，选择过大的 m 值可能会导致模型难以收敛或者陷入局部最优。

        因此，选择合适的 m 值需要在模型的训练过程中进行实验和调整。
        您可以尝试使用交叉验证或者网格搜索等方法来选择最佳的 m 值，以优化模型的性能。
        通常情况下，m 的取值范围可以从一个小的正数开始尝试，然后根据实际效果逐步调整。
```

In [5]:
## 数据准备 ##

# input
n_samples = 1000 # 总样本数
user_fea_dim = 80
item_fea_dim = 90
item_fea_num = 2 # （正样本1: 负样本1）

## 生成伪CTR数据
dump_file = './data/fake/tmp_dssm.pkl'
fake_data_factory = FakeDssmDataFactory(n_samples,
                                        user_fea_dim,
                                        item_fea_dim,
                                        item_fea_num,
                                        dtype=np.float32)
fake_data_factory.presist(dump_file)

user feature embedding success, shape: (1000, 80)
item feature embedding success, shape: (1000, 2, 90)


In [6]:
## 参数设置 ##
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device:', device)

# 训练参数
train_batch_size = 64
epochs = 5
triplet_hinge_loss_m = 1 # 超参数需要网格搜索

with open(dump_file,'rb') as f:
    fake_data = pickle.load(f)
train_ds = DssmDataset(data_info=fake_data, device=device)
train_dl = DataLoader(train_ds, batch_size=train_batch_size, shuffle=True)

# 网络结构
out_dim = 128 # 在128维度上做内积或者cosin相似度
user_hidden_dims = [int(user_fea_dim*2), 1024, 256, out_dim]
item_hidden_dims = [int(item_fea_dim*2), 1024, 256, out_dim]


# 定义模型
user_tower = DefaultUserTower(in_dim=user_fea_dim,
                              hidden_dims = user_hidden_dims,
                              activation_fun=nn.ReLU())
                             
item_tower = DefaultItemTower(in_dim=item_fea_dim, 
                              hidden_dims= item_hidden_dims, 
                              activation_fun=nn.ReLU())

model = DSSM(item_tower=item_tower,
             user_tower= user_tower,).to(device)

print(model)

criterion = TripletHingeLoss(m=triplet_hinge_loss_m)  # Pairwise loss
# criterion = TripletLogisticLoss(sigma=1)  # Pairwise loss
optimizer = optim.SGD(model.parameters(), lr=0.01)  # SGD 优化器
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-1, weight_decay=0.3)

device: cuda
DSSM(
  (item_tower): DefaultItemTower(
    (nns): Sequential(
      (0): Linear(in_features=90, out_features=180, bias=True)
      (1): ReLU()
      (2): Linear(in_features=180, out_features=1024, bias=True)
      (3): ReLU()
      (4): Linear(in_features=1024, out_features=256, bias=True)
      (5): ReLU()
      (6): Linear(in_features=256, out_features=128, bias=True)
      (7): ReLU()
    )
  )
  (user_tower): DefaultUserTower(
    (nns): Sequential(
      (0): Linear(in_features=80, out_features=160, bias=True)
      (1): ReLU()
      (2): Linear(in_features=160, out_features=1024, bias=True)
      (3): ReLU()
      (4): Linear(in_features=1024, out_features=256, bias=True)
      (5): ReLU()
      (6): Linear(in_features=256, out_features=128, bias=True)
      (7): ReLU()
    )
  )
)


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
## 训练 ##
def train(dataloader, model, epochs=1,):
    for epoch in range(epochs):
        for x in dataloader:
            # 前向传播
            outputs = model(x)
            loss = criterion(outputs[:,0], outputs[:,1])
        
        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # log
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')
            
train(train_dl, model, epochs)

Epoch [1/5], Loss: 1.0012
Epoch [2/5], Loss: 0.9996
Epoch [3/5], Loss: 0.9999
Epoch [4/5], Loss: 1.0002
Epoch [5/5], Loss: 1.0001


## 3 Listwise训练

In [8]:
## 数据准备 ##

# input
n_samples = 1000 # 总样本数
user_fea_dim = 80
item_fea_dim = 90
item_fea_num = 10 # （正样本1: 负样本9）

## 生成伪CTR数据
dump_file = './data/fake/tmp_dssm.pkl'
fake_data_factory = FakeDssmDataFactory(n_samples,
                                        user_fea_dim,
                                        item_fea_dim,
                                        item_fea_num,
                                        dtype=np.float32)
fake_data_factory.presist(dump_file)

user feature embedding success, shape: (1000, 80)
item feature embedding success, shape: (1000, 10, 90)


In [9]:
## 参数设置 ##
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device:', device)

# 训练参数
train_batch_size = 64
epochs = 5

with open(dump_file,'rb') as f:
    fake_data = pickle.load(f)
train_ds = DssmDataset(data_info=fake_data, device=device)
train_dl = DataLoader(train_ds, batch_size=train_batch_size, shuffle=True)

# 网络结构
out_dim = 128 # 在128维度上做内积或者cosin相似度
user_hidden_dims = [int(user_fea_dim*2), 1024, 256, out_dim]
item_hidden_dims = [int(item_fea_dim*2), 1024, 256, out_dim]


# 定义模型
user_tower = DefaultUserTower(in_dim=user_fea_dim,
                              hidden_dims = user_hidden_dims,
                              activation_fun=nn.ReLU())
                             
item_tower = DefaultItemTower(in_dim=item_fea_dim, 
                              hidden_dims= item_hidden_dims, 
                              activation_fun=nn.ReLU())

model = DSSM(item_tower=item_tower,
             user_tower= user_tower,).to(device)

print(model)

criterion = CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)  # SGD 优化器
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-1, weight_decay=0.3)

device: cuda
DSSM(
  (item_tower): DefaultItemTower(
    (nns): Sequential(
      (0): Linear(in_features=90, out_features=180, bias=True)
      (1): ReLU()
      (2): Linear(in_features=180, out_features=1024, bias=True)
      (3): ReLU()
      (4): Linear(in_features=1024, out_features=256, bias=True)
      (5): ReLU()
      (6): Linear(in_features=256, out_features=128, bias=True)
      (7): ReLU()
    )
  )
  (user_tower): DefaultUserTower(
    (nns): Sequential(
      (0): Linear(in_features=80, out_features=160, bias=True)
      (1): ReLU()
      (2): Linear(in_features=160, out_features=1024, bias=True)
      (3): ReLU()
      (4): Linear(in_features=1024, out_features=256, bias=True)
      (5): ReLU()
      (6): Linear(in_features=256, out_features=128, bias=True)
      (7): ReLU()
    )
  )
)


In [10]:
## 训练 ##
def train(dataloader, model, epochs=1,):
    for epoch in range(epochs):
        for x in dataloader:
            # 前向传播
            outputs = model(x)
            loss = criterion(outputs)
        
        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # log
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')
            
train(train_dl, model, epochs)

Epoch [1/5], Loss: 2.3040
Epoch [2/5], Loss: 2.3029
Epoch [3/5], Loss: 2.3036
Epoch [4/5], Loss: 2.3039
Epoch [5/5], Loss: 2.3008
