In [101]:
import torch.nn as nn
import torch
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np

In [102]:
user_features = pd.read_parquet('../datasets/user_features_clean.parquet')
ratings_groupped_ids = pd.read_parquet('../datasets/ratings_groupped_ids.parquet')

ratings_groupped_ids = ratings_groupped_ids.reset_index(drop=True)

In [103]:
print(user_features.info())
print(ratings_groupped_ids.info())

print(ratings_groupped_ids.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200846 entries, 0 to 200845
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   userId                   200846 non-null  int64  
 1   num_rating               200846 non-null  float64
 2   avg_rating               200846 non-null  float64
 3   weekend_watcher          200846 non-null  int64  
 4   genre_Action             200846 non-null  float64
 5   genre_Adventure          200846 non-null  float64
 6   genre_Animation          200846 non-null  float64
 7   genre_Comedy             200846 non-null  float64
 8   genre_Crime              200846 non-null  float64
 9   genre_Documentary        200846 non-null  float64
 10  genre_Drama              200846 non-null  float64
 11  genre_Family             200846 non-null  float64
 12  genre_Fantasy            200846 non-null  float64
 13  genre_History            200846 non-null  float64
 14  genr

In [104]:
RATING_THRESHOLD = 4.0

class UserDataset(Dataset):
    def __init__(self, user_features, grouppedIds):
        triplets = []
        for _, row in user_features.iterrows():
            userId = row['userId']
            if userId not in grouppedIds['userId'].values:
                print(f'User {userId} not found')
                return
            grp = grouppedIds[grouppedIds['userId'] == userId].iloc[0]
            triplets.append({
                "user_features": row,
                "pos": grp['pos'],
                "neg": grp['neg']
            })

        self.data = triplets
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]



In [105]:
EMB_DIM = 64
PAD_ID = 0

class UserTower(nn.Module):
    def __init__(self, input_dim, n_items, embedding_dim=EMB_DIM):
        '''
        input_dim - the number of columns in user features, without sequence columns
        '''
        super().__init__()

        self.item_emb = nn.Embedding(n_items + 1, embedding_dim, padding_idx=PAD_ID)

        # A layer to project rating and timestamp into a scalar weight
        self.rating_proj = nn.Linear(2, 1)

        self.mlp = nn.Sequential(
            nn.Linear(input_dim + embedding_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 384),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(384, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, embedding_dim)
        )
    
    def forward(self, batch):
        # Embed movieIds liked by user
        m = self.item_emb(batch['movies'])
        print(m)

        # Get weights 
        x = torch.stack([batch['ratings'], batch['timestamps']], dim=-1)
        w = torch.sigmoid(self.rating_proj(x))

        mask = (batch['movies'] != PAD_ID).unsqueeze(-1) # ????????????
        w = w * mask

        # weighted mean-pool
        pooled = (m * w).sum(1) / (w.sum(1).clamp_min(1e-6))

        u = F.normalize(self.mlp(pooled), dim = 1)
        return u


In [106]:
def collate(batch):
    user_features, movies, ratings, timestamps, pos, neg = [], [], [], [], [], []

    for row in batch:
        movies.append(torch.tensor(row['user_features']['movies_seq'], dtype=torch.long))
        ratings.append(torch.tensor(row['user_features']['ratings_seq'], dtype=torch.float32))
        timestamps.append(torch.tensor(row['user_features']['ts_seq'], dtype=torch.float32))
        r = row['user_features'].drop(labels=['movies_seq', 'ratings_seq', 'ts_seq', 'userId'])
        print(r)
        print('typ[e]', type(r))
        user_features.append(torch.tensor(r, dtype=torch.float32))

        pos.append(torch.tensor(row['pos'], dtype=torch.long))
        neg.append(torch.tensor(row['neg'], dtype=torch.long))

    return {
        "input": {
            "user_features": torch.stack(user_features),
            "movies": torch.stack(movies),
            "ratings": torch.stack(ratings),
            "timestamps": torch.stack(timestamps),
        },
        "pos": torch.stack(pos),
        "neg": torch.stack(neg)
    }

In [107]:
BATCH_SIZE = 256

dataset = UserDataset(user_features, ratings_groupped_ids)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True, collate_fn=collate)

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.mps.is_available():
    device='mps'
print('Device:', device)

Device: mps


In [108]:
import torch.optim as optim

model = UserTower(input_dim=25, n_items=86477).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = F.mse_loss

In [None]:
def train_one_epoch():
    running_loss = 0.0
    last_loss = 0.0

    for i, data in enumerate(dataloader):
        optimizer.zero_grad()

        u = model(data['input'])
        pos_vec = model.item_emb(data['pos'])
        neg_vec = model.item_emb(data['neg'])

        pos_score = (u * pos_vec).sum(dim=-1)
        neg_score = (u * neg_vec).sum(dim=-1)
        # BPR Loss
        loss = -F.logsigmoid(pos_score - neg_score).mean()

        loss.backward()
        optimizer.step()

        print(f"Loss {loss.item():.4f}")

        running_loss += loss.item()
        if i % BATCH_SIZE == BATCH_SIZE - 1:
            last_loss = running_loss / BATCH_SIZE
            running_loss = 0.0
    return last_loss

In [110]:
from datetime import datetime
from tqdm import tqdm

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
EPOCHS = 5

best_vloss = 1_000_000

for epoch in tqdm(range(EPOCHS)):
    model.train(True)
    avg_loss = train_one_epoch()

    print(f"Epoch {epoch} loss: {avg_loss}")


  0%|          | 0/5 [00:00<?, ?it/s]

num_rating                -0.298912
avg_rating                -0.022342
weekend_watcher                   0
genre_Action               0.257303
genre_Adventure            0.149427
genre_Animation           -1.139792
genre_Comedy               0.170863
genre_Crime                0.042788
genre_Documentary          -0.06783
genre_Drama                -0.36499
genre_Family              -0.971371
genre_Fantasy             -0.203923
genre_History              0.213309
genre_Horror               -0.53511
genre_Music               -0.002231
genre_Mystery             -1.816266
genre_Romance             -1.039161
genre_Science Fiction       0.53288
genre_TV Movie            -0.012766
genre_Thriller            -0.358906
genre_War                 -0.161064
genre_Western              0.415569
type_of_viewer_negative         0.0
type_of_viewer_neutral          1.0
type_of_viewer_positive         0.0
Name: 92198, dtype: object
typ[e] <class 'pandas.core.series.Series'>
num_rating                -0.4

  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(torch.tensor(r, dtype=torch.float32))
  user_features.append(t

RuntimeError: stack expects each tensor to be equal size, but got [44] at entry 0 and [13] at entry 1