In [1]:
import torch.nn as nn
import torch
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

import pandas as pd

from datetime import datetime
from tqdm import tqdm

import random

In [2]:
# Autograd anomaly detection is a debugging aid and is left switched off here.
# torch.autograd.set_detect_anomaly(True)

# Read both input tables; the paths are consumed in the order the targets are listed.
user_features, ratings_groupped_ids = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../datasets/user_features_clean.parquet',
        '../datasets/ratings_groupped_ids.parquet',
    )
)

In [3]:
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line after each report.
user_features.info()
ratings_groupped_ids.info()

# Count users whose positive / negative item list is empty. The batch collation step
# samples one item from each list with random.choice, which cannot handle an empty list.
empty_pos_ratings = ratings_groupped_ids['pos'].apply(lambda x: len(x) == 0).sum()
empty_neg_ratings = ratings_groupped_ids['neg'].apply(lambda x: len(x) == 0).sum()

if empty_pos_ratings != 0 or empty_neg_ratings != 0:
    print(f'Empty ratings: pos: {empty_pos_ratings}, neg: {empty_neg_ratings}')
    # ValueError is still an Exception, so any existing `except Exception` keeps working.
    raise ValueError("Users without a single pos/neg rating exist in the ratings_groupped_ids dataset")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198832 entries, 0 to 198831
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   userId                   198832 non-null  int64  
 1   num_rating               198832 non-null  float64
 2   avg_rating               198832 non-null  float64
 3   weekend_watcher          198832 non-null  float64
 4   genre_Action             198832 non-null  float64
 5   genre_Adventure          198832 non-null  float64
 6   genre_Animation          198832 non-null  float64
 7   genre_Comedy             198832 non-null  float64
 8   genre_Crime              198832 non-null  float64
 9   genre_Documentary        198832 non-null  float64
 10  genre_Drama              198832 non-null  float64
 11  genre_Family             198832 non-null  float64
 12  genre_Fantasy            198832 non-null  float64
 13  genre_History            198832 non-null  float64
 14  genr

# Map movieId values onto a contiguous range of natural numbers so they can be used with nn.Embedding

In [4]:
# Gather every movieId that occurs in any of the three id-list columns.
unique_ids = {
    movie_id
    for id_column in (
        user_features['movies_seq'],
        ratings_groupped_ids['pos'],
        ratings_groupped_ids['neg'],
    )
    for movie_id in id_column.explode().tolist()
}

print('Unique movieIds:', len(unique_ids))
unique_ids = sorted(unique_ids)

# The position of an id in the sorted list becomes its dense index.
movieId_to_idx = dict(zip(unique_ids, range(len(unique_ids))))
lowest_idx = min(movieId_to_idx.values())
highest_idx = max(movieId_to_idx.values())
print('min idx:', lowest_idx)
print('max idx:', highest_idx)

n_items = len(unique_ids)

# The indices must cover exactly 0 .. n_items - 1.
assert lowest_idx == 0
assert highest_idx == n_items - 1

Unique movieIds: 82932
min idx: 0
max idx: 82931


In [5]:
# Translate raw movieIds into the dense indices accepted by nn.Embedding
def map_list(col):
    '''
    Return a new list with the movieId_to_idx index of every id in col, in the same order.

    col - iterable of raw movieIds; an id missing from movieId_to_idx raises KeyError
    '''
    return list(map(movieId_to_idx.__getitem__, col))

# Every (frame, column) pair that holds lists of raw movieIds.
id_list_columns = [
    (user_features, 'movies_seq'),
    (ratings_groupped_ids, 'pos'),
    (ratings_groupped_ids, 'neg'),
]

# NOTE: the columns are overwritten in place, so this cell must run exactly once after
# the data is loaded. A second run would look up already-converted indices as if they
# were raw movieIds.
for df, col in id_list_columns:
    df[col] = df[col].apply(map_list)


max_idx = max(movieId_to_idx.values())
# Range-check all three converted columns. movies_seq is included because it is the
# column that is fed to nn.Embedding in the model's forward pass.
for df, col in id_list_columns:
    assert all(0 <= id_ <= max_idx for l in df[col] for id_ in l), \
        f'{col}: index outside [0, {max_idx}]'

In [6]:
class UserDataset(Dataset):
    '''
    Thin Dataset wrapper that serves the rows of a user-features table by position.
    '''

    def __init__(self, user_features):
        '''
        user_features - table of users; it is measured with len() and indexed with .iloc
        '''
        self.data = user_features

    def __len__(self):
        '''Number of rows in the wrapped table.'''
        n_rows = len(self.data)
        return n_rows

    def __getitem__(self, idx):
        '''Return the row stored at positional index idx.'''
        row = self.data.iloc[idx]
        return row



In [7]:
EMB_DIM = 64

class UserTower(nn.Module):
    '''
    User encoder: pools the embeddings of a user's movie sequence with learned weights,
    joins the result with the plain user features and maps it to a unit-length vector.
    '''

    def __init__(self, input_dim, n_items, embedding_dim=EMB_DIM):
        '''
        input_dim - the number of columns in user features, without sequence columns
        n_items - number of rows in the item embedding table
        embedding_dim - size of the item embeddings and of the returned user vector
        '''
        super().__init__()

        self.item_emb = nn.Embedding(n_items, embedding_dim)

        # Maps each (rating, timestamp) pair to one scalar, later squashed into a pooling weight
        self.rating_proj = nn.Linear(2, 1)

        # Three hidden stages of Linear -> ReLU -> Dropout(0.2), then the output projection.
        # Built in the same order as before so the state_dict keys (mlp.0, mlp.3, ...) are unchanged.
        stages = []
        in_features = input_dim + embedding_dim
        for hidden_width in (512, 384, 256):
            stages += [nn.Linear(in_features, hidden_width), nn.ReLU(), nn.Dropout(0.2)]
            in_features = hidden_width
        stages.append(nn.Linear(in_features, embedding_dim))
        self.mlp = nn.Sequential(*stages)

    def forward(self, batch):
        '''
        batch - dict with the keys 'movies', 'ratings', 'timestamps' and 'user_features'
                (assumes the three sequence tensors are (batch, seq) and user_features is
                (batch, input_dim) - TODO confirm against the collate function)

        Returns the L2-normalised output of the MLP (normalised along dim 1).
        '''
        # Embed the movie indices of each user's sequence
        seq_emb = self.item_emb(batch['movies'])

        # One weight in (0, 1) per sequence position, derived from rating and timestamp
        rating_ts = torch.stack((batch['ratings'], batch['timestamps']), dim=-1)
        weights = self.rating_proj(rating_ts).sigmoid()

        # Weighted mean over dim 1; the clamp guards against a zero denominator.
        # NOTE(review): every position contributes - if sequences are padded upstream,
        # the padding entries are pooled as well; confirm.
        weighted_sum = (seq_emb * weights).sum(1)
        weight_total = weights.sum(1).clamp_min(1e-6)
        pooled = weighted_sum / weight_total

        mlp_input = torch.cat((batch['user_features'], pooled), dim=-1)
        return F.normalize(self.mlp(mlp_input), dim=1)


In [8]:
# Same value as the n_items computed when movieId_to_idx was built; re-assigned here
# but not read by the collate function defined in this cell.
n_items = len(unique_ids)

# Plain numeric user-feature columns, in the order they are handed to the model.
USER_FEATURE_COLUMNS = [
    'num_rating', 'avg_rating', 'weekend_watcher',
    'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Comedy', 'genre_Crime',
    'genre_Documentary', 'genre_Drama', 'genre_Family', 'genre_Fantasy', 'genre_History',
    'genre_Horror', 'genre_Music', 'genre_Mystery', 'genre_Romance', 'genre_Science Fiction',
    'genre_TV Movie', 'genre_Thriller', 'genre_War', 'genre_Western',
    'type_of_viewer_negative', 'type_of_viewer_neutral', 'type_of_viewer_positive',
]

# Cache for the userId -> (pos, neg) lookup, tagged with the frame it was built from.
_pos_neg_cache = {'source': None, 'lookup': None}


def _pos_neg_lookup():
    '''
    Return {userId: (pos_ids, neg_ids)} for ratings_groupped_ids.

    The dict is built on first use and rebuilt only when ratings_groupped_ids is rebound
    to a different object; in-place edits made after the first call are not picked up.
    '''
    if _pos_neg_cache['source'] is not ratings_groupped_ids:
        lookup = {}
        for user_id, pos_ids, neg_ids in zip(
            ratings_groupped_ids['userId'],
            ratings_groupped_ids['pos'],
            ratings_groupped_ids['neg'],
        ):
            # setdefault keeps the first row per user, matching the previous `.iloc[0]`.
            lookup.setdefault(user_id, (pos_ids, neg_ids))
        _pos_neg_cache['source'] = ratings_groupped_ids
        _pos_neg_cache['lookup'] = lookup
    return _pos_neg_cache['lookup']


def collate(batch):
    '''
    Turn a list of user rows into one training batch.

    batch - list of rows providing 'userId', 'movies_seq', 'ratings_seq', 'ts_seq' and
            every column named in USER_FEATURE_COLUMNS. The sequences of all rows must
            have the same length, because they are combined with torch.stack.

    Returns a dict with
        "input" - dict of stacked tensors: "user_features" (float32), "movies" (long),
                  "ratings" (float32), "timestamps" (float32)
        "pos"   - long tensor with one randomly drawn positively rated item per row
        "neg"   - long tensor with one randomly drawn negatively rated item per row

    Raises KeyError when a row's userId has no entry in ratings_groupped_ids.
    '''
    # One dict lookup per row instead of a boolean scan over the whole ratings frame.
    pos_neg_by_user = _pos_neg_lookup()

    feature_rows, movies, ratings, timestamps, pos, neg = [], [], [], [], [], []

    for row in batch:
        movies.append(torch.tensor(row['movies_seq'], dtype=torch.long))
        ratings.append(torch.tensor(row['ratings_seq'], dtype=torch.float32))
        timestamps.append(torch.tensor(row['ts_seq'], dtype=torch.float32))

        feature_values = row[USER_FEATURE_COLUMNS].astype('float32').to_numpy()
        feature_rows.append(torch.tensor(feature_values, dtype=torch.float32))

        user_id = row['userId']
        try:
            pos_ids, neg_ids = pos_neg_by_user[user_id]
        except KeyError:
            raise KeyError(f'userId {user_id} not found in ratings_groupped_ids') from None

        # Get a random movieId that was rated positively and one that was rated negatively.
        # Used during training to calculate BPR loss. Positive is drawn before negative so
        # the order of random draws is the same as before.
        pos.append(random.choice(pos_ids))
        neg.append(random.choice(neg_ids))

    return {
        "input": {
            "user_features": torch.stack(feature_rows),
            "movies": torch.stack(movies),
            "ratings": torch.stack(ratings),
            "timestamps": torch.stack(timestamps),
        },
        "pos": torch.tensor(pos, dtype=torch.long),
        "neg": torch.tensor(neg, dtype=torch.long)
    }

In [9]:
BATCH_SIZE = 4096
# Seed for the train/test split so the same users land in each part on every full re-run.
SPLIT_SEED = 42

from sklearn.model_selection import train_test_split

train, test = train_test_split(user_features, test_size=0.2, random_state=SPLIT_SEED)

trainDataset = UserDataset(train)
trainDataLoader = DataLoader(trainDataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate)

testDataset = UserDataset(test)
# Evaluation gains nothing from reshuffling, so the held-out users keep a stable batch order.
testDataLoader = DataLoader(testDataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate)

# Prefer CUDA, then Apple MPS, then CPU. torch.backends.mps is the long-standing
# availability check; torch.mps.is_available is missing on older PyTorch builds and
# raised AttributeError there whenever CUDA was absent. The getattr guard also covers
# builds that predate the MPS backend entirely.
device = torch.device('cpu')
mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif mps_backend is not None and mps_backend.is_available():
    device = torch.device('mps')
print('Device:', device)

Device: cuda


In [10]:
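# Commented out because it takes ~3 minutes to run and currently passes.
# (No `dataloader` variable is defined in the cells shown; point the loop at an existing DataLoader before re-enabling.)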

# # Verify the data 
# def check_ids(tensor, column):
#     if (tensor < 0).any() or (tensor >= n_items).any():
#         raise ValueError(f"Out of range index in column {column}. Value: {tensor[(tensor<0) | (tensor >= n_items)]}")


# for row in tqdm(dataloader):
#     check_ids(row['input']['movies'], 'input.movies')
#     check_ids(row['pos'], 'pos')
#     check_ids(row['neg'], 'neg')

In [11]:
import torch.optim as optim

# input_dim=25 matches the number of plain user-feature columns selected in collate.
model = UserTower(input_dim=25, n_items=len(unique_ids))
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
# The training step shown in this notebook computes a BPR loss inline and never calls loss_fn.
loss_fn = F.mse_loss

In [12]:
def to_device(data, device):
    '''
    Move tensors to device, descending into (nested) dicts.

    data - a tensor, a dict whose values are handled recursively, or anything else
    device - target passed to Tensor.to

    Tensors are returned via .to(device), dicts as new dicts with the same keys, and
    every other object (lists included) is handed back untouched.
    '''
    if torch.is_tensor(data):
        return data.to(device)
    if not isinstance(data, dict):
        return data
    return {key: to_device(value, device) for key, value in data.items()}

# Number of batches the training DataLoader yields per epoch.
number_of_batches = len(trainDataLoader)

def train_one_epoch():
    '''
    Run one optimisation pass over trainDataLoader and return the mean batch loss.

    Works on the notebook-level model, optimizer, device and to_device. For every batch
    the user vector is scored against one positive and one negative item embedding and
    the BPR loss -mean(logsigmoid(pos_score - neg_score)) is minimised.

    Raises ZeroDivisionError when trainDataLoader yields no batches.
    '''
    batch_losses = []

    for batch in trainDataLoader:
        batch = to_device(batch, device)
        optimizer.zero_grad()

        user_vec = model(batch['input'])

        # Dot product of the user vector with the sampled positive / negative item embedding
        score_pos = (user_vec * model.item_emb(batch['pos'])).sum(dim=-1)
        score_neg = (user_vec * model.item_emb(batch['neg'])).sum(dim=-1)

        # BPR Loss
        bpr_loss = -F.logsigmoid(score_pos - score_neg).mean()

        bpr_loss.backward()
        optimizer.step()

        batch_losses.append(bpr_loss.item())

    return sum(batch_losses) / len(batch_losses)


In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np

# Start-of-run tag; reused for the checkpoint file name when the model is saved.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
EPOCHS = 50

for epoch in tqdm(range(EPOCHS)):
    model.train(True)
    avg_loss = train_one_epoch()

    print(f"Epoch {epoch + 1} loss: {avg_loss}")

    # Evaluate on the held-out users after every 5th epoch only
    if (epoch + 1) % 5 != 0:
        continue

    aucs, pair_acc = [], []

    model.eval()
    with torch.no_grad():
        for batch in testDataLoader:
            batch = to_device(batch, device)

            u = model(batch['input'])
            pos_score = (u * model.item_emb(batch['pos'])).sum(dim=-1)
            neg_score = (u * model.item_emb(batch['neg'])).sum(dim=-1)

            # ROC AUC over this batch: positive-item scores are labelled 1, negative-item scores 0
            scores = torch.cat([pos_score, neg_score])
            labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
            aucs.append(roc_auc_score(labels.cpu(), scores.cpu()))

            # Pair-wise accuracy: share of users whose positive item outscores the negative one
            pair_acc.append((pos_score > neg_score).float().mean().item())

    # Both metrics are unweighted means over the evaluation batches.
    print(f'Epoch {epoch + 1}. ROC AUC: {float(np.mean(aucs))}, Pair-wise accuaracy: {float(np.mean(pair_acc))}')


            

  2%|▏         | 1/50 [02:26<1:59:46, 146.66s/it]

Epoch 1 loss: 0.8135017477549039


  4%|▍         | 2/50 [04:57<1:59:09, 148.95s/it]

Epoch 2 loss: 0.7545023361841837


  6%|▌         | 3/50 [07:23<1:55:40, 147.67s/it]

Epoch 3 loss: 0.7157965867947309


  8%|▊         | 4/50 [10:18<2:01:24, 158.35s/it]

Epoch 4 loss: 0.6825271600332016
Epoch 5 loss: 0.6592217576809418


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 10%|█         | 5/50 [13:23<2:05:59, 167.99s/it]

Epoch 5. ROC AUC: 0.6409194137389884, Pair-wise accuaracy: nan


 12%|█▏        | 6/50 [15:45<1:56:54, 159.42s/it]

Epoch 6 loss: 0.6396503142821484


 14%|█▍        | 7/50 [18:06<1:49:49, 153.25s/it]

Epoch 7 loss: 0.6220852426993542


 16%|█▌        | 8/50 [20:27<1:44:29, 149.27s/it]

Epoch 8 loss: 0.608815346008692


 18%|█▊        | 9/50 [22:47<1:40:07, 146.52s/it]

Epoch 9 loss: 0.5995213099015064
Epoch 10 loss: 0.5920438292698983


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 20%|██        | 10/50 [25:43<1:43:37, 155.45s/it]

Epoch 10. ROC AUC: 0.6859722882537123, Pair-wise accuaracy: nan


 22%|██▏       | 11/50 [28:03<1:38:01, 150.82s/it]

Epoch 11 loss: 0.5840953695468414


 24%|██▍       | 12/50 [30:25<1:33:48, 148.13s/it]

Epoch 12 loss: 0.5753088394800822


 26%|██▌       | 13/50 [32:46<1:29:56, 145.84s/it]

Epoch 13 loss: 0.5698804442699139


 28%|██▊       | 14/50 [35:06<1:26:35, 144.32s/it]

Epoch 14 loss: 0.5632405449182559
Epoch 15 loss: 0.5602005643722339


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 30%|███       | 15/50 [38:02<1:29:40, 153.73s/it]

Epoch 15. ROC AUC: 0.7058297200599466, Pair-wise accuaracy: nan


 32%|███▏      | 16/50 [40:23<1:24:57, 149.94s/it]

Epoch 16 loss: 0.5541742260639484


 34%|███▍      | 17/50 [42:44<1:20:55, 147.13s/it]

Epoch 17 loss: 0.5507888335448045


 36%|███▌      | 18/50 [45:04<1:17:24, 145.14s/it]

Epoch 18 loss: 0.5447258353233337


 38%|███▊      | 19/50 [47:25<1:14:16, 143.76s/it]

Epoch 19 loss: 0.5398710140815148
Epoch 20 loss: 0.5382317564426324


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 40%|████      | 20/50 [50:20<1:16:36, 153.22s/it]

Epoch 20. ROC AUC: 0.7203887283760824, Pair-wise accuaracy: nan


 42%|████▏     | 21/50 [52:41<1:12:14, 149.48s/it]

Epoch 21 loss: 0.5366536669242077


 44%|████▍     | 22/50 [55:01<1:08:31, 146.83s/it]

Epoch 22 loss: 0.5322171663626646


 46%|████▌     | 23/50 [57:22<1:05:13, 144.96s/it]

Epoch 23 loss: 0.5299181510240604


 48%|████▊     | 24/50 [59:44<1:02:26, 144.09s/it]

Epoch 24 loss: 0.5290594712281839
Epoch 25 loss: 0.5253141690523196


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 50%|█████     | 25/50 [1:02:40<1:04:00, 153.62s/it]

Epoch 25. ROC AUC: 0.7313587820654447, Pair-wise accuaracy: nan


 52%|█████▏    | 26/50 [1:05:02<1:00:02, 150.12s/it]

Epoch 26 loss: 0.5235291398488559


 54%|█████▍    | 27/50 [1:07:21<56:19, 146.93s/it]  

Epoch 27 loss: 0.5214930237867893


 56%|█████▌    | 28/50 [1:09:41<53:04, 144.75s/it]

Epoch 28 loss: 0.5189229662601764


 58%|█████▊    | 29/50 [1:12:02<50:16, 143.62s/it]

Epoch 29 loss: 0.5155243552648104
Epoch 30 loss: 0.5148373995071802


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 60%|██████    | 30/50 [1:14:55<50:49, 152.45s/it]

Epoch 30. ROC AUC: 0.7358693720575293, Pair-wise accuaracy: nan


 62%|██████▏   | 31/50 [1:17:14<47:02, 148.53s/it]

Epoch 31 loss: 0.5139913505468613


 64%|██████▍   | 32/50 [1:19:34<43:44, 145.79s/it]

Epoch 32 loss: 0.5112590422997108


 66%|██████▌   | 33/50 [1:21:53<40:47, 143.97s/it]

Epoch 33 loss: 0.5103360766019577


 68%|██████▊   | 34/50 [1:24:13<38:01, 142.57s/it]

Epoch 34 loss: 0.5079086705660208
Epoch 35 loss: 0.5073491021608695


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 70%|███████   | 35/50 [1:27:06<37:57, 151.82s/it]

Epoch 35. ROC AUC: 0.7421698305324156, Pair-wise accuaracy: nan


 72%|███████▏  | 36/50 [1:29:26<34:33, 148.10s/it]

Epoch 36 loss: 0.5035211772490771


 74%|███████▍  | 37/50 [1:31:45<31:31, 145.47s/it]

Epoch 37 loss: 0.5033631393542657


 76%|███████▌  | 38/50 [1:34:04<28:43, 143.62s/it]

Epoch 38 loss: 0.5032331217557956


 78%|███████▊  | 39/50 [1:36:24<26:06, 142.41s/it]

Epoch 39 loss: 0.49973254402478534
Epoch 40 loss: 0.4974764027656653


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 80%|████████  | 40/50 [1:39:18<25:19, 151.97s/it]

Epoch 40. ROC AUC: 0.7460589047129323, Pair-wise accuaracy: nan


 82%|████████▏ | 41/50 [1:41:38<22:15, 148.38s/it]

Epoch 41 loss: 0.49791375261086684


 84%|████████▍ | 42/50 [1:43:58<19:25, 145.72s/it]

Epoch 42 loss: 0.49649859238893557


 86%|████████▌ | 43/50 [1:46:17<16:47, 143.88s/it]

Epoch 43 loss: 0.4954537191452124


 88%|████████▊ | 44/50 [1:48:37<14:15, 142.54s/it]

Epoch 44 loss: 0.49495463646375215
Epoch 45 loss: 0.49415789200709415


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 90%|█████████ | 45/50 [1:51:31<12:40, 152.08s/it]

Epoch 45. ROC AUC: 0.7499333717754619, Pair-wise accuaracy: nan


 92%|█████████▏| 46/50 [1:53:51<09:53, 148.32s/it]

Epoch 46 loss: 0.49130896345163005


 94%|█████████▍| 47/50 [1:56:10<07:16, 145.66s/it]

Epoch 47 loss: 0.48929214706787694


 96%|█████████▌| 48/50 [1:58:30<04:47, 143.89s/it]

Epoch 48 loss: 0.4922514382081154


 98%|█████████▊| 49/50 [2:00:49<02:22, 142.60s/it]

Epoch 49 loss: 0.4900190432866414
Epoch 50 loss: 0.4890689559471913


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 50/50 [2:03:43<00:00, 148.48s/it]

Epoch 50. ROC AUC: 0.7524894107265517, Pair-wise accuaracy: nan





In [14]:
# Persist only the weights (state_dict); the file name carries the timestamp string of this run.
checkpoint_path = f'user_tower_{timestamp}.model'
torch.save(model.state_dict(), checkpoint_path)