In [60]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import TensorDataset

from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np

from datetime import datetime
from tqdm import tqdm

import random
from pathlib import Path

from sklearn.model_selection import train_test_split
from itertools import chain

from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm, trange

In [61]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1' # Tylko do debugging

print("CUDA available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())
print("Current device:", torch.cuda.current_device(), torch.cuda.get_device_name(0))

CUDA available: True
Number of GPUs: 1
Current device: 0 NVIDIA GeForce RTX 3060 Ti


In [62]:
BASE_DIR = Path(os.getcwd()).parent
DATA_DIR = BASE_DIR / "data"
df_users = pd.read_parquet(DATA_DIR / 'user_features_clean.parquet')
df_movies = pd.read_parquet(DATA_DIR / 'Movies_clean_Vec_v4_25keywords.parquet')
df_ratings = pd.read_parquet(DATA_DIR / 'ratings_groupped_ids.parquet')

# Przygotowanie movieId dla datasetów

In [63]:
print(df_users.info())
print(df_ratings.info())
print(df_movies.info())

empty_pos_ratings = df_ratings['pos'].apply(lambda x: len(x) == 0).sum()
empty_neg_ratings = df_ratings['neg'].apply(lambda x: len(x) == 0).sum()

if empty_pos_ratings != 0 or empty_neg_ratings != 0:
    print(f'Empty ratings: pos: {empty_pos_ratings}, neg: {empty_neg_ratings}')
    raise Exception("Users without a single pos/neg rating exist in the ratings_groupped_ids dataset")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198832 entries, 0 to 198831
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   userId                   198832 non-null  int64  
 1   num_rating               198832 non-null  float64
 2   avg_rating               198832 non-null  float64
 3   weekend_watcher          198832 non-null  float64
 4   genre_Action             198832 non-null  float64
 5   genre_Adventure          198832 non-null  float64
 6   genre_Animation          198832 non-null  float64
 7   genre_Comedy             198832 non-null  float64
 8   genre_Crime              198832 non-null  float64
 9   genre_Documentary        198832 non-null  float64
 10  genre_Drama              198832 non-null  float64
 11  genre_Family             198832 non-null  float64
 12  genre_Fantasy            198832 non-null  float64
 13  genre_History            198832 non-null  float64
 14  genr

In [64]:
unique_ids = set(
        df_users['movies_seq'].explode().tolist()
        + df_ratings['pos'].explode().tolist() 
        + df_ratings['neg'].explode().tolist()
    )

print('Unique movieIds:', len(unique_ids))
unique_ids = sorted(unique_ids)

movieId_to_idx = {id_: idx for idx, id_ in enumerate(unique_ids)}
print('min idx:', min(movieId_to_idx.values()))
print('max idx:', max(movieId_to_idx.values()))

n_items = len(unique_ids)

assert min(movieId_to_idx.values()) == 0
assert max(movieId_to_idx.values()) == n_items - 1

Unique movieIds: 82932
min idx: 0
max idx: 82931


In [65]:
# Zmapuj movieId do indeksów
df_users['movies_seq'] = df_users['movies_seq'].apply(lambda lst: [movieId_to_idx[m] for m in lst])
df_ratings['pos'] = df_ratings['pos'].apply(lambda lst: [movieId_to_idx[m] for m in lst])
df_ratings['neg'] = df_ratings['neg'].apply(lambda lst: [movieId_to_idx[m] for m in lst])
df_ratings = df_ratings.set_index('userId')

# df_movies musi być ograniczone tylko do używanych filmów
df_movies = df_movies[df_movies['movieId'].isin(movieId_to_idx)].copy()
df_movies['movieId'] = df_movies['movieId'].map(movieId_to_idx)
df_movies = df_movies.set_index('movieId')

# Final sanity check
assert df_users['movies_seq'].explode().max() < n_items
assert df_ratings['pos'].explode().max() < n_items
assert df_ratings['neg'].explode().max() < n_items

assert df_movies.index.max() < n_items
assert df_movies.index.notna().all()

# assert df_movies['movieId'].max() < n_items
# assert df_movies['movieId'].notna().all(), "Some movieIds weren't mapped!"

In [66]:
max_movie_idx = df_users['movies_seq'].explode().max()
print("max_movie_idx =", max_movie_idx)
print("n_items =", n_items)

assert max_movie_idx < n_items, "Indeks filmu przekracza rozmiar embeddingu"

max_movie_idx = 82931
n_items = 82932


In [67]:
def has_invalid_entries(seq_col):
    return seq_col.explode().isin([-1, np.nan, None]).any()

print("Zawiera niepoprawne wartości:", has_invalid_entries(df_users['movies_seq']))

Zawiera niepoprawne wartości: False


In [68]:
df_users.info()
df_users.head(100)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198832 entries, 0 to 198831
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   userId                   198832 non-null  int64  
 1   num_rating               198832 non-null  float64
 2   avg_rating               198832 non-null  float64
 3   weekend_watcher          198832 non-null  float64
 4   genre_Action             198832 non-null  float64
 5   genre_Adventure          198832 non-null  float64
 6   genre_Animation          198832 non-null  float64
 7   genre_Comedy             198832 non-null  float64
 8   genre_Crime              198832 non-null  float64
 9   genre_Documentary        198832 non-null  float64
 10  genre_Drama              198832 non-null  float64
 11  genre_Family             198832 non-null  float64
 12  genre_Fantasy            198832 non-null  float64
 13  genre_History            198832 non-null  float64
 14  genr

Unnamed: 0,userId,num_rating,avg_rating,weekend_watcher,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,genre_Documentary,...,genre_TV Movie,genre_Thriller,genre_War,genre_Western,type_of_viewer_negative,type_of_viewer_neutral,type_of_viewer_positive,movies_seq,ratings_seq,ts_seq
0,1,-0.068675,-0.347979,0.0,0.926736,-0.375240,-0.179705,-0.402570,0.892727,-1.076257,...,-0.284700,-0.022680,-0.426892,-0.911843,0.0,1.0,0.0,"[24, 1013, 1314, 1360, 1619, 303, 1027, 1190, ...","[-2.3984964034019467, 1.3836304001080941, -2.3...","[-1.2878777024141752, -1.2878663519376752, -1...."
1,2,-0.383417,1.210645,0.0,0.713096,0.940526,1.581734,0.973277,0.410751,0.788073,...,1.018523,1.084306,-1.207360,0.791019,0.0,0.0,1.0,"[30, 191, 273, 545, 234, 577, 503, 216, 376, 2...","[1.3836304001080941, -0.5074330016469263, 0.43...","[-1.709033992628413, -1.709033992628413, -1.70..."
2,3,-0.047456,-0.228499,0.0,0.045472,0.066743,0.383741,-0.669363,-0.692235,-0.211924,...,-0.184799,-0.650122,0.197482,0.094394,0.0,1.0,0.0,"[5218, 4768, 5679, 6196, 3893, 6222, 6391, 516...","[-0.5074330016469263, -0.5074330016469263, -0....","[-0.740135863595513, -0.7401358518778841, -0.7..."
3,4,-0.471828,-2.255334,0.0,-1.763184,-0.917026,-1.363040,-0.993442,-2.681933,-1.620281,...,-1.879511,-2.686105,-1.909780,-1.408750,1.0,0.0,0.0,"[2573, 2589, 2600, 2660, 220, 2612, 2770, 3091...","[-0.5074330016469263, -1.4529647025244365, -1....","[-1.2244648204630921, -1.2244648204630921, -1...."
4,5,-0.450609,-0.895880,0.0,0.178997,-0.375240,-0.002980,-0.993442,-1.908762,-0.675657,...,-0.742820,-0.372255,0.197482,-0.911843,0.0,1.0,0.0,"[228, 312, 159, 288, 314, 324, 429, 9, 183, 25...","[-1.4529647025244365, -0.5074330016469263, 0.4...","[-1.6920817200700495, -1.6920817200700495, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,97,-0.401099,0.439117,1.0,0.112234,0.564918,0.871345,0.478688,0.410751,0.251973,...,0.373421,0.162700,-0.153728,1.771455,0.0,1.0,0.0,"[228, 16, 5087, 3633, 3631, 2063, 1207, 3876, ...","[-0.5074330016469263, 0.43809869923058387, 0.4...","[-0.4013318737005198, -0.40133182683000435, -0..."
96,98,-0.464755,0.131258,0.0,1.514246,0.708332,0.434183,-0.518954,0.410751,0.038055,...,0.116008,0.559944,0.197482,0.105960,0.0,1.0,0.0,"[585, 5815, 5679, 5673, 2747, 2573, 2589, 2596...","[1.3836304001080941, 1.3836304001080941, 0.438...","[-0.9020311080297979, -0.9020306354187667, -0...."
97,99,-0.478901,1.106575,0.0,0.178997,0.301993,1.745670,1.790221,0.410751,0.715759,...,0.931507,0.809640,0.899903,0.724969,0.0,0.0,1.0,"[292, 2463, 2747, 731, 14840, 1166, 1057, 1194...","[-0.034667151208171196, 0.43809869923058387, -...","[0.6478568622869465, 0.6478568740045754, 0.647..."
98,100,0.313259,-0.381486,0.0,-0.544263,-0.397290,-0.574654,-0.042357,-0.095325,-0.318228,...,-0.312716,-0.055048,-0.613003,1.771455,0.0,1.0,0.0,"[3631, 1058, 1336, 1248, 1951, 1326, 15131, 20...","[-0.5074330016469263, -0.034667151208171196, 0...","[1.3519862967238052, 1.3519863514060733, 1.351..."


In [69]:
df_ratings.info()
df_ratings.head(100)

<class 'pandas.core.frame.DataFrame'>
Index: 198832 entries, 1 to 200948
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   pos     198832 non-null  object
 1   neg     198832 non-null  object
dtypes: object(2)
memory usage: 4.6+ MB


Unnamed: 0_level_0,pos,neg
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[16, 29, 31, 79, 109, 164, 174, 229, 257, 298,...","[24, 28, 33, 35, 108, 159, 220, 340, 351, 522,..."
2,"[30, 33, 38, 47, 183, 184, 205, 214, 216, 219,...","[151, 191, 228, 250, 292, 301, 339, 344, 461, ..."
3,"[9, 10, 16, 25, 61, 108, 148, 149, 159, 257, 2...","[1, 47, 139, 151, 156, 166, 183, 206, 228, 324..."
4,"[220, 1232, 2011, 2660, 2731, 3063]","[1172, 1285, 1452, 1732, 2320, 2382, 2491, 249..."
5,"[9, 108, 159, 163, 344, 351, 359, 375, 429, 44...","[46, 148, 151, 183, 206, 228, 250, 285, 288, 2..."
...,...,...
97,"[16, 351, 359, 375, 475, 542, 582, 585, 835, 1...","[148, 205, 206, 228, 1241, 1465, 2063, 2165, 2..."
98,"[585, 878, 882, 894, 1065, 1157, 2016, 2247, 2...","[1, 1808, 2388, 2589, 2660, 2661, 2770, 2886]"
99,"[49, 109, 289, 314, 536, 585, 599, 731, 873, 1...","[292, 2747, 14840]"
100,"[0, 1, 46, 49, 257, 289, 292, 314, 352, 495, 5...","[4, 30, 33, 102, 148, 163, 351, 359, 372, 475,..."


In [70]:
df_movies.info()
df_movies.head(100)

<class 'pandas.core.frame.DataFrame'>
Index: 82918 entries, 14840 to 29526
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   runtime              82918 non-null  float64
 1   if_blockbuster       82918 non-null  int32  
 2   highly_watched       82918 non-null  int32  
 3   highly_rated         82918 non-null  int64  
 4   engagement_score     82918 non-null  float64
 5   cast_importance      82918 non-null  float64
 6   director_score       82918 non-null  float64
 7   has_keywords         82918 non-null  int64  
 8   has_cast             82918 non-null  int64  
 9   has_director         82918 non-null  int64  
 10  genre_ids            82918 non-null  object 
 11  decade_[1890, 1900)  82918 non-null  bool   
 12  decade_[1900, 1910)  82918 non-null  bool   
 13  decade_[1910, 1920)  82918 non-null  bool   
 14  decade_[1920, 1930)  82918 non-null  bool   
 15  decade_[1930, 1940)  82918 non-null  

Unnamed: 0_level_0,runtime,if_blockbuster,highly_watched,highly_rated,engagement_score,cast_importance,director_score,has_keywords,has_cast,has_director,...,"decade_[1960, 1970)","decade_[1970, 1980)","decade_[1980, 1990)","decade_[1990, 2000)","decade_[2000, 2010)","decade_[2010, 2020)","decade_[2020, 2030)",text_embedded,actor_ids,director_ids
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14840,1.942703,1,1,1,4.591432,2.899632,2.653210,1,1,1,...,False,False,False,False,False,True,False,"[0.0008652941, 0.06077885, -0.07869467, -0.067...","[6454, 10631, 5457, 1952, 5950]",[797]
20922,2.432017,1,1,1,5.199338,2.789332,2.653210,1,1,1,...,False,False,False,False,False,True,False,"[-0.010866538, -0.01691181, -0.12693988, -0.04...","[659, 7298, 4974, 10576, 5292]",[797]
12164,2.033104,1,1,1,5.199338,3.099369,2.653210,1,1,1,...,False,False,False,False,True,False,False,"[-0.026262647, 0.055052526, -0.08173301, -0.01...","[1867, 3519, 7812, 1952, 4010]",[797]
14021,2.256745,1,1,1,4.123958,2.512635,2.304477,1,1,1,...,False,False,False,False,True,False,False,"[0.0031084684, -0.032840427, -0.12393689, -0.0...","[11434, 9935, 7629, 9574, 3709]",[2026]
16934,1.824556,1,1,1,5.199338,5.199338,1.817788,1,1,1,...,False,False,False,False,False,True,False,"[-0.015282603, 0.00047473708, -0.11172164, 0.0...","[9686, 1839, 1834, 9161, 4923]",[2496]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159,1.358913,1,1,1,3.003738,1.826226,0.704639,1,1,1,...,False,False,True,False,False,False,False,"[0.016825777, -0.02596536, -0.088541396, 0.058...","[3952, 693, 3373, 7093]",[1915]
4180,-0.276998,1,1,1,3.144047,2.695707,1.599158,1,1,1,...,False,False,False,False,True,False,False,"[0.05683445, -0.0026107691, -0.090642, 0.01653...","[2810, 1497, 7670, 10954, 5022]","[246, 4884]"
11313,0.908975,1,1,1,3.133301,1.566170,1.685018,1,1,1,...,False,False,False,False,True,False,False,"[-0.030460857, -0.0006409547, 0.00023257566, -...","[11136, 8598, 1274, 4226, 8408]",[544]
23316,1.497997,1,1,1,3.111676,2.686499,1.898700,1,1,1,...,False,False,False,False,False,True,False,"[-0.0011530533, 0.014149291, -0.10900124, -0.0...","[9597, 7471, 7114, 2054, 10367]",[3179]


In [71]:
#FOR QUICK TEST's

DEBUG = False

if DEBUG:
    df_users = df_users.sample(n=1028, random_state=42).copy()
    df_ratings = df_ratings[df_ratings['userId'].isin(df_users['userId'])].copy()


# Przygotowanie danych do uczenia -> do gotowych batchy

In [72]:
# For padding 'global max len'

max_len_a = int(df_movies['actor_ids'].str.len().max())
max_len_d = int(df_movies['director_ids'].str.len().max())
max_len_g = int(df_movies['genre_ids'].str.len().max())

In [73]:
# For nn.Embeedings usage in Item Tower
all_actor_ids = list(chain.from_iterable(df_movies['actor_ids']))
num_actors = max(all_actor_ids) + 1

all_director_ids = list(chain.from_iterable(df_movies['director_ids']))
num_directors = max(all_director_ids) + 1

all_genre_ids = list(chain.from_iterable(df_movies['genre_ids']))
num_genres = max(all_genre_ids) + 1

In [74]:
def collect_user_features(u):
        """
        Zwraca cztery tensory: movies_seq, ratings_seq, ts_seq, user_stats
        """
        movies_seq  = torch.tensor(u['movies_seq'], dtype=torch.long)
        ratings_seq = torch.tensor(u['ratings_seq'], dtype=torch.float32)
        ts_seq      = torch.tensor(u['ts_seq'], dtype=torch.float32)
       
        stats_cols  = [c for c in u.index if c.startswith(('num_rating','avg_rating','weekend_watcher','genre_','type_of_viewer_'))]
        user_stats  = torch.tensor(u[stats_cols]
                                        .astype('float32').values,dtype=torch.float32)

        return movies_seq, ratings_seq, ts_seq, user_stats

In [75]:
def collect_movie_features(m, max_len_a, max_len_d, max_len_g):
        """
        Zwraca cztery tensory: combined, actor_ids, director_ids, genre_ids
        """
        numeric = [
            m.runtime,
            m.engagement_score,
            m.cast_importance,
            m.director_score,
        ]
        binary = [
            m.if_blockbuster,
            m.highly_watched,
            m.highly_rated,
            m.has_keywords,
            m.has_cast,
            m.has_director,
        ]
        decades = (m[[c for c in m.index if c.startswith('decade_')]]
                   .astype(int)
                   .tolist())

        dense_feats = torch.tensor(numeric + binary + decades, dtype=torch.float32)
        text_emb = torch.tensor(m.text_embedded, dtype=torch.float32)

        def pad(seq, L):
            seq_list = list(seq) if not isinstance(seq, list) else seq
            padded = seq_list[:L] + [0] * max(0, L - len(seq_list))
            return torch.tensor(padded, dtype=torch.long)

        actor_ids    = pad(m.actor_ids,    max_len_a)
        director_ids = pad(m.director_ids, max_len_d)
        genre_ids    = pad(m.genre_ids,    max_len_g)

        return dense_feats, text_emb, actor_ids, director_ids, genre_ids

In [None]:
import faiss
'''
Do zbudowania macierzy embeedingow dla FAISS, do szyukania najblizszych sasiadow
'''

unique_ids = df_movies.index.tolist()
movie_vecs = []

for m_id in unique_ids:
    dense_feats, text_emb, *_ = collect_movie_features(
        df_movies.loc[m_id],
        max_len_a, max_len_d, max_len_g
    )
    combined = torch.cat([dense_feats, text_emb], dim=0)
    # normalizujemy L2 na potrzeby FAISS cosinusowego (wyplaszczanie)
    movie_vecs.append(F.normalize(combined, dim=0))

movie_matrix = torch.stack(movie_vecs)  # macierz [n_movies, D]
movie_matrix_np = movie_matrix.cpu().numpy().astype('float32')
# FAISS IP po L2-normalizacji = cosine similarity
faiss_index = faiss.IndexFlatIP(movie_matrix_np.shape[1])
faiss_index.add(movie_matrix_np)

In [57]:
# DO OCENY I EWENTUALNYCH ZMIAN
def find_negative(pos_id, user_negs, top_k=25):
    """
    Dla danego pozytywu (pos_id) szuka w FAISS najbliższego negatywu z listy user_negs. Jeśli żaden z top_k nie należy do user_negs to fallback = losowy wybór z user_negs.
    """
    # Zakladamy co najmniej jeden pos_id
    D, I = faiss_index.search(movie_matrix_np[pos_id].reshape(1, -1), top_k)

    for candidate in I[0]:
        if candidate in user_negs:
            return candidate

    return random.choice(list(user_negs))  # fallback

In [58]:
class TwoTowerDataset(Dataset):

    def __init__(self, df_users, df_ratings, df_movies):
        self.df_users = df_users.reset_index(drop=True)
        self.df_ratings = df_ratings
        self.df_movies = df_movies

    def __len__(self):
        return len(self.df_users)

    def __getitem__(self, idx):
        # User features
        u_row = self.df_users.iloc[idx]
        movies_seq, ratings_seq, ts_seq, user_stats = collect_user_features(u_row)
        user_id = u_row['userId']

        pos_list = self.df_ratings.at[user_id, 'pos']
        neg_list = self.df_ratings.at[user_id, 'neg']

        #BPR
        pos_id = random.choice(pos_list)
        neg_id = find_negative(pos_id,set(neg_list))

        m_pos = self.df_movies.loc[pos_id]
        m_neg = self.df_movies.loc[neg_id]

        # Movie features
        pos_feats, pos_text, pos_actors, pos_directors, pos_genres = collect_movie_features(m_pos, max_len_a, max_len_d, max_len_g)
        neg_feats, neg_text, neg_actors, neg_directors, neg_genres = collect_movie_features(m_neg, max_len_a, max_len_d, max_len_g)

        return {
            'user': {
                'user_statistics': user_stats,
                'movies': movies_seq,
                'ratings': ratings_seq,
                'times': ts_seq,
            },
            'pos_item': {
                'dense_features': pos_feats,
                'text_embedding': pos_text,
                'actor_ids': pos_actors,
                'director_ids': pos_directors,
                'genre_ids': pos_genres,
            },
            'neg_item': {
                'dense_features': neg_feats,
                'text_embedding': neg_text,
                'actor_ids': neg_actors,
                'director_ids': neg_directors,
                'genre_ids': neg_genres,
            }
        }

In [59]:
'''
TEST DATASETU I ODPOWIEDNIEGO OUTPUTU POJEDYNCZEGO OBIEKTU GET_ITEM
'''
dataset_test = TwoTowerDataset(df_users, df_ratings, df_movies)

sample0 = dataset_test[0]

print("Keys:", sample0.keys())
print("\n--- USER ---")
for k,v in sample0['user'].items():
    print(f" user[{k}]:", type(v), getattr(v, "shape", v[:5] if isinstance(v,list) else v))

print("\n--- POS ITEM ---")
for k,v in sample0['pos_item'].items():
    print(f" pos_item[{k}]:", type(v), v.shape if hasattr(v,'shape') else v[:5])

print("\n--- NEG ITEM ---")
for k,v in sample0['neg_item'].items():
    print(f" neg_item[{k}]:", type(v), v.shape if hasattr(v,'shape') else v[:5])

KeyError: "None of ['userId'] are in the columns"

In [21]:
def collate_TT(batch):
    '''
    Pelny batchowanie danych do uczenia
    '''
    user_movies, user_ratings, user_times, user_stats = [], [], [], []
    pos_dense, pos_text, pos_actor, pos_director, pos_genre = [], [], [], [], []
    neg_dense, neg_text, neg_actor, neg_director, neg_genre = [], [], [], [], []

    for row in batch:

        user_stats.append(row['user']['user_statistics'])
        user_movies.append(row['user']['movies'])
        user_ratings.append(row['user']['ratings'])
        user_times.append(row['user']['times'])

        pos_dense.append(row['pos_item']['dense_features'])
        pos_text.append(row['pos_item']['text_embedding'])
        pos_actor.append(row['pos_item']['actor_ids'])
        pos_director.append(row['pos_item']['director_ids'])
        pos_genre.append(row['pos_item']['genre_ids'])

        neg_dense.append(row['neg_item']['dense_features'])
        neg_text.append(row['neg_item']['text_embedding'])
        neg_actor.append(row['neg_item']['actor_ids'])
        neg_director.append(row['neg_item']['director_ids'])
        neg_genre.append(row['neg_item']['genre_ids'])

    batch_user = {
        'user_statistics': torch.stack(user_stats),     # [B, d_stats]
        'movies': torch.stack(user_movies),             # [B, L_u]
        'ratings': torch.stack(user_ratings),           # [B, L_u]
        'times': torch.stack(user_times),               # [B, L_u]
    }

    batch_pos_item = {
        'dense_features': torch.stack(pos_dense),# [B, dense_feat_dim]
        'text_embedding': torch.stack(pos_text),         # [B, text_emb_dim]
        'actor_ids': torch.stack(pos_actor),     # [B, max_len_a]
        'director_ids':torch.stack(pos_director),# [B, max_len_d]
        'genre_ids': torch.stack(pos_genre),     # [B, max_len_g]
    }

    batch_neg_item = {
        'dense_features': torch.stack(neg_dense),
        'text_embedding': torch.stack(neg_text),
        'actor_ids': torch.stack(neg_actor),
        'director_ids': torch.stack(neg_director),
        'genre_ids': torch.stack(neg_genre),
    }

    return {
      'user': batch_user,
      'pos_item': batch_pos_item,
      'neg_item': batch_neg_item
    }

In [22]:
def collateUser(batch):
    '''
    Przygotowujemy batch zawierajace dane tylko user-a, potrzebne do leave-one-out
    '''
    movies, ratings, times, stats = [], [], [], []

    for row in batch:

        movies.append(row['user']['movies'])
        ratings.append(row['user']['ratings'])
        times.append(row['user']['times'])
        stats.append(row['user']['user_statistics'])

    return {
        'user': {
            'user_statistics': torch.stack(stats),  # [B, d_stats]
            'movies': torch.stack(movies),          # [B, L_u]
            'ratings': torch.stack(ratings),        # [B, L_u]
            'times': torch.stack(times)             # [B, L_u]
        }
    }

# Przygotowanie zbiorów do treningu

In [23]:
BATCH_SIZE = 4 # FOR TEST: 4
train_users, val_users = train_test_split(
    df_users,
    test_size=0.2,
    random_state=213
)

mask_train = df_ratings.index.isin(train_users['userId'])
train_ratings = df_ratings[mask_train].copy()

mask_val = df_ratings.index.isin(val_users['userId'])
val_ratings = df_ratings[mask_val].copy()

In [24]:
'''
Tworzymy do pozniejszej walidacji leave-one-out w heavy_evaluate
'''
mask = val_ratings['pos'].apply(lambda lst: len(lst) >= 2)
val_ratings = val_ratings[mask].copy()

val_loocv = []

for user_id, row in val_ratings.iterrows():
    pos_list = row['pos']           # wwszytkie pos (wieksze od > 2)
    hold = pos_list[-1]             # Bierzemy ostatni do hold-out
    train = pos_list[:-1]

    val_loocv.append({'userId': user_id, 'pos': [hold]})
    val_ratings.at[user_id, 'pos'] = train

val_loocv = pd.DataFrame(val_loocv).set_index('userId')

In [25]:
'''
Przygotowujemy dane potrzebne do leave-one-out
'''
train_pos_sets = {
    user_id: set(lst)
    for user_id, lst in train_ratings['pos'].items()
}

test_pos = val_loocv['pos'].to_dict()

all_user_ids = list(val_ratings.index)

In [26]:
train_dataset = TwoTowerDataset(
    train_users,
    train_ratings.reset_index(),    # tutaj df_ratings=pos do treningu
    df_movies
)
val_dataset = TwoTowerDataset(
    val_users,
    val_loocv.reset_index(),        # tu trzymamy tylko hold-out w kolumnie pos
    df_movies
)

In [27]:
train_loader = DataLoader(
    dataset       = train_dataset,
    batch_size    = BATCH_SIZE,
    shuffle       = True,
    num_workers   = 4,
    pin_memory    = True,
    collate_fn    = collate_TT,
    drop_last     = False
)
val_loader = DataLoader(
    dataset       = val_dataset,
    batch_size    = BATCH_SIZE,
    shuffle       = False,  # Musi byc FALSE
    num_workers   = 4,
    pin_memory    = True,
    collate_fn    = collateUser,
    drop_last     = False
)

In [28]:
'''
TEST CUSTOMOWEJ FUNKCJI collateTT I DATALOADER-OW
'''
device = torch.device("cuda")
dataset_test = TwoTowerDataset(df_users, df_ratings, df_movies)

loader_test_full = DataLoader(
    dataset_test,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_TT,
)

batch_test = next(iter(loader_test_full))

print("=== USER ===")
for k,v in batch_test['user'].items():
    print(f"{k:10s}:", v.shape)

print("\n=== POS ITEM ===")
for k,v in batch_test['pos_item'].items():
    print(f"{k:15s}:", v.shape)

print("\n=== NEG ITEM ===")
for k,v in batch_test['neg_item'].items():
    print(f"{k:15s}:", v.shape)

=== USER ===
user_statistics: torch.Size([4, 25])
movies    : torch.Size([4, 20])
ratings   : torch.Size([4, 20])
times     : torch.Size([4, 20])

=== POS ITEM ===
dense_features : torch.Size([4, 24])
text_embedding : torch.Size([4, 300])
actor_ids      : torch.Size([4, 5])
director_ids   : torch.Size([4, 3])
genre_ids      : torch.Size([4, 9])

=== NEG ITEM ===
dense_features : torch.Size([4, 24])
text_embedding : torch.Size([4, 300])
actor_ids      : torch.Size([4, 5])
director_ids   : torch.Size([4, 3])
genre_ids      : torch.Size([4, 9])


In [29]:
loader_test_user = DataLoader(
    dataset_test,
    batch_size=4,
    shuffle=False,
    collate_fn=collateUser,
)

batch_user = next(iter(loader_test_user))

print("\n=== USER-ONLY BATCH (collateUser) ===")
for k, v in batch_user['user'].items():
    print(f"{k:12s} ->", v.shape)


=== USER-ONLY BATCH (collateUser) ===
user_statistics -> torch.Size([4, 25])
movies       -> torch.Size([4, 20])
ratings      -> torch.Size([4, 20])
times        -> torch.Size([4, 20])


# ARCHITEKTURA TWO TOWER

In [30]:
EMB_DIM = 64

class UserTower(nn.Module):
    def __init__(self, input_dim, n_items, embedding_dim=EMB_DIM):
        '''
        input_dim - the number of columns in user features, without sequence columns
        '''
        super().__init__()

        self.item_emb = nn.Embedding(n_items, embedding_dim)

        # A layer to project rating and timestamp into a scalar weight
        self.rating_proj = nn.Linear(2, 1)

        self.mlp = nn.Sequential(
            nn.Linear(input_dim + embedding_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 384),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(384, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, embedding_dim)
        )

    def forward(self, batch):
        # Embed movieIds liked by user
        m = self.item_emb(batch['movies'])

        # Get weights
        x = torch.stack([batch['ratings'], batch['times']], dim=-1) # [B, L_u, 2]
        w = torch.sigmoid(self.rating_proj(x))

        # weighted mean-pool
        pooled = (m * w).sum(1) / w.sum(1).clamp_min(1e-6)   # [B, D]

        input = torch.cat([batch['user_statistics'], pooled], dim=-1) # [B, stats+EMB_DIM]
        output = self.mlp(input)                                    # [B, EMB_DIM]
        u = F.normalize(output, dim = 1)
        return u


class ItemTower(nn.Module):
    def __init__(self,dense_feat_dim,text_emb_dim,vocab_sizes,embedding_dim=EMB_DIM):
        '''
        vocab_sizes - tuple odpowiednio n_actors, n_directors, n_genres
        dense_feat_dim – wymiary numeric+binary+decades+text
        tex_emb_dim - Wektor o wielkosc 300 opisujacy dane tekstowe filmu
        '''
        super().__init__()

        self.actor_emb = nn.Embedding(vocab_sizes[0], embedding_dim)
        self.director_emb = nn.Embedding(vocab_sizes[1], embedding_dim)
        self.genre_emb = nn.Embedding(vocab_sizes[2], embedding_dim)

        self.meta_mlp = nn.Sequential(
            nn.Linear(dense_feat_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, embedding_dim),
            nn.ReLU()
        )

        self.text_mlp = nn.Sequential( #--- to consider za ostre zejscie z 512 -> 64, moze posredni 256
            nn.Linear(text_emb_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, embedding_dim),
            nn.ReLU()
        )

        MLP_INPUT_DIM = embedding_dim*5 # odpowiednio nn.Embeedings * 3 oraz meta_mlp oraz text_mlp
        self.final_mlp = nn.Sequential(
            nn.Linear(MLP_INPUT_DIM, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512,256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256,embedding_dim)
        )

    def forward(self, batch, key: str = "pos_item"):

        dense_feats = batch[key]['dense_features']     # [B, dense_feat_dim]
        text_emb = batch[key]['text_embedding']     # [B, text_emb_dim]

        actor_ids = batch[key]['actor_ids']         # [B, max_len_a]
        director_ids = batch[key]['director_ids']
        genre_ids = batch[key]['genre_ids']

        dense_vec = self.meta_mlp(dense_feats)      # [B, D]
        text_vec = self.text_mlp(text_emb)          # [B, D]

        cast_imp = dense_feats[:, 2:3]              # [B, 1]
        director_score = dense_feats[:, 3:4]        # [B, 1]

        a = self.actor_emb   (actor_ids).mean(dim=1)    # [B, D]
        d = self.director_emb(director_ids).mean(dim=1) # [B, D]
        g = self.genre_emb   (genre_ids).mean(dim=1)    # [B, D]

        # We add weights based on importance score
        a = a * cast_imp
        d = d * director_score #--- do rozwazenia Max pooling lub Attention pooling

        input = torch.cat([a, d, g, dense_vec, text_vec], dim=-1)   # [B, 5D]
        output = self.final_mlp(input)                              # [B, D]
        i = F.normalize(output, dim=1)
        return i


In [31]:
class TwoTowerModel(nn.Module):
    def __init__(self, stats_dim, n_items, vocab_sizes,
                 dense_feat_dim, text_emb_dim, embedding_dim=EMB_DIM):
        super().__init__()
        self.user_tower = UserTower(stats_dim, n_items, embedding_dim)
        self.item_tower = ItemTower(dense_feat_dim, text_emb_dim, vocab_sizes, embedding_dim)

    def forward(self, batch):
        u = self.user_tower(batch['user'])
        i_pos = self.item_tower(batch, key="pos_item")
        i_neg = self.item_tower(batch, key="neg_item")

        return u, i_pos, i_neg # każdy [B, 64]


In [32]:
'''
TEST ARCHITEKTURY MODELOW
'''
device = torch.device("cuda")
model_test  = TwoTowerModel(stats_dim=25,
                       n_items=n_items,
                       vocab_sizes=(num_actors, num_directors, num_genres),
                       dense_feat_dim=24,
                       text_emb_dim=300,
                       embedding_dim=64).to(device)

# First batch
batch_test_2 = next(iter(loader_test_full))

batch_test_2 = {
  'user':      {k: v.to(device, non_blocking=True) for k,v in batch_test_2['user'].items()},
  'pos_item':  {k: v.to(device, non_blocking=True) for k,v in batch_test_2['pos_item'].items()},
  'neg_item':  {k: v.to(device, non_blocking=True) for k,v in batch_test_2['neg_item'].items()},
}

# Forward pass
u_test, i_pos_test, i_neg_test = model_test(batch_test_2)

print("u.shape:",     u_test.shape)      # -> [B, 64]
print("i_pos.shape:", i_pos_test.shape)  # -> [B, 64]
print("i_neg.shape:", i_neg_test.shape)  # -> [B, 64]


u.shape: torch.Size([4, 64])
i_pos.shape: torch.Size([4, 64])
i_neg.shape: torch.Size([4, 64])


# TRENOWANIE

In [33]:
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.mps.is_available():
    device = torch.device('mps')
print('Device:', device)

Device: cuda


In [34]:
def to_device(data, device):
    if isinstance(data, dict):
        return {k: to_device(v, device) for k, v in data.items()}
    elif torch.is_tensor(data):
        return data.to(device)
    else:
        return data

In [35]:
'''
Definicja loss-u BPR (Bayesian Personalized Ranking)
'''
def bpr_loss(u, i_pos, i_neg):
    pos = (u*i_pos).sum(1) # [B] score pozytywnych par
    neg = (u*i_neg).sum(1)
    return -torch.log(torch.sigmoid(pos-neg) + 1e-8).mean()

In [36]:
'''
Trenowanie jednej epoki, dodano odpowiednie inputy tez do testow i ewentualnych zmian

Obecnie:
- model: TwoTowerModel
- loader: DataLoader
- optimizer: Adam
- loss: bpr_loss
'''
def train_one_epoch(model, loader, optimizer):
    model.train()
    running_loss = 0.0

    for raw in tqdm(train_loader, desc=f" Epoch {epoch} batches", leave=False):
        batch = to_device(raw, device)
        optimizer.zero_grad()

        user_vec, pos_vec, neg_vec = model(batch) # forward do TwoTowerModel

        loss = bpr_loss(user_vec, pos_vec, neg_vec)

        loss.backward() # Backword i updatujemy parametry
        optimizer.step()

        running_loss += loss.item()

    epoch_loss = running_loss/len(loader) # Do wyliczania sredniej straty w epoce
    return epoch_loss

In [37]:
'''
Lekka ewaluacja majaca za zadanie pokazac czy model sie uczy, niz odpowiadac jak dobrze tworzy ranking
'''
def light_evaluate(model, loader):
    model.eval()
    aucs, paac = [], []

    with torch.no_grad():
        for raw in loader:
            batch = to_device(raw, device)

            user_vec, pos_vec, neg_vec = model(batch)

            pos_score = (user_vec * pos_vec).sum(dim = -1) # [B]
            neg_score = (user_vec * neg_vec).sum(dim = -1)

            # ROC AUC
            labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
            scores = torch.cat([pos_score, neg_score])
            aucs.append(roc_auc_score(labels.cpu(), scores.cpu()))

            # Pair-wise accuarcy
            paac.append((pos_score > neg_score).float().mean().item())

    return float(np.mean(aucs)), float(np.mean(paac))

In [38]:
'''
Dokladniejsza ewaluacja majaca odpowiedziec jak model radzi sobie z rankingiem dla danych uzytkownikow
'''
def heavy_evaluate(model,user_loader,item_embs_np,
                        train_pos_sets,test_pos,top_N=10):
    model.eval()
    user_embs = []

    with torch.no_grad():
        for raw in user_loader:
            batch = to_device(raw, device)

            u, _, _ = model(batch)  # Skupiamy sie tylko na zebraniu embeddingow uzytkownika

            user_embs.append(u.cpu().numpy())

    user_embs = np.vstack(user_embs)    # [U-liczba uzytkownikow, D]

    assert len(all_user_ids) == user_embs.shape[0]
    recalls, mrrs = [], []

    for idx, user_id in enumerate(all_user_ids):
        vec = user_embs[idx]                # [D] wektor emb usera
        scores = item_embs_np @ vec         # [I] wektory score, do oceny czy to dziala poprawnie ?

        mask = np.zeros_like(scores, dtype=bool)
        mask[list(train_pos_sets[user_id])] = True  # Tworzymy maske do odsiania filmow ktore user juz widzial
        scores[mask] = -1e9

        ranked = np.argsort(-scores)[:top_N]    # Ranking
        true_set = test_pos[user_id]            # hold-out

        # Recall@K
        recalls.append(int(any(r in true_set for r in ranked)))

        # MRR@K
        rr = 0.0
        for rank, idx in enumerate(ranked, 1):
            if idx in true_set:
                rr = 1.0/rank
                break
        mrrs.append(rr)

    return float(np.mean(recalls)), float(np.mean(mrrs))

In [39]:
EPOCHS = 50
model = (TwoTowerModel(stats_dim=25,
                       n_items=n_items,
                       vocab_sizes=(num_actors, num_directors, num_genres),
                       dense_feat_dim=24,
                       text_emb_dim=300,
                       embedding_dim=EMB_DIM)
         .to(device))
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS) # zmieniamy LR zgodnie z kosinusem (powinno stabilizowac trening)

for epoch in trange(1, EPOCHS+1, desc="Epochs"):

    tr_loss = train_one_epoch(model, train_loader, optimizer) # Logika treningu

    scheduler.step() # optymalizacja LR

    if epoch % 5 == 0:
        auc, pair_acc = light_evaluate(model, val_loader, device)
        print(f"Epoch {epoch:2d} | train_loss={tr_loss:.4f} | "f"val ROC-AUC={auc:.4f} | pair-acc={pair_acc:.4f}")
    else:
        print(f"Epoch {epoch:2d} | train_loss={tr_loss:.4f}")

Epochs:   0%|          | 0/50 [00:00<?, ?it/s]

 Epoch 1 batches:   0%|          | 0/39767 [00:00<?, ?it/s]

KeyError: 196885