In [None]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import TensorDataset

from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np

from datetime import datetime
from tqdm import tqdm

import random
from pathlib import Path

from sklearn.model_selection import train_test_split
from itertools import chain

from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm, trange

import math
import faiss

In [None]:
import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'                # Tylko do debugging

# Report the CUDA environment up front so an accidental CPU-only run is noticed early.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Number of GPUs:", torch.cuda.device_count())
    # Name the device that is actually current instead of always device 0, so the
    # printed index and the printed name cannot disagree on multi-GPU machines.
    print("Current device:", torch.cuda.current_device(),
          torch.cuda.get_device_name(torch.cuda.current_device()))

In [None]:
BASE_DIR = Path(os.getcwd()).parent.parent
DATA_DIR = BASE_DIR / "data"

df_users = pd.read_parquet(DATA_DIR / 'user_features_clean_warm.parquet')

df_movies = pd.read_parquet(DATA_DIR / 'Movies_clean_Vec_v4_25keywords.parquet')

df_ratings = pd.read_parquet(DATA_DIR / 'ratings_groupped_20pos.parquet')

df_LOOCV = pd.read_parquet(DATA_DIR / 'ratings_LOOCV.parquet')

In [None]:
df_movies.info()
df_ratings.info()
df_users.info()
df_LOOCV.info()

In [None]:
print("=== DEBUGGING LOOCV vs USERS ===")
print(f"df_users shape: {df_users.shape}")
print(f"df_LOOCV shape: {df_LOOCV.shape}")
print(f"df_users userId count: {df_users['userId'].nunique()}")
print(f"df_LOOCV userId count: {df_LOOCV['userId'].nunique()}")

users_set = set(df_users['userId'])
loocv_set = set(df_LOOCV['userId'])

print(f"Same users? {users_set == loocv_set}")
print(f"Users not in LOOCV: {len(users_set - loocv_set)}")
print(f"LOOCV not in users: {len(loocv_set - users_set)}")

if len(users_set) == len(loocv_set) and users_set == loocv_set:
    print("df_LOOCV contains all users from df_users")
else:
    print("df_LOOCV is subset/different from df_users")


# Sprawdzenie pokrycia movieId

In [None]:
user_ids = set(df_users['userId'])
ratings_user_ids = set(df_ratings['userId'])

In [None]:
print(f"Users w ratings: {len(user_ids & ratings_user_ids):,}/{len(user_ids):,}")

In [None]:
mids_pos = set(x for lst in df_ratings['pos'] for x in lst)
mids_seen = set(x for lst in df_ratings['seen'] for x in lst)
all_rated_movies = mids_pos | mids_seen
available_movies = set(df_movies['movieId'])
missing_movies = all_rated_movies - available_movies

In [None]:
print(f"Pokrycie filmów:")
print(f"Filmy w pos ratings: {len(mids_pos):,}")
print(f"Filmy w seen ratings: {len(mids_seen):,}")
print(f"Brakujące filmy w df_movies: {len(missing_movies):,}")

In [None]:
import pandas as pd

def _count_users_per_movie(list_column, movie_ids):
    """
    Count, for every id in `movie_ids`, how many rows of `list_column` contain it.

    Args:
        list_column: iterable of per-user movie-id lists (e.g. a DataFrame column).
        movie_ids: collection of movie ids to count.

    Returns:
        dict mapping each id in `movie_ids` to the number of rows that contain it.
    """
    movie_ids = set(movie_ids)
    counts = dict.fromkeys(movie_ids, 0)
    # One pass over the column instead of one full scan per movie id.
    for lst in list_column:
        # The intersection yields each matching id once per row, which mirrors the
        # `m in lst` membership semantics (a repeated id still counts one user).
        for m in movie_ids.intersection(lst):
            counts[m] += 1
    return counts


# For each movie that is rated but absent from df_movies: how many users are affected.
pos_user_counts = _count_users_per_movie(df_ratings['pos'], missing_movies)
seen_user_counts = _count_users_per_movie(df_ratings['seen'], missing_movies)

df_missing_stats = (
    pd.DataFrame({
        'pos_users': pos_user_counts,
        'seen_users': seen_user_counts,
    })
    .sort_values(['pos_users','seen_users'], ascending=False)
)
print(df_missing_stats)

In [11]:
# Drop every rated movie id that has no feature row in df_movies.
valid_ids = set(df_movies['movieId'])
df_ratings['pos'] = df_ratings['pos'].apply(lambda lst: [m for m in lst if m in valid_ids])
df_ratings['seen'] = df_ratings['seen'].apply(lambda lst: [m for m in lst if m in valid_ids])

# Keep only users that still have at least one positive and one seen movie.
# The explicit .copy() makes the result an independent frame, so the column
# assignments performed on df_ratings in later cells do not raise
# chained-assignment (SettingWithCopy) warnings.
df_ratings = df_ratings[df_ratings['pos'].map(len).gt(0) & df_ratings['seen'].map(len).gt(0)].copy()

In [12]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157023 entries, 0 to 157022
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   userId  157023 non-null  int64 
 1   seen    157023 non-null  object
 2   pos     157023 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.6+ MB


# Przygotowanie movieId dla datasetów

In [13]:
'''
Sanity check ratingow (powinno byc 19, poniewaz jeden w LOOCV)
'''
single_pos_users = (df_ratings['pos'].apply(len) < 19).sum()

print(f"Liczba użytkowników z mniej niz 19 pozytywnymi ratingami: {single_pos_users}")

Liczba użytkowników z mniej niz 19 pozytywnymi ratingami: 0


In [14]:
empty_pos_ratings = df_ratings['pos'].apply(lambda x: len(x) == 0).sum()
empty_seen_ratings = df_ratings['seen'].apply(lambda x: len(x) == 0).sum()

if empty_pos_ratings != 0 or empty_seen_ratings != 0:
    print(f'Empty ratings: pos: {empty_pos_ratings}, seen: {empty_seen_ratings}')
    raise Exception("Users without a single pos/neg rating exist in the ratings_groupped_ids dataset")

In [15]:
unique_ids = set(
        df_users['movies_seq'].explode().tolist()
        + df_ratings['pos'].explode().tolist() 
        + df_ratings['seen'].explode().tolist()
        + df_LOOCV['holdout_movieId'].tolist()
    )

print('Unique movieIds:', len(unique_ids))
unique_ids = sorted(unique_ids)

movieId_to_idx = {id_: idx for idx, id_ in enumerate(unique_ids)}
print('min idx:', min(movieId_to_idx.values()))
print('max idx:', max(movieId_to_idx.values()))

n_items = len(unique_ids)

assert min(movieId_to_idx.values()) == 0
assert max(movieId_to_idx.values()) == n_items - 1

Unique movieIds: 84133
min idx: 0
max idx: 84132


In [16]:
# Re-encode every raw movieId as its contiguous index from movieId_to_idx.
# NOTE: this cell is not idempotent. The columns are overwritten in place and
# 'userId' / 'movieId' are moved into the index, so it must run exactly once
# per kernel session, after movieId_to_idx has been built.
df_users['movies_seq'] = df_users['movies_seq'].apply(lambda lst: [movieId_to_idx[m] for m in lst])
df_ratings['pos'] = df_ratings['pos'].apply(lambda lst: [movieId_to_idx[m] for m in lst])
df_ratings['seen'] = df_ratings['seen'].apply(lambda lst: [movieId_to_idx[m] for m in lst])
# From here on df_ratings is addressed by userId (df_ratings.loc[user_id]).
df_ratings = df_ratings.set_index('userId')

# Keep only movies whose id is a key of movieId_to_idx, then index the frame by the new id.
df_movies = df_movies[df_movies['movieId'].isin(movieId_to_idx)].copy()
df_movies['movieId'] = df_movies['movieId'].map(movieId_to_idx)
df_movies = df_movies.set_index('movieId')

# .map() leaves NaN for ids that are missing from the dict; the last assert guards against that.
df_LOOCV['holdout_movieId'] = df_LOOCV['holdout_movieId'].map(movieId_to_idx)

# Sanity checks: every re-encoded id must be a valid index below n_items.
assert df_users['movies_seq'].explode().max() < n_items
assert df_ratings['pos'].explode().max() < n_items
assert df_ratings['seen'].explode().max() < n_items

assert df_movies.index.max() < n_items
assert df_movies.index.notna().all()

assert df_LOOCV['holdout_movieId'].notna().all()

In [17]:
max_movie_idx = df_users['movies_seq'].explode().max()
print("max_movie_idx =", max_movie_idx)
print("n_items =", n_items)

assert max_movie_idx < n_items, "Indeks filmu przekracza rozmiar embeddingu"

max_movie_idx = 84132
n_items = 84133


In [18]:
def has_invalid_entries(seq_col):
    """
    Tell whether any element of a column of sequences is a sentinel value.

    Args:
        seq_col: pandas Series whose cells are list-like sequences.

    Returns:
        Truthy when at least one flattened element is -1, NaN or None.
    """
    flattened = seq_col.explode()
    sentinel_hits = flattened.isin([-1, np.nan, None])
    return sentinel_hits.any()

print("Zawiera niepoprawne wartości (train):", has_invalid_entries(df_users['movies_seq']))

Zawiera niepoprawne wartości (train): False


In [19]:
df_users.info()
df_users.head(100)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157023 entries, 0 to 157022
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   userId                   157023 non-null  int64  
 1   num_rating               157023 non-null  float64
 2   avg_rating               157023 non-null  float64
 3   weekend_watcher          157023 non-null  float64
 4   genre_Action             157023 non-null  float64
 5   genre_Adventure          157023 non-null  float64
 6   genre_Animation          157023 non-null  float64
 7   genre_Comedy             157023 non-null  float64
 8   genre_Crime              157023 non-null  float64
 9   genre_Documentary        157023 non-null  float64
 10  genre_Drama              157023 non-null  float64
 11  genre_Family             157023 non-null  float64
 12  genre_Fantasy            157023 non-null  float64
 13  genre_History            157023 non-null  float64
 14  genr

Unnamed: 0,userId,num_rating,avg_rating,weekend_watcher,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,genre_Documentary,...,genre_TV Movie,genre_Thriller,genre_War,genre_Western,type_of_viewer_negative,type_of_viewer_neutral,type_of_viewer_positive,movies_seq,ratings_seq,ts_seq
0,1,-0.174005,-0.542931,0.0,0.839846,-0.550914,-0.300797,-0.598723,0.840804,-1.201041,...,-0.429247,-0.177017,-0.582586,-1.035140,0.0,1.0,0.0,"[1230, 1262, 905, 2156, 220, 1122, 229, 2790, ...","[5.0, 3.0, 5.0, 5.0, 3.0, 4.0, 5.0, 1.0, 1.0, ...","[0.7177751660346985, 0.7178045511245728, 0.717..."
1,2,-0.461647,1.083367,0.0,0.612983,0.852558,1.556680,0.872001,0.310369,0.646536,...,0.874475,1.018953,-1.428353,0.693547,0.0,0.0,1.0,"[33, 587, 184, 183, 273, 546, 292, 359, 545, 5...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 1.0, 5.0, 4.0, ...","[0.717822253704071, 0.7178265452384949, 0.7178..."
2,3,-0.154613,-0.418264,0.0,-0.095961,-0.079469,0.293370,-0.883915,-0.903511,-0.344475,...,-0.329308,-0.854894,0.094026,-0.013643,0.0,1.0,0.0,"[1339, 546, 2609, 352, 362, 1063, 1868, 1655, ...","[4.0, 4.0, 3.0, 4.0, 2.5, 4.0, 5.0, 4.0, 3.0, ...","[0.717746376991272, 0.7178027629852295, 0.7178..."
3,7,-0.487502,-0.312549,1.0,-1.088483,-0.695389,-0.344935,-0.167619,-0.102191,1.710292,...,-0.244561,-0.234240,-0.459566,-0.354142,0.0,1.0,0.0,"[584, 160, 163, 292, 580, 334, 18, 578, 429, 5...","[3.0, 5.0, 3.0, 5.0, 4.0, 2.0, 3.0, 3.0, 3.0, ...","[0.7177982330322266, 0.7178294658660889, 0.717..."
4,8,-0.529518,1.201041,0.0,1.251033,1.038312,0.792688,0.629423,1.006565,0.724196,...,0.968809,1.396627,1.108946,0.766209,0.0,0.0,1.0,"[9337, 7236, 9424, 9431, 4122, 46, 10703, 3907...","[5.0, 4.5, 4.0, 5.0, 4.5, 5.0, 4.5, 3.5, 4.0, ...","[-1.1275984048843384, -1.1275787353515625, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,114,-0.164309,1.048608,0.0,0.993575,1.344881,0.922808,0.914387,0.361939,0.623597,...,0.846611,0.924534,0.550740,0.672084,0.0,0.0,1.0,"[3479, 20094, 7872, 32708, 15529, 20628, 8270,...","[5.0, 3.5, 4.0, 4.0, 5.0, 4.5, 4.0, 4.5, 4.0, ...","[-0.6521849632263184, -0.6399586200714111, -0...."
96,115,0.931316,-0.005959,1.0,0.542089,0.563933,0.749934,-0.101945,-0.205331,-0.837124,...,0.001217,0.261052,-0.023080,0.054457,0.0,1.0,0.0,"[4452, 5310, 4638, 2514, 6259, 1092, 5215, 490...","[3.5, 4.5, 3.0, 4.0, 5.0, 3.0, 2.0, 5.0, 4.0, ...","[0.7177687883377075, 0.717777669429779, 0.7177..."
97,116,-0.377617,-0.203292,0.0,-0.055450,-0.353212,1.729554,-0.300459,-0.357979,-0.202603,...,-0.156975,-0.291463,0.474621,-0.100951,0.0,1.0,0.0,"[6247, 3601, 3873, 1834, 1583, 2620, 8270, 106...","[2.0, 4.0, 0.5, 4.5, 4.0, 4.0, 2.0, 3.0, 3.0, ...","[0.7178043723106384, 0.7178138494491577, 0.717..."
98,118,-0.035032,0.573437,0.0,0.404313,0.659065,0.593525,0.846395,0.790505,0.982459,...,0.465690,0.188069,0.232424,0.326856,0.0,0.0,1.0,"[2305, 578, 1237, 1154, 1175, 10027, 6684, 556...","[4.0, 3.5, 4.0, 4.0, 4.5, 3.5, 5.0, 4.5, 4.5, ...","[0.7177798748016357, 0.7177939414978027, 0.717..."


In [20]:
df_ratings.info()
df_ratings.head(100)

<class 'pandas.core.frame.DataFrame'>
Index: 157023 entries, 1 to 200948
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   seen    157023 non-null  object
 1   pos     157023 non-null  object
dtypes: object(2)
memory usage: 3.6+ MB


Unnamed: 0_level_0,seen,pos
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[2985, 522, 1975, 1018, 2995, 1978, 16, 24, 53...","[1230, 905, 2156, 1122, 229, 536, 1879, 318, 2..."
2,"[767, 273, 151, 279, 534, 30, 33, 545, 546, 29...","[33, 587, 184, 183, 273, 546, 359, 545, 586, 3..."
3,"[1, 1008, 1489, 9, 10, 522, 16, 529, 25, 534, ...","[1339, 546, 352, 1063, 1868, 1655, 840, 1489, ..."
7,"[262, 18, 526, 20, 148, 151, 405, 534, 285, 16...","[160, 292, 580, 503, 206, 582, 148, 314, 344, ..."
8,"[5904, 4122, 257, 10703, 11932, 2867, 891, 522...","[9337, 7236, 9424, 9431, 4122, 46, 10703, 31, ..."
...,...,...
114,"[22467, 24703, 13380, 24705, 21481, 21663, 174...","[3479, 7872, 32708, 15529, 20628, 8270, 14503,..."
115,"[0, 1, 2, 1964, 5, 1965, 6044, 6045, 14, 15, 1...","[5310, 2514, 6259, 4904, 512, 1070, 359, 345, ..."
116,"[2480, 18, 2489, 31, 43, 49, 6585, 1053, 1062,...","[3601, 1834, 1583, 2620, 2480, 49, 2451, 8084,..."
118,"[11732, 0, 10003, 5, 1489, 9, 2480, 1013, 1070...","[2305, 1237, 1154, 1175, 6684, 5568, 2766, 243..."


In [21]:
df_LOOCV.info()
df_LOOCV.head(100)

<class 'pandas.core.frame.DataFrame'>
Index: 157023 entries, 59 to 30594056
Data columns (total 2 columns):
 #   Column           Non-Null Count   Dtype
---  ------           --------------   -----
 0   userId           157023 non-null  int64
 1   holdout_movieId  157023 non-null  int64
dtypes: int64(2)
memory usage: 3.6 MB


Unnamed: 0,userId,holdout_movieId
59,1,2222
151,2,515
282,3,351
354,7,536
414,8,840
...,...,...
16727,114,2480
17244,115,1727
17270,116,3524
17364,118,4873


In [22]:
df_movies.info()
df_movies.head(100)

<class 'pandas.core.frame.DataFrame'>
Index: 84133 entries, 0 to 84132
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   runtime              84133 non-null  float64
 1   if_blockbuster       84133 non-null  int32  
 2   highly_watched       84133 non-null  int32  
 3   highly_rated         84133 non-null  int64  
 4   engagement_score     84133 non-null  float64
 5   cast_importance      84133 non-null  float64
 6   director_score       84133 non-null  float64
 7   has_keywords         84133 non-null  int64  
 8   has_cast             84133 non-null  int64  
 9   has_director         84133 non-null  int64  
 10  genre_ids            84133 non-null  object 
 11  decade_[1890, 1900)  84133 non-null  bool   
 12  decade_[1900, 1910)  84133 non-null  bool   
 13  decade_[1910, 1920)  84133 non-null  bool   
 14  decade_[1920, 1930)  84133 non-null  bool   
 15  decade_[1930, 1940)  84133 non-null  bool

Unnamed: 0_level_0,runtime,if_blockbuster,highly_watched,highly_rated,engagement_score,cast_importance,director_score,has_keywords,has_cast,has_director,...,"decade_[1960, 1970)","decade_[1970, 1980)","decade_[1980, 1990)","decade_[1990, 2000)","decade_[2000, 2010)","decade_[2010, 2020)","decade_[2020, 2030)",text_embedded,actor_ids,director_ids
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.858910,1,1,1,3.103444,2.025482,1.997245,1,1,1,...,False,False,False,True,False,False,False,"[0.06548511, 0.16055259, 0.020576902, -0.06513...","[10748, 10645, 2688, 11192, 702]",[2399]
1,0.621099,1,1,1,2.211625,2.002399,1.904255,1,1,1,...,False,False,False,True,False,False,False,"[0.029812882, 0.17041773, -0.0033990666, -0.01...","[9365, 6213, 1282, 8457]",[2307]
2,0.496390,0,1,0,1.348532,1.117447,1.472204,1,1,1,...,False,False,False,True,False,False,False,"[0.06587183, 0.14877164, 0.0310863, -0.0848565...","[6110, 2354, 4464, 10134, 11212]",[1898]
3,1.471555,0,1,0,1.136677,1.481019,0.861536,1,1,1,...,False,False,False,True,False,False,False,"[0.020277813, 0.116846524, -0.031359904, -0.06...","[2537, 583, 7940, 6713]",[1439]
4,0.721714,0,1,0,1.470989,1.896445,1.355849,1,1,1,...,False,False,False,True,False,False,False,"[0.002441175, 0.1875583, 0.0040396256, -0.0971...","[10259, 958, 7253, 2590, 6128]",[734]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.320290,0,1,1,1.923948,1.107756,1.292021,1,1,1,...,False,False,False,True,False,False,False,"[-0.026540253, 0.10090012, 0.025940355, -0.014...","[11086, 9792]",[3212]
96,0.673702,0,0,0,0.463934,1.459025,2.075617,1,1,1,...,False,False,False,True,False,False,False,"[0.0429306, 0.09379077, 0.059965726, -0.056168...","[5613, 9822, 5486, 8920, 9838]",[3749]
97,0.721714,0,0,0,-0.281158,-0.361217,0.072925,1,1,1,...,False,False,False,True,False,False,False,"[0.00065221224, 0.09104641, -0.036274366, 0.02...",[2136],[3528]
98,0.945616,0,1,0,1.242669,1.764055,1.346386,1,1,1,...,False,False,False,True,False,False,False,"[-0.0010480286, 0.13111763, 0.09517799, -0.003...","[167, 5302, 1389, 2436, 6640]",[1801]


In [23]:
'''
Do szybkich testow z mniejsza iloscia danych
'''
DEBUG = False

if DEBUG:
    sampled_users = df_users.sample(n=10000, random_state=213).copy()

    mask = df_ratings.index.isin(sampled_users['userId'])
    sampled_ratings = df_ratings[mask].copy()

    mask_loocv = df_LOOCV['userId'].isin(sampled_users['userId'])
    sampled_loocv = df_LOOCV[mask_loocv].copy()

    # used_movie_ids = set(sampled_users['movies_seq'].explode()) \
    #                | set(sampled_ratings['pos'].explode()) \
    #                | set(sampled_ratings['seen'].explode()) \
    #                | set(sampled_loocv['holdout_movieId'])
    # sampled_movies = df_movies[df_movies.index.isin(used_movie_ids)].copy()

    df_users = sampled_users
    df_ratings = sampled_ratings
    df_LOOCV = sampled_loocv
    # df_movies = sampled_movies

In [24]:
df_movies.info()
df_ratings.info()
df_users.info()
df_LOOCV.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84133 entries, 0 to 84132
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   runtime              84133 non-null  float64
 1   if_blockbuster       84133 non-null  int32  
 2   highly_watched       84133 non-null  int32  
 3   highly_rated         84133 non-null  int64  
 4   engagement_score     84133 non-null  float64
 5   cast_importance      84133 non-null  float64
 6   director_score       84133 non-null  float64
 7   has_keywords         84133 non-null  int64  
 8   has_cast             84133 non-null  int64  
 9   has_director         84133 non-null  int64  
 10  genre_ids            84133 non-null  object 
 11  decade_[1890, 1900)  84133 non-null  bool   
 12  decade_[1900, 1910)  84133 non-null  bool   
 13  decade_[1910, 1920)  84133 non-null  bool   
 14  decade_[1920, 1930)  84133 non-null  bool   
 15  decade_[1930, 1940)  84133 non-null  bool

# Przygotowanie danych do uczenia

## Wstępne przetwarzanie danych
Przygotowanie globalnych statystyk i funkcji zbierających cechy (collectorów)

In [25]:
'''
Globalny max_len
'''
max_len_a = int(df_movies['actor_ids'].str.len().max())
max_len_d = int(df_movies['director_ids'].str.len().max())
max_len_g = int(df_movies['genre_ids'].str.len().max())

In [26]:
'''
Dla nn.Embeedings -> Item Tower
'''
all_actor_ids = list(chain.from_iterable(df_movies['actor_ids']))
num_actors = max(all_actor_ids) + 1

all_director_ids = list(chain.from_iterable(df_movies['director_ids']))
num_directors = max(all_director_ids) + 1

all_genre_ids = list(chain.from_iterable(df_movies['genre_ids']))
num_genres = max(all_genre_ids) + 1

print(num_actors, num_directors, num_genres)

11606 5240 20


In [27]:
def collect_user_features(u):
    """
    Convert one user row into the tensors consumed by the user tower.

    Args:
        u: pandas Series holding 'movies_seq', 'ratings_seq', 'ts_seq' and the
           per-user statistic columns (names starting with 'num_rating',
           'avg_rating', 'weekend_watcher', 'genre_' or 'type_of_viewer_').

    Returns:
        Tuple (movies_seq, ratings_seq, ts_seq, user_stats): one long tensor
        followed by three float32 tensors.
    """
    stat_prefixes = ('num_rating', 'avg_rating', 'weekend_watcher', 'genre_', 'type_of_viewer_')
    stat_names = [name for name in u.index if name.startswith(stat_prefixes)]

    sequences = (
        torch.tensor(u['movies_seq'], dtype=torch.long),
        torch.tensor(u['ratings_seq'], dtype=torch.float32),
        torch.tensor(u['ts_seq'], dtype=torch.float32),
    )
    # The statistic columns are cast to float32 before the tensor is built.
    stat_values = u[stat_names].astype('float32').values
    return (*sequences, torch.tensor(stat_values, dtype=torch.float32))

In [28]:
def collect_movie_features(m, max_len_a, max_len_d, max_len_g):
    """
    Turn one movie row into the five tensors consumed by the item tower.

    Args:
        m: pandas Series with the numeric / binary feature columns, the
           'decade_*' flags, 'text_embedded', 'actor_ids', 'director_ids'
           and 'genre_ids'.
        max_len_a: fixed output length of the actor id tensor.
        max_len_d: fixed output length of the director id tensor.
        max_len_g: fixed output length of the genre id tensor.

    Returns:
        Tuple (dense_feats, text_emb, actor_ids, director_ids, genre_ids).
        dense_feats and text_emb are float32; the id tensors are long, truncated
        or right-padded with 0 to their fixed length.
    """
    # Order matters: 4 numeric columns, 6 binary flags, then the decade flags.
    scalar_columns = (
        'runtime', 'engagement_score', 'cast_importance', 'director_score',
        'if_blockbuster', 'highly_watched', 'highly_rated',
        'has_keywords', 'has_cast', 'has_director',
    )
    scalars = [m[name] for name in scalar_columns]

    decade_names = [name for name in m.index if name.startswith('decade_')]
    decade_flags = m[decade_names].astype(int).tolist()

    def to_fixed_length(ids, length):
        # Cut to `length`, then right-pad with zeros when the sequence is shorter.
        ids = ids if isinstance(ids, list) else list(ids)
        fixed = ids[:length]
        fixed.extend([0] * (length - len(ids)))
        return torch.tensor(fixed, dtype=torch.long)

    dense_feats = torch.tensor(scalars + decade_flags, dtype=torch.float32)
    text_emb = torch.tensor(m['text_embedded'], dtype=torch.float32)

    return (
        dense_feats,
        text_emb,
        to_fixed_length(m['actor_ids'], max_len_a),
        to_fixed_length(m['director_ids'], max_len_d),
        to_fixed_length(m['genre_ids'], max_len_g),
    )

## Przygotowanie negatywów i hard negatywów z FAISS
Wstępne obliczenia (pre-compute): przygotowanie pul negatywów dla użytkowników

In [29]:
def build_faiss_index_for_movies(df_movies):
    '''
    Build the initial FAISS inner-product index over raw movie feature vectors,
    used for nearest-neighbour search.

    Each movie vector is the concatenation of its dense features and its text
    embedding, L2-normalised so that inner product equals cosine similarity.
    Movies whose features cannot be collected are reported and left out.

    Returns:
        (faiss_index, movie_matrix_np, local_to_movie, movie_to_local), where
        the two dicts translate between row positions in the index and movie ids.
    '''
    movie_vecs, movie_ids = [], []

    for i, m_id in enumerate(df_movies.index):
        try:
            dense_feats, text_emb, *_ = collect_movie_features(
                df_movies.loc[m_id], max_len_a, max_len_d, max_len_g
            )
            # L2-normalise so the inner-product index behaves like cosine similarity.
            unit_vec = F.normalize(torch.cat([dense_feats, text_emb], dim=0), dim=0)
            movie_vecs.append(unit_vec)
            movie_ids.append(m_id)

            if (i + 1) % 10000 == 0:
                print(f" - Przetworzono {i + 1}/{len(df_movies)} filmów")
        except Exception as e:
            print(f" Blad przy przetwarzaniu filmu {m_id}: {e}")
            continue

    # Matrix of shape [n_movies, D] in the float32 layout FAISS expects.
    movie_matrix_np = torch.stack(movie_vecs).cpu().numpy().astype('float32')
    print(f"Macierz filmow: {movie_matrix_np.shape}")

    vector_dim = movie_matrix_np.shape[1]
    faiss_index = faiss.IndexFlatIP(vector_dim)
    faiss_index.add(movie_matrix_np)

    # Row position in the index <-> movie id (they differ whenever a movie was skipped).
    local_to_movie = dict(enumerate(movie_ids))
    movie_to_local = {movie_id: position for position, movie_id in local_to_movie.items()}

    print(f" - Liczba filmów: {faiss_index.ntotal:,}")
    print(f" - Wymiar wektora: {movie_matrix_np.shape[1]}")
    print(f" - Typ index: IndexFlatIP (cosine similarity)")

    return faiss_index, movie_matrix_np, local_to_movie, movie_to_local

initial_faiss_index, initial_movie_matrix_np, initial_local_to_movie, initial_movie_to_local = build_faiss_index_for_movies(df_movies)

 - Przetworzono 10000/84133 filmów
 - Przetworzono 20000/84133 filmów
 - Przetworzono 30000/84133 filmów
 - Przetworzono 40000/84133 filmów
 - Przetworzono 50000/84133 filmów
 - Przetworzono 60000/84133 filmów
 - Przetworzono 70000/84133 filmów
 - Przetworzono 80000/84133 filmów
Macierz filmow: (84133, 324)
 - Liczba filmów: 84,133
 - Wymiar wektora: 324
 - Typ index: IndexFlatIP (cosine similarity)


In [30]:
class MovieSegmentation:
    """
    Splits the movie catalogue into (overlapping) popularity segments.

    After construction `segments` maps a segment name to the list of movie
    index labels belonging to it; a short summary is printed as well.
    """

    def __init__(self, df_movies):
        self.df_movies = df_movies
        self.segments = self._create_segments()
        self._print_segment_stats()

    def _create_segments(self):
        """Return {segment name: list of index labels} built from boolean masks."""
        df = self.df_movies
        engagement = df['engagement_score']

        # Flag-based segments may overlap with the engagement-based ones.
        masks = {
            'blockbuster': df['if_blockbuster'] == 1,
            'highly_watched': df['highly_watched'] == 1,
            'highly_rated': df['highly_rated'] == 1,
            'mainstream': engagement >= 0.75,
            'niche': (engagement > 0) & (engagement < 0.75),
            'obscure': engagement <= 0,
        }
        return {name: df.loc[mask].index.tolist() for name, mask in masks.items()}

    def _print_segment_stats(self):
        """Print the size of every segment and its share of the whole catalogue."""
        print("--- STATYSTYKI FILMOW (Overlaps) ---")
        total_movies = len(self.df_movies)
        for segment, movies in self.segments.items():
            percentage = (len(movies) / total_movies) * 100
            print(f"{segment.upper():>15}: {len(movies):>6,} ({percentage:>5.1f}%)")

In [31]:
class NegativeSampler:
    """
    Draws negative movie ids for one (user, positive movie) training pair.

    * Regular users get negatives stratified over popularity segments
      (`_sample_prep_negatives`).
    * Heavy users (top 10% by number of seen movies) get a share of FAISS
      nearest neighbours of the positive plus random fill
      (`_sample_hard_negatives`).

    NOTE: the class reads notebook-level globals: `df_movies` and
    `MovieSegmentation` in `__init__`, and `faiss_index`, `movie_matrix_np`
    and `movie_to_local` whenever hard negatives are sampled.
    """

    def __init__(self, df_ratings, n_items):
        """
        Args:
            df_ratings: frame indexed by userId with a 'seen' column of movie-id lists.
            n_items: number of items in the catalogue (stored for reference).
        """
        self.df_ratings = df_ratings
        self.n_items = n_items
        self.all_movie_ids = df_movies.index.to_numpy()

        movie_segmentation = MovieSegmentation(df_movies)
        self.segment_pools = {
            key: np.array(val, dtype=np.int32)
            for key, val in movie_segmentation.segments.items()
        }
        print("Segment pools created.")

        # Share of the k negatives that is requested from every segment.
        self.regular_user_recipe_pct = {
            'mainstream': 0.40,
            'highly_rated': 0.20,
            'niche': 0.20,
            'blockbuster': 0.10,
            'obscure': 0.10
        }
        assert math.isclose(sum(self.regular_user_recipe_pct.values()), 1.0), "Recipe percentages must sum to 1.0"

        # Heavy users = the top decile by number of seen movies.
        interaction_counts = self.df_ratings['seen'].str.len()
        heavy_user_threshold = interaction_counts.quantile(0.90)
        self.heavy_users = set(interaction_counts[interaction_counts >= heavy_user_threshold].index)
        print(f"Identified {len(self.heavy_users):,} heavy users (>= {int(heavy_user_threshold)} interactions).")

        print("Setup completed.")

    def _remaining_pool(self, excluded):
        """Return the catalogue ids that do not occur in `excluded` (duplicates allowed)."""
        keep = np.isin(self.all_movie_ids, excluded, invert=True)
        return self.all_movie_ids[keep]

    def _local_to_movie_ids(self, local_indices):
        """Translate FAISS row positions into movie ids, dropping -1 / unknown labels."""
        cached = getattr(self, '_local_lookup', None)
        # Rebuild the inverse map when the global mapping was replaced or resized,
        # e.g. after the FAISS index has been rebuilt.
        if cached is None or cached[0] is not movie_to_local or len(cached[1]) != len(movie_to_local):
            inverse = {local: movie for movie, local in movie_to_local.items()}
            cached = (movie_to_local, inverse)
            self._local_lookup = cached
        inverse = cached[1]
        return np.array(
            [inverse[i] for i in np.asarray(local_indices).tolist() if i in inverse],
            dtype=np.int64,
        )

    def _sample_prep_negatives(self, user_seen_array, k):
        """
        Stratified sampling for regular users.

        Every segment contributes round(k * share) unseen movies; when the
        segments yield fewer than k distinct ids, the remainder is filled with
        random unseen movies from the whole catalogue.

        Args:
            user_seen_array: movie ids that must not be returned.
            k: number of negatives requested.

        Returns:
            List of at most k distinct movie ids.
        """
        negatives = set()

        for segment, percentage in self.regular_user_recipe_pct.items():
            pool = self.segment_pools[segment]
            if len(pool) == 0:
                continue

            num_samples = int(round(k * percentage))
            if num_samples == 0:
                continue

            # Oversample 5x so that enough candidates survive the seen filter.
            candidate_size = min(num_samples * 5, len(pool))
            candidates = np.random.choice(pool, size=candidate_size, replace=False)

            unseen = candidates[np.isin(candidates, user_seen_array, invert=True)]
            negatives.update(unseen[:num_samples].tolist())

        current_k = len(negatives)
        if current_k < k:
            needed = k - current_k
            chosen = np.fromiter(negatives, dtype=np.int64, count=current_k)
            seen_and_chosen = np.concatenate((np.asarray(user_seen_array, dtype=np.int64), chosen))
            fill_pool = self._remaining_pool(seen_and_chosen)

            if len(fill_pool) > 0:
                extra = np.random.choice(fill_pool, size=min(needed, len(fill_pool)), replace=False)
                negatives.update(extra.tolist())

        return list(negatives)[:k]

    def _sample_hard_negatives(self, pos_id, user_seen_array, k, top_k=200, hard_frac=0.5):
        """
        Sampling for heavy users: FAISS neighbours of the positive plus random fill.

        Args:
            pos_id: id of the positive movie; it is never returned.
            user_seen_array: movie ids that must not be returned.
            k: number of negatives requested.
            top_k: number of nearest neighbours requested from FAISS.
            hard_frac: share of k taken from the neighbours, k_h = int(k * hard_frac).

        Returns:
            List of at most k distinct movie ids (hard negatives first).
        """
        k_h = int(k * hard_frac)          # number of hard negatives

        hard_negs = np.array([], dtype=np.int64)

        if k_h > 0 and pos_id in movie_to_local:
            try:
                local_pos = movie_to_local[pos_id]

                _, I = faiss_index.search(movie_matrix_np[local_pos].reshape(1, -1), top_k)

                # FAISS answers with row positions of the index (and -1 padding),
                # so translate them back to movie ids before any filtering.
                neighbour_ids = self._local_to_movie_ids(I[0])

                # The positive is its own nearest neighbour; drop it together with seen movies.
                keep = np.isin(neighbour_ids, user_seen_array, invert=True) & (neighbour_ids != pos_id)
                hard_cands = neighbour_ids[keep]

                if len(hard_cands) > 0:
                    num_to_sample = min(k_h, len(hard_cands))
                    hard_negs = np.random.choice(hard_cands, size=num_to_sample, replace=False)

            except Exception as e:
                print(f"Blad hard negative sampling dla filmu {pos_id}: {e}")

        needed_random = k - len(hard_negs)
        if needed_random > 0:
            seen_and_hard = np.concatenate((
                np.asarray(user_seen_array, dtype=np.int64),
                hard_negs.astype(np.int64),
                np.array([pos_id], dtype=np.int64),
            ))
            random_pool = self._remaining_pool(seen_and_hard)

            if len(random_pool) > 0:
                num_to_sample = min(needed_random, len(random_pool))
                random_negs = np.random.choice(random_pool, size=num_to_sample, replace=False)
                return np.concatenate((hard_negs, random_negs)).tolist()

        return hard_negs.tolist()

    def sample(self, user_id, pos_id, k):
        """
        Pick the sampling strategy for the user; this is the only public entry point.

        Args:
            user_id: index label of the user in `df_ratings`.
            pos_id: id of the positive movie of the training pair.
            k: number of negatives requested.

        Returns:
            List of negative movie ids that contains neither a seen movie nor `pos_id`.
        """
        user_seen_array = np.asarray(self.df_ratings.loc[user_id, 'seen'], dtype=np.int64)
        # Treat the positive as seen so that neither strategy can return it.
        if pos_id is not None:
            user_seen_array = np.append(user_seen_array, np.int64(pos_id))

        if user_id in self.heavy_users:
            return self._sample_hard_negatives(pos_id, user_seen_array, k)
        else:
            return self._sample_prep_negatives(user_seen_array, k)

## Datasety + collate

In [32]:
class MovieDataset(Dataset):
    '''
    Map-style dataset over the movie catalogue; needed to build the item
    matrix for the LOOCV evaluation.

    Item `idx` is the feature tuple of the movie at row position `idx`.
    '''

    def __init__(self, df_movies):
        self.df = df_movies

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        # Positional lookup; padding lengths come from the notebook-level globals.
        return collect_movie_features(self.df.iloc[idx], max_len_a, max_len_d, max_len_g)

In [33]:
class TwoTowerDataset(Dataset):
    """
    Training dataset for the two-tower model.

    One sample per user: the user's features, one randomly chosen positive
    movie and `k_negatives` sampled negative movies.
    """

    def __init__(self, df_users, df_ratings, df_movies, k_negatives=50):
        self.k_negatives = k_negatives
        self.df_movies = df_movies
        self.df_ratings = df_ratings
        # __getitem__ addresses users by position, hence the fresh 0..N-1 index.
        self.df_users = df_users.reset_index(drop=True)

        # Padding lengths are taken from the notebook-level globals.
        self.max_len_a, self.max_len_d, self.max_len_g = max_len_a, max_len_d, max_len_g

        self.negative_sampler = NegativeSampler(
            df_ratings=df_ratings,
            n_items=len(df_movies),
        )

    def __len__(self):
        return len(self.df_users)

    def _movie_tensors(self, movie_id):
        """Return the five feature tensors of the movie with index label `movie_id`."""
        return collect_movie_features(
            self.df_movies.loc[movie_id], self.max_len_a, self.max_len_d, self.max_len_g
        )

    def __getitem__(self, idx):
        item_keys = ('dense_features', 'text_embedding', 'actor_ids', 'director_ids', 'genre_ids')

        # --- user side ---
        u_row = self.df_users.iloc[idx]
        movies_seq, ratings_seq, ts_seq, user_stats = collect_user_features(u_row)
        user_id = u_row['userId']

        pos_list = self.df_ratings.loc[user_id]['pos']
        if not pos_list:
            raise ValueError(f"Użytkownik {user_id} nie ma pozytywnych ratingów!")

        # --- one random positive and its negatives ---
        pos_id = random.choice(pos_list)
        neg_ids = self.negative_sampler.sample(user_id, pos_id, self.k_negatives)

        # Sanity checks on what the sampler returned.
        assert pos_id not in neg_ids,                       f"Wylosowałeś negatyw równy pozytywowi {user_id}!"
        assert len(neg_ids) == self.k_negatives,            f"Zła liczba negatywów {len(neg_ids)} != {self.k_negatives}"

        # --- item side ---
        pos_item = dict(zip(item_keys, self._movie_tensors(pos_id)))

        # One growing list per feature kind; each is stacked into a [k, ...] tensor below.
        neg_columns = [[] for _ in item_keys]
        for nid in neg_ids:
            for column, tensor in zip(neg_columns, self._movie_tensors(nid)):
                column.append(tensor)

        user = {
            'user_statistics': user_stats,
            'movies': movies_seq,
            'ratings': ratings_seq,
            'times': ts_seq,
        }
        neg_item = {key: torch.stack(column) for key, column in zip(item_keys, neg_columns)}

        return {'user': user, 'pos_item': pos_item, 'neg_item': neg_item}

In [34]:
'''
TEST DATASETU I ODPOWIEDNIEGO OUTPUTU POJEDYNCZEGO OBIEKTU GET_ITEM
'''
dataset_test = TwoTowerDataset(df_users, df_ratings, df_movies)

sample0 = dataset_test[0]

print("Keys:", sample0.keys())
print("\n--- USER ---")
for k,v in sample0['user'].items():
    print(f" user[{k}]:", type(v), getattr(v, "shape", v[:5] if isinstance(v,list) else v))

print("\n--- POS ITEM ---")
for k,v in sample0['pos_item'].items():
    print(f" pos_item[{k}]:", type(v), v.shape if hasattr(v,'shape') else v[:5])

print("\n--- NEG ITEM ---")
for k,v in sample0['neg_item'].items():
    print(f" neg_item[{k}]:", type(v), v.shape if hasattr(v,'shape') else v[:5])

--- STATYSTYKI FILMOW (Overlaps) ---
    BLOCKBUSTER:  2,280 (  2.7%)
 HIGHLY_WATCHED:  7,568 (  9.0%)
   HIGHLY_RATED:  8,261 (  9.8%)
     MAINSTREAM: 19,383 ( 23.0%)
          NICHE: 22,589 ( 26.8%)
        OBSCURE: 42,161 ( 50.1%)
Segment pools created.
Identified 15,711 heavy users (>= 430 interactions).
Setup completed.
Keys: dict_keys(['user', 'pos_item', 'neg_item'])

--- USER ---
 user[user_statistics]: <class 'torch.Tensor'> torch.Size([25])
 user[movies]: <class 'torch.Tensor'> torch.Size([19])
 user[ratings]: <class 'torch.Tensor'> torch.Size([19])
 user[times]: <class 'torch.Tensor'> torch.Size([19])

--- POS ITEM ---
 pos_item[dense_features]: <class 'torch.Tensor'> torch.Size([24])
 pos_item[text_embedding]: <class 'torch.Tensor'> torch.Size([300])
 pos_item[actor_ids]: <class 'torch.Tensor'> torch.Size([5])
 pos_item[director_ids]: <class 'torch.Tensor'> torch.Size([3])
 pos_item[genre_ids]: <class 'torch.Tensor'> torch.Size([9])

--- NEG ITEM ---
 neg_item[dense_featur

In [35]:
def collate_TT(batch):
    '''
    Collate dataset rows into one training batch.

    Every row is a dict with the sections 'user', 'pos_item' and 'neg_item',
    each mapping field names to tensors. For every field the per-row tensors
    are combined with torch.stack, which adds a leading batch dimension.

    Returns a dict with the same three sections; field order inside each
    section is fixed by the tuples below.
    '''
    rows = list(batch)

    user_fields = ('user_statistics', 'movies', 'ratings', 'times')
    item_fields = ('dense_features', 'text_embedding',
                   'actor_ids', 'director_ids', 'genre_ids')

    def stack_section(section, fields):
        # One stacked tensor per field, e.g. [B, ...] for a row tensor [...]
        return {
            field: torch.stack([row[section][field] for row in rows])
            for field in fields
        }

    return {
      'user': stack_section('user', user_fields),
      'pos_item': stack_section('pos_item', item_fields),
      'neg_item': stack_section('neg_item', item_fields),
    }

# Przygotowanie zbiorów do treningu

In [36]:
'''
Rebind the initial FAISS index and its companion structures to the working
names used from here on.
'''
# Plain assignments: each working name refers to the same object as its
# `initial_*` counterpart (no copy is made).
faiss_index = initial_faiss_index
movie_matrix_np = initial_movie_matrix_np
local_to_movie = initial_local_to_movie
movie_to_local = initial_movie_to_local

In [37]:
'''
Batch size; choose it according to the available GPU memory.
'''
# Shared by the training and validation DataLoaders.
BATCH_SIZE = 2048

In [38]:
'''
Build the training dataset and its DataLoader.
'''
# Constructing the dataset prints the movie-segment statistics seen in the
# cell output.
train_dataset = TwoTowerDataset(
    df_users,
    df_ratings,
    df_movies
)

# Rows are shuffled every epoch and merged into batches by collate_TT;
# the last, possibly smaller, batch is kept (drop_last=False).
train_loader = DataLoader(
    dataset       = train_dataset,
    batch_size    = BATCH_SIZE,
    shuffle       = True,
    # num_workers   = 2,
    pin_memory    = True,
    collate_fn    = collate_TT,
    drop_last     = False
)

--- STATYSTYKI FILMOW (Overlaps) ---
    BLOCKBUSTER:  2,280 (  2.7%)
 HIGHLY_WATCHED:  7,568 (  9.0%)
   HIGHLY_RATED:  8,261 (  9.8%)
     MAINSTREAM: 19,383 ( 23.0%)
          NICHE: 22,589 ( 26.8%)
        OBSCURE: 42,161 ( 50.1%)
Segment pools created.
Identified 15,711 heavy users (>= 430 interactions).
Setup completed.


In [39]:
'''
Build the evaluation data (leave-one-out).
'''
# User ids in df_LOOCV row order; heavy_evaluate() pairs them positionally
# with the rows produced by val_loader.
# NOTE(review): this assumes TwoTowerDataset yields users in the same order
# as df_LOOCV - only the lengths are asserted later. TODO confirm.
val_user_ids = df_LOOCV['userId'].tolist()

# Same source frames as the training dataset, built with k_negatives=25.
val_dataset = TwoTowerDataset(
    df_users,
    df_ratings,
    df_movies,
    k_negatives=25
)

# shuffle=False keeps the dataset order, which the positional pairing above
# relies on.
val_loader = DataLoader(
    dataset     = val_dataset,
    batch_size  = BATCH_SIZE,
    shuffle     = False,
    pin_memory  = True,
    collate_fn  = collate_TT,
    drop_last   = False
)

# userId -> one-element list with the held-out movie, mapped through
# movie_to_local.
test_pos_loocv = {
    u: [movie_to_local[mid]]
    for u, mid in df_LOOCV.set_index('userId')['holdout_movieId'].items()
}

# Key -> set of positive movies, mapped through movie_to_local.
# NOTE(review): the keys are the index labels of df_ratings; presumably that
# index is the userId - verify, since heavy_evaluate() looks users up by id.
train_pos_sets = {
    u: {movie_to_local[mid] for mid in pos_list}
    for u, pos_list in df_ratings['pos'].items()
}

print(f"Przygotowano dane do LOOCV:")
print(f"Użytkowników z holdout: {len(test_pos_loocv):,}")
print(f"Użytkowników z pozytywami: {len(train_pos_sets):,}")

--- STATYSTYKI FILMOW (Overlaps) ---
    BLOCKBUSTER:  2,280 (  2.7%)
 HIGHLY_WATCHED:  7,568 (  9.0%)
   HIGHLY_RATED:  8,261 (  9.8%)
     MAINSTREAM: 19,383 ( 23.0%)
          NICHE: 22,589 ( 26.8%)
        OBSCURE: 42,161 ( 50.1%)
Segment pools created.
Identified 15,711 heavy users (>= 430 interactions).
Setup completed.
Przygotowano dane do LOOCV:
Użytkowników z holdout: 157,023
Użytkowników z pozytywami: 157,023


In [40]:
'''
Loader used to read all movies and compute their item embeddings.
'''
# Field name for each position of a MovieDataset sample (sample[0] .. sample[4]).
_MOVIE_FIELDS = (
    'dense_features',
    'text_embedding',
    'actor_ids',
    'director_ids',
    'genre_ids',
)


def _collate_movies(batch):
    '''Stack MovieDataset samples field by field under the 'pos_item' key.'''
    return {
        'pos_item': {
            field: torch.stack([sample[position] for sample in batch])
            for position, field in enumerate(_MOVIE_FIELDS)
        }
    }


movie_loader = DataLoader(
    MovieDataset(df_movies),
    batch_size=8192,
    collate_fn=_collate_movies
)

In [41]:
'''
Sanity check of the custom collate_TT function and of the DataLoaders.
'''
# Choose the device from what is actually available instead of hard-coding
# CUDA (the recorded run of this notebook uses MPS).
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
dataset_test = TwoTowerDataset(df_users, df_ratings, df_movies)

# Tiny batch so the printed shapes are easy to read.
loader_test_full = DataLoader(
    dataset_test,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_TT,
)

batch_test = next(iter(loader_test_full))

print("=== USER ===")
for k,v in batch_test['user'].items():
    print(f"{k:10s}:", v.shape)

print("\n=== POS ITEM ===")
for k,v in batch_test['pos_item'].items():
    print(f"{k:15s}:", v.shape)

print("\n=== NEG ITEM ===")
for k,v in batch_test['neg_item'].items():
    print(f"{k:15s}:", v.shape)

--- STATYSTYKI FILMOW (Overlaps) ---
    BLOCKBUSTER:  2,280 (  2.7%)
 HIGHLY_WATCHED:  7,568 (  9.0%)
   HIGHLY_RATED:  8,261 (  9.8%)
     MAINSTREAM: 19,383 ( 23.0%)
          NICHE: 22,589 ( 26.8%)
        OBSCURE: 42,161 ( 50.1%)
Segment pools created.
Identified 15,711 heavy users (>= 430 interactions).
Setup completed.
=== USER ===
user_statistics: torch.Size([4, 25])
movies    : torch.Size([4, 19])
ratings   : torch.Size([4, 19])
times     : torch.Size([4, 19])

=== POS ITEM ===
dense_features : torch.Size([4, 24])
text_embedding : torch.Size([4, 300])
actor_ids      : torch.Size([4, 5])
director_ids   : torch.Size([4, 3])
genre_ids      : torch.Size([4, 9])

=== NEG ITEM ===
dense_features : torch.Size([4, 50, 24])
text_embedding : torch.Size([4, 50, 300])
actor_ids      : torch.Size([4, 50, 5])
director_ids   : torch.Size([4, 50, 3])
genre_ids      : torch.Size([4, 50, 9])


In [42]:
# Sanity check: tensor shapes of one full-size batch from the training loader.
batch_test_3 = next(iter(train_loader))

print("=== USER ===")
user_part = batch_test_3['user']
for k in user_part:
    v = user_part[k]
    print(f"{k:10s}:", v.shape)

print("\n=== POS ITEM ===")
pos_part = batch_test_3['pos_item']
for k in pos_part:
    v = pos_part[k]
    print(f"{k:15s}:", v.shape)

print("\n=== NEG ITEM ===")
neg_part = batch_test_3['neg_item']
for k in neg_part:
    v = neg_part[k]
    print(f"{k:15s}:", v.shape)

=== USER ===
user_statistics: torch.Size([2048, 25])
movies    : torch.Size([2048, 19])
ratings   : torch.Size([2048, 19])
times     : torch.Size([2048, 19])

=== POS ITEM ===
dense_features : torch.Size([2048, 24])
text_embedding : torch.Size([2048, 300])
actor_ids      : torch.Size([2048, 5])
director_ids   : torch.Size([2048, 3])
genre_ids      : torch.Size([2048, 9])

=== NEG ITEM ===
dense_features : torch.Size([2048, 50, 24])
text_embedding : torch.Size([2048, 50, 300])
actor_ids      : torch.Size([2048, 50, 5])
director_ids   : torch.Size([2048, 50, 3])
genre_ids      : torch.Size([2048, 50, 9])


# ARCHITEKTURA TWO TOWER

In [43]:
# Width of the user/item embeddings shared by both towers.
EMB_DIM = 64


class UserTower(nn.Module):
    '''
    User encoder: pools the embeddings of the movies in the user's history
    with learned weights, joins the result with the user statistics and maps
    it through an MLP to an L2-normalised embedding.
    '''

    def __init__(self, input_dim, n_items, embedding_dim=EMB_DIM):
        '''
        input_dim     - the number of columns in user features, without sequence columns
        n_items       - number of rows of the movie embedding table
        embedding_dim - width of the movie embeddings and of the output
        '''
        super().__init__()

        self.item_emb = nn.Embedding(n_items, embedding_dim)

        # Projects the (rating, timestamp) pair of each history entry to one
        # scalar, later squashed into a pooling weight
        self.rating_proj = nn.Linear(2, 1)

        # Linear -> ReLU -> Dropout for every hidden layer, then a plain
        # Linear head. Layers are created in the same order and end up at the
        # same Sequential positions (0, 3, 6, 9) as a hand-written stack.
        widths = [input_dim + embedding_dim, 512, 384, 256]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.2)])
        layers.append(nn.Linear(widths[-1], embedding_dim))
        self.mlp = nn.Sequential(*layers)

    def forward(self, batch):
        '''
        batch - dict with 'movies', 'ratings', 'times' (one entry per history
                position) and 'user_statistics'.
        Returns the L2-normalised user embeddings, [B, embedding_dim].
        '''
        # Embeddings of the movies in the user's history
        history = self.item_emb(batch['movies'])

        # One weight in (0, 1) per history position
        signals = torch.stack((batch['ratings'], batch['times']), dim=-1)  # [B, L_u, 2]
        weights = torch.sigmoid(self.rating_proj(signals))

        # Weighted mean-pool over the history axis; the clamp guards the division
        # NOTE(review): every position takes part in the pooling - if histories
        # are padded, padding entries contribute too. TODO confirm.
        weighted_sum = (history * weights).sum(1)
        weight_total = weights.sum(1).clamp_min(1e-6)
        pooled = weighted_sum / weight_total                                # [B, D]

        features = torch.cat((batch['user_statistics'], pooled), dim=-1)   # [B, stats+D]
        return F.normalize(self.mlp(features), dim = 1)                    # [B, D]


class ItemTower(nn.Module):
    '''
    Item (movie) encoder: combines pooled actor/director/genre embeddings
    with encoded dense features and the encoded text vector, and maps the
    result through an MLP to an L2-normalised embedding.
    '''

    def __init__(self,dense_feat_dim,text_emb_dim,vocab_sizes,embedding_dim=EMB_DIM):
        '''
        vocab_sizes    - indexable as (n_actors, n_directors, n_genres)
        dense_feat_dim - width of the dense feature vector (numeric+binary+decades+text)
        text_emb_dim   - width of the vector describing the movie's text data
        embedding_dim  - width of every intermediate and of the final embedding
        '''
        super().__init__()

        # Sub-modules keep their names and creation order
        # (actor, director, genre, meta, text, final).
        self.actor_emb = nn.Embedding(vocab_sizes[0], embedding_dim)
        self.director_emb = nn.Embedding(vocab_sizes[1], embedding_dim)
        self.genre_emb = nn.Embedding(vocab_sizes[2], embedding_dim)

        self.meta_mlp = nn.Sequential(
            nn.Linear(dense_feat_dim, 128), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(128, embedding_dim), nn.ReLU(),
        )

        # To consider: the 512 -> embedding_dim drop may be too sharp,
        # an intermediate 256-wide layer could help.
        self.text_mlp = nn.Sequential(
            nn.Linear(text_emb_dim, 512), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(512, embedding_dim), nn.ReLU(),
        )

        # Three pooled id embeddings + meta_mlp output + text_mlp output
        fused_dim = 5 * embedding_dim
        self.final_mlp = nn.Sequential(
            nn.Linear(fused_dim, 512), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(256, embedding_dim),
        )

    def _encode(self, dense_feats, text_emb, actor_ids, director_ids, genre_ids):
        '''Encode a flat batch of items (leading dimension = items) to [N, D].'''
        dense_vec = self.meta_mlp(dense_feats)      # [N, D]
        text_vec = self.text_mlp(text_emb)          # [N, D]

        # Columns 2 and 3 of the dense features scale the actor and director
        # vectors. NOTE(review): presumably the cast-importance and director
        # scores - confirm against the dataset's column order.
        cast_imp = dense_feats[:, 2:3]              # [N, 1]
        director_score = dense_feats[:, 3:4]        # [N, 1]

        # Mean over the id axis, then importance weighting.
        # To consider: max pooling or attention pooling instead of the mean.
        actors = self.actor_emb(actor_ids).mean(dim=1) * cast_imp
        directors = self.director_emb(director_ids).mean(dim=1) * director_score
        genres = self.genre_emb(genre_ids).mean(dim=1)

        fused = torch.cat([actors, directors, genres, dense_vec, text_vec], dim=-1)  # [N, 5D]
        return F.normalize(self.final_mlp(fused), dim=1)                             # [N, D]

    def forward(self, batch, key: str = "pos_item"):
        '''
        batch - dict whose entry `key` holds 'dense_features', 'text_embedding',
                'actor_ids', 'director_ids' and 'genre_ids'
        key   - which entry of `batch` to encode

        When the dense features are 3-D ([B, k, Z], k items per row) the items
        are flattened to [B*k, ...], encoded, and returned as [B, k, D];
        otherwise the result is [B, D].
        '''
        fields = batch[key]
        dense_feats = fields['dense_features']
        text_emb = fields['text_embedding']
        actor_ids = fields['actor_ids']
        director_ids = fields['director_ids']
        genre_ids = fields['genre_ids']

        if dense_feats.dim() != 3:
            return self._encode(dense_feats, text_emb, actor_ids, director_ids, genre_ids)

        B, k, Z = dense_feats.shape
        n_rows = B * k

        # Flatten the k items of every row into one long batch
        emb_flat = self._encode(
            dense_feats.view(n_rows, Z),
            text_emb.view(n_rows, -1),
            actor_ids.view(n_rows, -1),
            director_ids.view(n_rows, -1),
            genre_ids.view(n_rows, -1),
        )

        return emb_flat.view(B, k, -1)              # [B, k, D]


In [44]:
class TwoTowerModel(nn.Module):
    '''
    Two-tower model: a UserTower and an ItemTower that embed users and items
    into the same space.
    '''

    def __init__(self, stats_dim, n_items, vocab_sizes,
                 dense_feat_dim, text_emb_dim, embedding_dim=EMB_DIM):
        super().__init__()
        self.user_tower = UserTower(stats_dim, n_items, embedding_dim)
        self.item_tower = ItemTower(dense_feat_dim, text_emb_dim, vocab_sizes, embedding_dim)

    def forward(self, batch):
        '''
        Encode the user, the positive item and the negative item(s) of `batch`.

        Returns (user, positive, negative). With one negative per row all
        three are [B, D]. With k negatives per row the negatives are flattened
        to [B*k, D] and the user and positive embeddings are repeated k times
        per row so that all three line up row by row.
        '''
        user_vec = self.user_tower(batch['user'])
        pos_vec = self.item_tower(batch, key="pos_item")
        neg_vec = self.item_tower(batch, key="neg_item")

        # Single negative per row: nothing to flatten
        if neg_vec.dim() == 2:
            return user_vec, pos_vec, neg_vec

        B, k, D = neg_vec.shape

        def repeat_per_negative(x):
            # [B, D] -> [B, k, D] (broadcast view) -> [B*k, D]
            return x.unsqueeze(1).expand(B, k, D).reshape(B*k, D)

        neg_flat = neg_vec.reshape(B*k, D)
        user_flat = repeat_per_negative(user_vec)
        pos_flat = repeat_per_negative(pos_vec)

        return user_flat, pos_flat, neg_flat


In [45]:
# Pick the best available accelerator: CUDA first, then Apple MPS, else CPU.
# torch.backends.mps.is_available() is the documented MPS check; the
# torch.mps.is_available() shortcut is missing on older PyTorch builds and
# would raise AttributeError on machines without CUDA.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print('Device:', device)

Device: mps


In [46]:
def to_device(data, device):
    '''
    Move every tensor inside `data` to `device`.

    Dicts are rebuilt recursively with the same keys, tensors are moved with
    non_blocking=True, and any other value (including lists) is returned
    unchanged.
    '''
    if torch.is_tensor(data):
        return data.to(device, non_blocking=True)
    if isinstance(data, dict):
        return {name: to_device(value, device) for name, value in data.items()}
    return data

In [47]:
'''
Builds the item matrix used to score the leave-one-out ranking.
'''
def compute_item_embeddings(model, movie_loader):
    '''
    Embed every movie yielded by `movie_loader` with the model's item tower.

    The model is switched to eval mode and gradients are disabled. Batches
    are moved to the module-level `device`, encoded under the 'pos_item' key
    and concatenated in loader order.

    Returns a NumPy array of shape [n_movies, D].
    '''
    model.eval()
    with torch.no_grad():
        chunks = [
            model.item_tower(to_device(mb, device), key='pos_item')  # [batch_size, D]
            for mb in movie_loader
        ]
    stacked = torch.cat(chunks, dim=0)
    return stacked.cpu().numpy()  # [n_movies, D]

In [48]:
# Module-level switches read by heavy_evaluate(): when both are False the
# model's own scores are evaluated; otherwise the popularity or the random
# baseline is scored instead.
POPULARITY_BIAS_CHECK = False
RANDOM_CHECK = False

# The fixed sizes match the tensors printed by the dataset sanity checks:
# 25 user statistics, 24 dense item features and a 300-wide text vector.
model = TwoTowerModel(stats_dim=25,
                           n_items=n_items,
                           vocab_sizes=(num_actors, num_directors, num_genres),
                           dense_feat_dim=24,
                           text_emb_dim=300,
                           embedding_dim=EMB_DIM)

In [49]:
# Prepare the data for the random-recommendation baseline.
# One fixed, seeded score per item; heavy_evaluate() copies this same vector
# for every user when RANDOM_CHECK is set.
rng = np.random.default_rng(seed=42)
random_scores = rng.random(n_items)

print(random_scores[:10])

[0.77395605 0.43887844 0.85859792 0.69736803 0.09417735 0.97562235
 0.7611397  0.78606431 0.12811363 0.45038594]


In [50]:
'''
More detailed evaluation: how well does the model rank items for the given users.
'''
def heavy_evaluate(model,user_loader,item_embs_np,
                        train_pos_sets,test_pos,top_N):
    '''
    Leave-one-out ranking evaluation; returns (Recall@N, MRR@N, nDCG@N)
    averaged over users.

    model          - model whose user_tower embeds the users
    user_loader    - loader yielding batches with a 'user' section
    item_embs_np   - item embedding matrix, [n_items, D]
    train_pos_sets - user id -> item indices to exclude from the ranking
    test_pos       - user id -> held-out item indices
    top_N          - ranking cut-off

    The scorer is selected by the module-level flags POPULARITY_BIAS_CHECK and
    RANDOM_CHECK; users are taken from the module-level `val_user_ids`.
    NOTE(review): user embeddings are paired with `val_user_ids` by position -
    only the lengths are asserted, the order is assumed to match.
    '''
    model.eval()

    eval_user_ids = val_user_ids
    use_model_scores = not (POPULARITY_BIAS_CHECK or RANDOM_CHECK)

    user_embs = []
    if use_model_scores:
        with torch.no_grad():
            for raw in user_loader:
                on_device = to_device(raw, device)
                # Only the user embeddings are needed here
                user_embs.append(model.user_tower(on_device['user']).cpu().numpy())

        user_embs = np.vstack(user_embs)    # [U - number of users, D]

        assert len(eval_user_ids) == user_embs.shape[0]

    recalls, mrrs, ndcgs = [], [], []

    for row, user_id in enumerate(eval_user_ids):
        if POPULARITY_BIAS_CHECK:
            scores = popularity_scores.copy()
        elif RANDOM_CHECK:
            scores = random_scores.copy()
        else:
            # Dot product of every item embedding with this user's embedding, [I]
            scores = item_embs_np @ user_embs[row]

        # Push the items the user already has to the bottom of the ranking
        seen = np.zeros_like(scores, dtype=bool)
        seen[list(train_pos_sets[user_id])] = True
        scores[seen] = -1e9

        ranked = np.argsort(-scores)[:top_N]        # best-first top N
        true_set = set(test_pos[user_id])           # hold-out

        # 1-based positions in the top N that hit a held-out item
        hit_ranks = [rank for rank, item in enumerate(ranked, 1) if item in true_set]

        # Recall@K: 1 when any held-out item made the cut
        recalls.append(int(bool(hit_ranks)))

        # MRR@K: reciprocal rank of the first hit
        mrrs.append(1.0/hit_ranks[0] if hit_ranks else 0.0)

        # nDCG@K with binary relevance
        dcg = sum(1.0 / np.log2(rank + 1) for rank in hit_ranks)
        n_ideal = min(len(true_set), top_N)
        idcg = sum(1.0 / np.log2(rank + 1) for rank in range(1, n_ideal + 1))
        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)

    return float(np.mean(recalls)), float(np.mean(mrrs)), float(np.mean(ndcgs))

# POPULARITY BIAS CHECK

In [51]:
from collections import Counter

# Every training interaction, flattened across users (one entry per
# user/item pair in train_pos_sets).
all_training_interactions = list(chain.from_iterable(train_pos_sets.values()))

# Item index -> number of users having that item among their positives
movie_popularity = Counter(all_training_interactions)

# Dense score vector aligned with the item indices; items never seen in
# training keep a score of 0.
popularity_scores = np.zeros(n_items)
for movie_id, count in movie_popularity.items():
    popularity_scores[movie_id] = count

In [52]:
# Titles keyed by the original movieId
df_titles = pd.read_csv(DATA_DIR / 'Movies_final_ML.csv', usecols=['movieId','title'])
df_titles = df_titles.set_index('movieId')

# Reverse mapping: internal index -> original movieId, needed to look up titles.
# NOTE(review): popularity_scores is indexed with movie_to_local indices while
# this inverts movieId_to_idx - presumably both mappings agree; confirm.
idx_to_movieId = {idx: original_id for original_id, idx in movieId_to_idx.items()}

top_10_indices = np.argsort(-popularity_scores)[:10]    # TOP 10

for rank, movie_idx in enumerate(top_10_indices, 1):
    popularity_count = popularity_scores[movie_idx]

    original_id = idx_to_movieId.get(movie_idx, "N/A")

    # An unmapped index or an id without a title row prints a placeholder
    # instead of raising KeyError from the .loc lookup.
    if original_id in df_titles.index:
        movie_title = df_titles.loc[original_id, 'title']
    else:
        movie_title = "N/A"

    print(f"{rank:<5} {original_id:<5} {movie_title:<100} {int(popularity_count)}")

1     296   Pulp Fiction                                                                                         68297
2     318   The Shawshank Redemption                                                                             66747
3     356   Forrest Gump                                                                                         63472
4     2571  The Matrix                                                                                           63133
5     593   The Silence of the Lambs                                                                             61799
6     260   Star Wars                                                                                            55937
7     2959  Fight Club                                                                                           53458
8     527   Schindler's List                                                                                     51385
9     1196  The Empire Strikes Back             

In [53]:
# Ranking cut-offs (N) at which Recall@N, MRR@N and nDCG@N are reported.
top_ns = [5, 10, 20, 50, 100]

In [54]:
# Device string used as map_location when loading the checkpoint:
# CUDA first, then Apple MPS, else CPU.
# torch.backends.mps.is_available() is the documented MPS check; the
# torch.mps.is_available() shortcut is missing on older PyTorch builds.
if torch.cuda.is_available():
    location = 'cuda'
elif torch.backends.mps.is_available():
    location = 'mps'
else:
    location = 'cpu'

In [55]:
# Load the saved weights. weights_only=True restricts unpickling to tensors
# and plain containers, which is all load_state_dict() needs, and avoids
# executing arbitrary pickled code from the checkpoint file.
model.load_state_dict(torch.load('best_model.pt', map_location=location, weights_only=True))
model = model.to(device)

  model.load_state_dict(torch.load('best_model.pt', map_location=location))


In [None]:
'''
Check against popularity bias: compare the Two-Tower model with the random
and the popularity baselines on LOOCV for every cut-off in top_ns.
'''

# Item embeddings depend only on the model, not on the cut-off, so they are
# computed once instead of once per TOP_N.
print("Computing embeddings....")
final_item_embs = compute_item_embeddings(model, movie_loader)

try:
    for TOP_N in tqdm(top_ns):
        # heavy_evaluate() picks its scorer from these module-level flags,
        # so each run sets them explicitly.
        POPULARITY_BIAS_CHECK = False
        RANDOM_CHECK = False

        print("Evaluating Two-Tower Model on LOOCV: ")
        tt_results = heavy_evaluate(model, val_loader, final_item_embs, train_pos_sets, test_pos_loocv, TOP_N)
        print(f"Two-Tower: Recall@{TOP_N} = {tt_results[0]:.8f}, MRR@{TOP_N} = {tt_results[1]:.8f}, nDCG@{TOP_N} = {tt_results[2]:.8f}")

        print('Evaluating Random on LOOCV: ')
        RANDOM_CHECK = True
        random_results = heavy_evaluate(model, val_loader, final_item_embs, train_pos_sets, test_pos_loocv, TOP_N)
        print(f"Random: Recall@{TOP_N} = {random_results[0]:.12f}, MRR@{TOP_N} = {random_results[1]:.12f}, nDCG@{TOP_N} = {random_results[2]:.12f}")
        RANDOM_CHECK = False

        print("\nEvaluating Popularity Baseline on LOOCV: ")
        POPULARITY_BIAS_CHECK = True
        pop_results = heavy_evaluate(model, val_loader, final_item_embs, train_pos_sets, test_pos_loocv, TOP_N)
        print(f"Popularity Baseline: Recall@{TOP_N} = {pop_results[0]:.8f}, MRR@{TOP_N} = {pop_results[1]:.8f}, nDCG@{TOP_N} = {pop_results[2]:.8f}")
        POPULARITY_BIAS_CHECK = False

        print(f"Model                | Recall@{TOP_N} | MRR@{TOP_N} | nDCG@{TOP_N}")
        print(f"---------------------|---------------|------------|------------")
        print(f"Two-Tower            | {tt_results[0]:.4f}       | {tt_results[1]:.4f}    | {tt_results[2]:.4f}")
        print(f"Random               | {random_results[0]:.4f}       | {random_results[1]:.4f}    | {random_results[2]:.4f}")
        print(f"Popularity Baseline  | {pop_results[0]:.4f}       | {pop_results[1]:.4f}    | {pop_results[2]:.4f}")
finally:
    # Always leave the flags cleared, even after an error or interrupt, so a
    # later heavy_evaluate() call does not silently score a baseline.
    POPULARITY_BIAS_CHECK = False
    RANDOM_CHECK = False


  0%|          | 0/5 [00:00<?, ?it/s]

Computing embeddings....
Evaluating Two-Tower Model on LOOCV: 
