In [2]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset

from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np

from datetime import datetime
from tqdm import tqdm

import random

from sklearn.model_selection import train_test_split

In [3]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [4]:
df_users = pd.read_parquet('user_features_clean.parquet')
df_movies = pd.read_parquet('Movies_clean_Vec_v3.parquet')
df_ratings = pd.read_parquet('ratings_groupped_ids.parquet')

# Przygotowanie User Tower

In [5]:
print(df_users.info())
print(df_ratings.info())
print(df_movies.info())

empty_pos_ratings = df_ratings['pos'].apply(lambda x: len(x) == 0).sum()
empty_neg_ratings = df_ratings['neg'].apply(lambda x: len(x) == 0).sum()

if empty_pos_ratings != 0 or empty_neg_ratings != 0:
    print(f'Empty ratings: pos: {empty_pos_ratings}, neg: {empty_neg_ratings}')
    raise Exception("Users without a single pos/neg rating exist in the ratings_groupped_ids dataset")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198832 entries, 0 to 198831
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   userId                   198832 non-null  int64  
 1   num_rating               198832 non-null  float64
 2   avg_rating               198832 non-null  float64
 3   weekend_watcher          198832 non-null  float64
 4   genre_Action             198832 non-null  float64
 5   genre_Adventure          198832 non-null  float64
 6   genre_Animation          198832 non-null  float64
 7   genre_Comedy             198832 non-null  float64
 8   genre_Crime              198832 non-null  float64
 9   genre_Documentary        198832 non-null  float64
 10  genre_Drama              198832 non-null  float64
 11  genre_Family             198832 non-null  float64
 12  genre_Fantasy            198832 non-null  float64
 13  genre_History            198832 non-null  float64
 14  genr

In [6]:
# CHECKING movieID coverage

movie_ids_all = set(df_movies['movieId'])

movie_ids_ratings = set(
    m for pos, neg in zip(df_ratings['pos'], df_ratings['neg'])
    for m in list(pos) + list(neg)
)

# Tylko 33 tysiace zostaje ???
# movie_ids_users = set(
#     m for seq in df_users['movies_seq'] for m in seq
# )

valid_movie_ids = movie_ids_all & movie_ids_ratings # & movie_ids_users

print(f"Liczba wspólnych movieId: {len(valid_movie_ids)}")

df_ratings['pos'] = df_ratings['pos'].apply(lambda lst: [m for m in lst if m in valid_movie_ids])
df_ratings['neg'] = df_ratings['neg'].apply(lambda lst: [m for m in lst if m in valid_movie_ids])
df_ratings = df_ratings[(df_ratings['pos'].str.len() > 0) | (df_ratings['neg'].str.len() > 0)]

df_users['movies_seq'] = df_users['movies_seq'].apply(lambda lst: [m for m in lst if m in valid_movie_ids])
df_users = df_users[df_users['movies_seq'].str.len() > 0]

df_movies = df_movies[df_movies['movieId'].isin(valid_movie_ids)]

Liczba wspólnych movieId: 82918


In [7]:
df_movies = df_movies.reset_index(drop=True)
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82918 entries, 0 to 82917
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   movieId              82918 non-null  int64  
 1   runtime              82918 non-null  float64
 2   if_blockbuster       82918 non-null  int64  
 3   highly_watched       82918 non-null  int64  
 4   release_year         82918 non-null  float64
 5   highly_rated         82918 non-null  int64  
 6   engagement_score     82918 non-null  float64
 7   cast_importance      82918 non-null  float64
 8   director_score       82918 non-null  float64
 9   has_keywords         82918 non-null  int64  
 10  has_cast             82918 non-null  int64  
 11  has_director         82918 non-null  int64  
 12  genre_ids            82918 non-null  object 
 13  decade_[1890, 1900)  82918 non-null  bool   
 14  decade_[1900, 1910)  82918 non-null  bool   
 15  decade_[1910, 1920)  82918 non-null 

In [8]:
unique_ids = set(
        df_users['movies_seq'].explode().tolist()
        + df_ratings['pos'].explode().tolist() 
        + df_ratings['neg'].explode().tolist()
    )

print('Unique movieIds:', len(unique_ids))
unique_ids = sorted(unique_ids)

movieId_to_idx = {id_: idx for idx, id_ in enumerate(unique_ids)}
print('min idx:', min(movieId_to_idx.values()))
print('max idx:', max(movieId_to_idx.values()))

n_items = len(unique_ids)

assert min(movieId_to_idx.values()) == 0
assert max(movieId_to_idx.values()) == n_items - 1

Unique movieIds: 82918
min idx: 0
max idx: 82917


In [9]:
# Zmapuj movieId do indeksów
df_users['movies_seq'] = df_users['movies_seq'].apply(lambda lst: [movieId_to_idx[m] for m in lst])
df_ratings['pos'] = df_ratings['pos'].apply(lambda lst: [movieId_to_idx[m] for m in lst])
df_ratings['neg'] = df_ratings['neg'].apply(lambda lst: [movieId_to_idx[m] for m in lst])

# df_movies musi być ograniczone tylko do używanych filmów
df_movies = df_movies[df_movies['movieId'].isin(movieId_to_idx)]
df_movies['movie_idx'] = df_movies['movieId'].map(movieId_to_idx)

# Final sanity check
assert df_users['movies_seq'].explode().max() < n_items
assert df_ratings['pos'].explode().max() < n_items
assert df_ratings['neg'].explode().max() < n_items
assert df_movies['movie_idx'].max() < n_items

In [10]:
max_movie_idx = df_users['movies_seq'].explode().max()
print("max_movie_idx =", max_movie_idx)
print("n_items =", n_items)

assert max_movie_idx < n_items, "Indeks filmu przekracza rozmiar embeddingu"

max_movie_idx = 82917
n_items = 82918


In [11]:
import numpy as np
def has_invalid_entries(seq_col):
    return seq_col.explode().isin([-1, np.nan, None]).any()

print("Zawiera niepoprawne wartości:", has_invalid_entries(df_users['movies_seq']))


Zawiera niepoprawne wartości: False


In [12]:
# # Convert movieIds in ratings_groupped_ids to the ones accepted by nn.Embedding
# def map_list(col):
#     return [movieId_to_idx[m] for m in col]
# 
# for df, col in [
#     (df_users, 'movies_seq'),
#     (df_ratings, 'pos'),
#     (df_ratings, 'neg')]:
#     df[col] = df[col].apply(map_list)
# 
# 
# max_idx = max(movieId_to_idx.values())
# assert all(0 <= id_ <= max_idx for l in df_ratings['pos'] for id_ in l)
# assert all(0 <= id_ <= max_idx for l in df_ratings['neg'] for id_ in l)

# Z Botem cos takiego ciagle wyrzuca blad
def safe_map_list(col):
    return [movieId_to_idx.get(m, -1) for m in col]  # -1 oznacza "nieznany"

for df, col in [
    (df_users, 'movies_seq'),
    (df_ratings, 'pos'),
    (df_ratings, 'neg')]:
    df[col] = df[col].apply(safe_map_list)

# sprawdź czy masz jakiekolwiek -1
for df, col in [
    (df_users, 'movies_seq'),
    (df_ratings, 'pos'),
    (df_ratings, 'neg')]:
    assert all(m >= 0 for l in df[col] for m in l), f"🚨 Uwaga: nieznane movieId w {col}"


AssertionError: 🚨 Uwaga: nieznane movieId w movies_seq

In [None]:
df_ratings.info()

In [None]:
# class UserDataset(Dataset):
#     def __init__(self, user_features):
#         self.data = user_features
#     
#     def __len__(self):
#         return len(self.data)
# 
#     def __getitem__(self, idx):
#         return self.data.iloc[idx]

In [None]:
# n_items = len(unique_ids)
# 
# def collate_user(batch):
#     user_features, movies, ratings, timestamps, pos, neg = [], [], [], [], [], []
# 
#     for row in batch:
#         movies.append(torch.tensor(row['movies_seq'], dtype=torch.long))
#         ratings.append(torch.tensor(row['ratings_seq'], dtype=torch.float32))
#         timestamps.append(torch.tensor(row['ts_seq'], dtype=torch.float32))
# 
#         userId = row['userId']
# 
#         r = row[['num_rating', 'avg_rating', 'weekend_watcher', 'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Comedy', 'genre_Crime', 'genre_Documentary', 'genre_Drama', 'genre_Family', 'genre_Fantasy', 'genre_History', 'genre_Horror', 'genre_Music', 'genre_Mystery', 'genre_Romance', 'genre_Science Fiction', 'genre_TV Movie', 'genre_Thriller', 'genre_War', 'genre_Western', 'type_of_viewer_negative', 'type_of_viewer_neutral', 'type_of_viewer_positive']]
#         r = r.astype('float32').values
#         user_features.append(torch.tensor(r, dtype=torch.float32))
#         
#         # Get a random movieId that was rated positively and one that was rated negatively. 
#         # Used during training to calculate BPR loss. 
#         posAndNegRow = df_ratings[df_ratings['userId'] == userId].iloc[0]
#         pos.append(torch.tensor(random.choice(posAndNegRow['pos']), dtype=torch.long))
#         neg.append(torch.tensor(random.choice(posAndNegRow['neg']), dtype=torch.long))
# 
#     return {
#         "input": {
#             "user_features": torch.stack(user_features),
#             "movies": torch.stack(movies),
#             "ratings": torch.stack(ratings),
#             "timestamps": torch.stack(timestamps),
#         },
#         "pos": torch.as_tensor(pos, dtype=torch.long),
#         "neg": torch.as_tensor(neg, dtype=torch.long)
#     }

# Przygotowanie Item Tower

In [None]:
# import torch
# from torch.utils.data import Dataset
# 
# class ItemDataset(Dataset):
#     def __init__(self, df_movies, movie_id_map, features_tensor):
#         """
#         Args:
#             df_movies: DataFrame z danymi o filmach (np. z 'prepare_feature_tensor')
#             movie_id_map: Mapa identyfikatorów filmów
#             features_tensor: Tensor z cechami filmów (np. numeryczne + binarne)
#         """
#         self.df_movies = df_movies
#         self.movie_id_map = movie_id_map
#         self.features_tensor = features_tensor
# 
#     def __len__(self):
#         return len(self.df_movies)
# 
#     def __getitem__(self, idx):
#         movie_data = self.df_movies.iloc[idx]
# 
#         movie_id = movie_data['movieId']
#         features = self.features_tensor[self.movie_id_map[movie_id]]  # Pobieramy cechy filmu
#         genre_ids = movie_data['genre_ids']
#         actor_ids = movie_data['actor_ids']
#         director_ids = movie_data['director_ids']
#         text_embedded = movie_data['text_embedded']
# 
#         # Przygotowanie danych w słowniku
#         data = {
#             'movieId': movie_id,
#             'features': features,
#             'genre_ids': genre_ids,
#             'actor_ids': actor_ids,
#             'director_ids': director_ids,
#             'text_embedded': text_embedded
#         }
# 
#         return data


In [None]:
# def collate_item(batch):
#     movies, features, genre_ids, actor_ids, director_ids = [], [], [], [], []
# 
#     for row in batch:
#         movies.append(torch.tensor(row['movieId'], dtype=torch.long))  # Movie ID
#         features.append(torch.tensor(row['features'], dtype=torch.float32))  # Movie Features (num+binary)
#         genre_ids.append(torch.tensor(row['genre_ids'], dtype=torch.long))  # Genre IDs
#         actor_ids.append(torch.tensor(row['actor_ids'], dtype=torch.long))  # Actor IDs
#         director_ids.append(torch.tensor(row['director_ids'], dtype=torch.long))  # Director IDs
#     
#     return {
#         "movies": torch.stack(movies),
#         "features": torch.stack(features),
#         "genre_ids": torch.stack(genre_ids),
#         "actor_ids": torch.stack(actor_ids),
#         "director_ids": torch.stack(director_ids)
#     }


# Przygotowanie danych

In [None]:
def prepare_feature_tensor(df_movies: pd.DataFrame):
    import ast
    # df_movies = df_movies.set_index("movieId").copy()

    for col in ['text_embedded', 'genre_ids', 'actor_ids', 'director_ids']:
        df_movies[col] = df_movies[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    # Mapowania ID
    all_actor_ids = set(i for sub in df_movies['actor_ids'] for i in sub)
    all_director_ids = set(i for sub in df_movies['director_ids'] for i in sub)
    all_genre_ids = set(i for sub in df_movies['genre_ids'] for i in sub)

    actor_id_map = {aid: idx for idx, aid in enumerate(sorted(all_actor_ids))}
    director_id_map = {did: idx for idx, did in enumerate(sorted(all_director_ids))}
    genre_id_map = {gid: idx for idx, gid in enumerate(sorted(all_genre_ids))}

    # EmbeddingBag
    def make_bag_inputs(id_lists, id_map):
        flat = []
        offsets = [0]
        for lst in id_lists:
            mapped = [id_map.get(i, 0) for i in lst]
            flat.extend(mapped)
            offsets.append(len(flat))
        return torch.tensor(flat, dtype=torch.long), torch.tensor(offsets[:-1], dtype=torch.long)

    actor_idx_bag, actor_offsets = make_bag_inputs(df_movies['actor_ids'], actor_id_map)
    director_idx_bag, director_offsets = make_bag_inputs(df_movies['director_ids'], director_id_map)
    genre_idx_bag, genre_offsets = make_bag_inputs(df_movies['genre_ids'], genre_id_map)

    text_tensor = np.stack(df_movies['text_embedded'].apply(np.array).to_list())

    numeric_cols = ['runtime', 'engagement_score', 'cast_importance', 'director_score', 'release_year']
    binary_cols = ['if_blockbuster', 'highly_watched', 'highly_rated', 'has_keywords', 'has_cast', 'has_director']
    decade_cols = [col for col in df_movies.columns if col.startswith("decade_")]

    num_bin_tensor = df_movies[numeric_cols + binary_cols + decade_cols].astype(np.float32).values
    full_features = np.hstack([num_bin_tensor, text_tensor])
    features_tensor = torch.tensor(full_features, dtype=torch.float32)

    if torch.isnan(features_tensor).any():
        print("NaN in feature tensor!")
        features_tensor = torch.nan_to_num(features_tensor)

    # movie_id_map = {mid: idx for idx, mid in enumerate(df_movies.index)}

    return (features_tensor,
            actor_idx_bag, actor_offsets,
            director_idx_bag, director_offsets,
            genre_idx_bag, genre_offsets,
            len(actor_id_map), len(director_id_map), len(genre_id_map))

In [None]:
# BATCH_SIZE = 4096
# 
# train_df, test_df = train_test_split(df_ratings, test_size=0.2, random_state=42)
# 
# movie_features, movie_id_map, actor_idx_bag, actor_offsets, director_idx_bag, director_offsets, genre_idx_bag, genre_offsets, num_actors, num_directors, num_genres = prepare_feature_tensor(df_movies)
# 
# # Dataset dla obu wież
# trainDataset = UserDataset(train_df)
# train_user_features = DataLoader(trainDataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_user)
# trainItemDataset = ItemDataset(df_movies, movie_id_map, movie_features)
# trainUserDataset = UserDataset(train_user_features)
# 
# # DataLoader dla Item Tower
# trainItemDataLoader = DataLoader(trainItemDataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_item)
# # DataLoader dla User Tower
# trainUserDataLoader = DataLoader(trainUserDataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_user)

In [None]:
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.mps.is_available():
    device = torch.device('mps')

print('Device:', device)

# Struktura TWO TOWER

In [None]:
class UserDataset(Dataset):
    def __init__(self, df_users):
        self.data = df_users
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data.iloc[idx]

In [None]:
n_items = len(unique_ids)

def collate_fn(batch):
    df_users, movies, ratings, timestamps, pos, neg = [], [], [], [], [], []

    for row in batch:
        movies.append(torch.tensor(row['movies_seq'], dtype=torch.long))
        ratings.append(torch.tensor(row['ratings_seq'], dtype=torch.float32))
        timestamps.append(torch.tensor(row['ts_seq'], dtype=torch.float32))

        userId = row['userId']

        r = row[['num_rating', 'avg_rating', 'weekend_watcher', 'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Comedy', 'genre_Crime', 'genre_Documentary', 'genre_Drama', 'genre_Family', 'genre_Fantasy', 'genre_History', 'genre_Horror', 'genre_Music', 'genre_Mystery', 'genre_Romance', 'genre_Science Fiction', 'genre_TV Movie', 'genre_Thriller', 'genre_War', 'genre_Western', 'type_of_viewer_negative', 'type_of_viewer_neutral', 'type_of_viewer_positive']]
        r = r.astype('float32').values
        
        df_users.append(torch.tensor(r, dtype=torch.float32))
        
        # Get a random movieId that was rated positively and one that was rated negatively. 
        # Used during training to calculate BPR loss. 
        posAndNegRow = df_ratings[df_ratings['userId'] == userId].iloc[0]
        pos.append(torch.tensor(random.choice(posAndNegRow['pos']), dtype=torch.long))
        neg.append(torch.tensor(random.choice(posAndNegRow['neg']), dtype=torch.long))

    return {
        "input": {
            "df_users": torch.stack(df_users),
            "movies": torch.stack(movies),
            "ratings": torch.stack(ratings),
            "timestamps": torch.stack(timestamps),
        },
        "pos": torch.as_tensor(pos, dtype=torch.long),
        "neg": torch.as_tensor(neg, dtype=torch.long)
    }

In [None]:
# class TwoTowerDataset(Dataset):
#     def __init__(self, df_ratings, df_movies, df_users, movie_id_map, features_tensor):
#         """
#         Args:
#             df_ratings: DataFrame zawierający dane o ocenach
#             df_movies: DataFrame zawierający dane o filmach
#             df_users: DataFrame zawierający cechy użytkowników
#             movie_id_map: Mapa identyfikatorów filmów
#             features_tensor: Tensor z cechami filmów
#         """
#         self.df_ratings = df_ratings
#         self.df_movies = df_movies
#         self.df_users = df_users
#         self.movie_id_map = movie_id_map
#         self.features_tensor = features_tensor
# 
#     def __len__(self):
#         return len(self.df_ratings)
# 
#     def __getitem__(self, idx):
#         rating_data = self.df_ratings.iloc[idx]
#         user_data = self.df_users.iloc[idx]
# 
#         user_features = np.array(user_data[[
#             'num_rating', 'avg_rating', 'weekend_watcher',
#             'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Comedy',
#             'genre_Crime', 'genre_Documentary', 'genre_Drama', 'genre_Family',
#             'genre_Fantasy', 'genre_History', 'genre_Horror', 'genre_Music',
#             'genre_Mystery', 'genre_Romance', 'genre_Science Fiction',
#             'genre_TV Movie', 'genre_Thriller', 'genre_War', 'genre_Western',
#             'type_of_viewer_negative', 'type_of_viewer_neutral', 'type_of_viewer_positive'
#         ]].values, dtype=np.float32)
# 
#         # movie_id = rating_data['movieId']
#         # features = self.features_tensor[self.movie_id_map[movie_id]]
#         pos_id = random.choice(rating_data['pos'])
#         neg_id = random.choice(rating_data['neg'])
# 
#         return {
#             'userId': rating_data['userId'],
#             'user_features': torch.from_numpy(user_features).view(-1),
#             'pos': pos_id,
#             'neg': neg_id,
#             'movies_seq': torch.tensor(user_data['movies_seq'], dtype=torch.long),
#             'ratings_seq': torch.tensor(user_data['ratings_seq'], dtype=torch.float32),
#             'ts_seq': torch.tensor(user_data['ts_seq'], dtype=torch.float32),
#             'pos_features': self.features_tensor[pos_id],
#             'neg_features': self.features_tensor[neg_id],
#     }

In [None]:
# class TwoTowerDataset(Dataset):
#     def __init__(self, df_ratings, df_users):
#         self.df_ratings = df_ratings.reset_index(drop=True)
#         self.df_users = df_users.set_index("userId")
# 
#     def __len__(self):
#         return len(self.df_ratings)
# 
#     def __getitem__(self, idx):
#         rating_data = self.df_ratings.iloc[idx]
#         user_id = rating_data['userId']
#         user_data = self.df_users.loc[user_id]
# 
#         user_features = np.array(user_data[[
#             'num_rating', 'avg_rating', 'weekend_watcher',
#             'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Comedy',
#             'genre_Crime', 'genre_Documentary', 'genre_Drama', 'genre_Family',
#             'genre_Fantasy', 'genre_History', 'genre_Horror', 'genre_Music',
#             'genre_Mystery', 'genre_Romance', 'genre_Science Fiction',
#             'genre_TV Movie', 'genre_Thriller', 'genre_War', 'genre_Western',
#             'type_of_viewer_negative', 'type_of_viewer_neutral', 'type_of_viewer_positive'
#         ]], dtype=np.float32)
# 
#         return {
#             'userId': user_id,
#             'user_features': torch.from_numpy(user_features).view(-1),
#             'movies_seq': torch.tensor(user_data['movies_seq'], dtype=torch.long),
#             'ratings_seq': torch.tensor(user_data['ratings_seq'], dtype=torch.float32),
#             'ts_seq': torch.tensor(user_data['ts_seq'], dtype=torch.float32),
#             'pos': torch.tensor(random.choice(rating_data['pos']), dtype=torch.long),
#             'neg': torch.tensor(random.choice(rating_data['neg']), dtype=torch.long)
#         }


In [None]:
# def collate_fn(batch):
#     user_features, movies, ratings, timestamps, pos, neg = [], [], [], [], [], []
# 
#     for row in batch:
#         user_features.append(row['user_features'])
#         movies.append(row['movies_seq'])
#         ratings.append(row['ratings_seq'])
#         timestamps.append(row['ts_seq'])
#         pos.append(row['pos'])
#         neg.append(row['neg'])
# 
#         # userId = row['userId']
#         # posAndNegRow = df_ratings[df_ratings['userId'] == userId].iloc[0]
#         # pos.append(torch.tensor(random.choice(posAndNegRow['pos']), dtype=torch.long))
#         # neg.append(torch.tensor(random.choice(posAndNegRow['neg']), dtype=torch.long))
#         pos.append(torch.tensor(row['pos'], dtype=torch.long))
#         neg.append(torch.tensor(row['neg'], dtype=torch.long))
# 
#     return {
#         "user_features": torch.stack(user_features),
#         "movies": torch.nn.utils.rnn.pad_sequence(movies, batch_first=True),
#         "ratings": torch.nn.utils.rnn.pad_sequence(ratings, batch_first=True),
#         "timestamps": torch.nn.utils.rnn.pad_sequence(timestamps, batch_first=True),
#         "pos": torch.stack(pos),
#         "neg": torch.stack(neg)
#     }

    # return {
    #     "user_features": torch.stack(user_features),
    #     "movie_ids": torch.stack(movie_ids),
    #     "movie_features": torch.stack(movie_features),
    #     "ratings": torch.stack(ratings),
    #     "timestamps": torch.stack(timestamps),
    #     # "pos": torch.as_tensor(pos, dtype=torch.long),
    #     # "neg": torch.as_tensor(neg, dtype=torch.long)
    #     "pos": torch.stack(pos),
    #     "neg": torch.stack(neg)
    # }

In [None]:
class ItemTower(nn.Module):
    def __init__(self, input_dim, embedding_dim=64, num_actors=10000, num_directors=5000, num_genres=19):
        super(ItemTower, self).__init__()
        self.actor_embedding = nn.EmbeddingBag(num_actors, 32, mode='mean')
        self.director_embedding = nn.EmbeddingBag(num_directors, 32, mode='mean')
        self.genre_embedding = nn.EmbeddingBag(num_genres, 16, mode='mean')

        self.model = nn.Sequential(
            nn.Linear(input_dim + 32 + 32 + 16, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, embedding_dim)
        )

    def forward(self, x, actor_bag, actor_offsets,
                      director_bag, director_offsets,
                      genre_bag, genre_offsets):
        actor_emb = self.actor_embedding(actor_bag, actor_offsets)
        director_emb = self.director_embedding(director_bag, director_offsets)
        genre_emb = self.genre_embedding(genre_bag, genre_offsets)

        x = torch.cat([x, actor_emb, director_emb, genre_emb], dim=1)
        return self.model(x)

class UserTower(nn.Module):
    def __init__(self, input_dim, embedding_dim=64, n_items=1000):
        super(UserTower, self).__init__()

        # Item Embeddings for User History
        self.item_emb = nn.Embedding(n_items, embedding_dim)
        
        # A layer to project rating and timestamp into a scalar weight
        self.rating_proj = nn.Linear(2, 1)

        self.mlp = nn.Sequential(
            nn.Linear(input_dim + embedding_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 384),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(384, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, embedding_dim)
        )
    
    def forward(self, batch):
        # Embed movieIds liked by user
        m = self.item_emb(batch['movies'])

        # Get weights from rating and timestamp
        x = torch.stack([batch['ratings'], batch['timestamps']], dim=-1)
        w = torch.sigmoid(self.rating_proj(x))

        # weighted mean-pool
        pooled = (m * w).sum(1) / (w.sum(1).clamp_min(1e-6))

        input = torch.cat([batch['df_users'], pooled], dim=-1)
        output = self.mlp(input)
        u = F.normalize(output, dim=1)
        return u


# TRENING

In [None]:
def precision_at_k(true_item, recommended_items, k):
    return int(true_item in recommended_items[:k]) / k

def recall_at_k(true_item, recommended_items, k):
    return int(true_item in recommended_items[:k]) / 1

def mrr(true_item, recommended_items):
    if true_item in recommended_items:
        return 1 / (recommended_items.index(true_item) + 1)
    return 0

def ndcg_at_k(true_item, recommended_items, k):
    if true_item in recommended_items[:k]:
        rank = recommended_items.index(true_item)
        return 1 / np.log2(rank + 2)
    return 0.0

def leave_one_out_split(user_item_dict):
    train_dict = {}
    test_dict = {}
    for user, items in user_item_dict.items():
        if isinstance(items, np.ndarray):
            items = items.tolist()
        if len(items) < 2:
            continue
        train_dict[user] = items[:-1]
        test_dict[user] = items[-1]
    return train_dict, test_dict

In [None]:
@torch.no_grad()
def evaluate_model_from_df(df_ratings, item_embeddings, k=10, similarity='dot', max_users=1000):

    grouped = df_ratings[['userId', 'pos']].drop_duplicates('userId')

    test_items = {}
    train_items = {}

    for _, row in grouped.iterrows():
        user_id = row['userId']
        positives = row['pos']
        if len(positives) < 2:
            continue
        train_items[user_id] = positives[:-1]
        test_items[user_id] = positives[-1]

    if max_users and len(test_items) > max_users:
        sampled_users = random.sample(list(test_items.keys()), k=max_users)
        test_items = {u: test_items[u] for u in sampled_users}
        train_items = {u: train_items[u] for u in sampled_users}

    item_embeddings = torch.tensor(item_embeddings, dtype=torch.float32)
    item_embeddings = item_embeddings / item_embeddings.norm(dim=1, keepdim=True)

    precisions, recalls, mrrs, ndcgs = [], [], [], []

    for user_id, true_item in test_items.items():
        history = train_items.get(user_id, [])
        if not history:
            continue

        user_vec = item_embeddings[history].mean(dim=0, keepdim=True)

        if similarity == 'dot':
            scores = torch.matmul(item_embeddings, user_vec.T).squeeze()
        elif similarity == 'cosine':
            scores = F.cosine_similarity(item_embeddings, user_vec)
        else:
            raise ValueError("similarity must be 'dot' or 'cosine'")

        scores[history] = -1e9  # maskowanie historii
        top_k = torch.topk(scores, k=k).indices.tolist()

        precisions.append(precision_at_k(true_item, top_k, k))
        recalls.append(recall_at_k(true_item, top_k, k))
        mrrs.append(mrr(true_item, top_k))
        ndcgs.append(ndcg_at_k(true_item, top_k, k))

    return {
        "Precision@K": np.mean(precisions) if precisions else 0.0,
        "Recall@K": np.mean(recalls) if recalls else 0.0,
        "MRR": np.mean(mrrs) if mrrs else 0.0,
        "nDCG@K": np.mean(ndcgs) if ndcgs else 0.0
    }


In [None]:
# BATCH_SIZE = 4096
# 
# from sklearn.model_selection import train_test_split
# 
# train_u, test_u = train_test_split(user_features, test_size=0.2)
# 
# trainDataset = UserDataset(train_u)
# trainDataLoader = DataLoader(trainDataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_user)
# 
# testDataset = UserDataset(test_u)
# testDataLoader = DataLoader(testDataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_user)

In [None]:
# from sklearn.model_selection import train_test_split
# 
# # Podziel dane na zestawy treningowe i testowe
# train_i, test_i = train_test_split(item_features, test_size=0.2, random_state=42)
# 
# # Przygotuj odpowiednie dane wejściowe
# movie_features, movie_id_map, actor_idx_bag, actor_offsets, director_idx_bag, director_offsets, genre_idx_bag, genre_offsets, num_actors, num_directors, num_genres = prepare_feature_tensor(df_movies)
# 
# # Stwórz dataset i dataloader dla zestawu treningowego
# trainDataset = ItemDataset(train_i, movie_features, movie_id_map)
# trainDataLoader = DataLoader(trainDataset, batch_size=4096, shuffle=True, collate_fn=collate_item)
# 
# # Stwórz dataset i dataloader dla zestawu testowego
# testDataset = ItemDataset(test_i, movie_features, movie_id_map)
# testDataLoader = DataLoader(testDataset, batch_size=4096, shuffle=True, collate_fn=collate_item)

In [None]:
# device = torch.device('cpu')
# if torch.cuda.is_available():
#     device = torch.device('cuda')
# elif torch.mps.is_available():
#     device = torch.device('mps')
# print('Device:', device)

In [None]:
BATCH_SIZE = 4096

from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df_users, test_size=0.2)

movie_features, actor_idx_bag, actor_offsets, director_idx_bag, director_offsets, genre_idx_bag, genre_offsets, num_actors, num_directors, num_genres = prepare_feature_tensor(df_movies)

trainDataset = UserDataset(train_df)
trainDataLoader = DataLoader(trainDataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

testDataset = UserDataset(test_df)
testDataLoader = DataLoader(testDataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

In [None]:
assert movie_features.shape[0] >= n_items

In [None]:
def get_embedding_bag_inputs(indices, bag_tensor, offset_tensor):
    new_offsets = []
    new_bag = []
    offset = 0
    for i in indices:
        i = i.item()
        start = offset_tensor[i].item()
        end = offset_tensor[i + 1].item() if i + 1 < len(offset_tensor) else len(bag_tensor)
        segment = bag_tensor[start:end]
        new_bag.extend(segment.tolist())
        new_offsets.append(offset)
        offset += len(segment)
    return torch.tensor(new_bag, dtype=torch.long), torch.tensor(new_offsets, dtype=torch.long)


In [None]:
def to_device(data, device):
    if isinstance(data, dict):
        return {k: to_device(v, device) for k, v in data.items()}
    elif torch.is_tensor(data):
        return data.to(device)
    else:
        return data

def train_one_epoch_two_tower(user_tower, item_tower, data_loader, optimizer, device, movie_features,
                              actor_idx_bag, actor_offsets,
                              director_idx_bag, director_offsets,
                              genre_idx_bag, genre_offsets):
    
    user_tower.train()
    item_tower.train()
    running_loss = 0.0
    total = 0
    
    movie_features = movie_features.to(device)
    actor_idx_bag = actor_idx_bag.to(device)
    actor_offsets = actor_offsets.to(device)
    director_idx_bag = director_idx_bag.to(device)
    director_offsets = director_offsets.to(device)
    genre_idx_bag = genre_idx_bag.to(device)
    genre_offsets = genre_offsets.to(device)

    for batch in data_loader:
        batch = to_device(batch, device)
        optimizer.zero_grad()

        user_vec = user_tower(batch['input'])

        actor_pos_bag, actor_pos_offsets = get_embedding_bag_inputs(batch['pos'], actor_idx_bag, actor_offsets)
        director_pos_bag, director_pos_offsets = get_embedding_bag_inputs(batch['pos'], director_idx_bag, director_offsets)
        genre_pos_bag, genre_pos_offsets = get_embedding_bag_inputs(batch['pos'], genre_idx_bag, genre_offsets)

        actor_neg_bag, actor_neg_offsets = get_embedding_bag_inputs(batch['neg'], actor_idx_bag, actor_offsets)
        director_neg_bag, director_neg_offsets = get_embedding_bag_inputs(batch['neg'], director_idx_bag, director_offsets)
        genre_neg_bag, genre_neg_offsets = get_embedding_bag_inputs(batch['neg'], genre_idx_bag, genre_offsets)

        pos_vec = item_tower(movie_features[batch['pos']], actor_pos_bag.to(device), actor_pos_offsets.to(device),
                             director_pos_bag.to(device), director_pos_offsets.to(device),
                             genre_pos_bag.to(device), genre_pos_offsets.to(device))
        
        neg_vec = item_tower(movie_features[batch['neg']], actor_neg_bag.to(device), actor_neg_offsets.to(device),
                             director_neg_bag.to(device), director_neg_offsets.to(device),
                             genre_neg_bag.to(device), genre_neg_offsets.to(device))


        pos_score = (user_vec * pos_vec).sum(dim=-1)
        neg_score = (user_vec * neg_vec).sum(dim=-1)

        loss = -F.logsigmoid(pos_score - neg_score).mean()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        total += 1

    return running_loss / total

In [None]:
user_tower = UserTower(input_dim=25, n_items=n_items).to(device)
item_tower = ItemTower(input_dim=movie_features.shape[1]).to(device)

params = list(user_tower.parameters()) + list(item_tower.parameters())
optimizer = optim.Adam(params, lr=1e-3)

In [None]:
# import torch
# import torch.nn.functional as F
# from sklearn.metrics import roc_auc_score
# import numpy as np
# 
# def evaluate_two_tower(user_tower, item_tower, test_loader, movie_features, device, every_k_epochs, current_epoch, k=10):
#     if current_epoch % every_k_epochs != every_k_epochs - 1:
#         return None
# 
#     user_tower.eval()
#     item_tower.eval()
# 
#     aucs = []
#     pair_accs = []
# 
#     with torch.no_grad():
#         for batch in test_loader:
#             batch = to_device(batch, device)
# 
#             u = user_tower({
#                 "user_features": batch['user_features'],
#                 "movies": batch['movies'],  # zakładam, że 'movies' == batch['pos']
#                 "ratings": batch['ratings'],
#                 "timestamps": batch['timestamps']
#             })
# 
#             pos_vec = item_tower(movie_features[batch['pos']])
#             neg_vec = item_tower(movie_features[batch['neg']])
# 
#             pos_score = (u * pos_vec).sum(dim=-1)
#             neg_score = (u * neg_vec).sum(dim=-1)
# 
#             labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
#             scores = torch.cat([pos_score, neg_score])
#             aucs.append(roc_auc_score(labels.cpu(), scores.cpu()))
# 
#             acc = (pos_score > neg_score).float().mean().item()
#             pair_accs.append(acc)
# 
#     # Ranking eval
#     item_embeddings = item_tower(movie_features).cpu().detach().numpy()
#     rank_metrics = evaluate_model_from_df(
#         df_ratings=test_df,
#         item_embeddings=item_embeddings,
#         k=k,
#         similarity='cosine',
#         max_users=1000
#     )
# 
#     return {
#         "roc_auc": float(np.mean(aucs)),
#         "pairwise_acc": float(np.mean(pair_accs)),
#         **rank_metrics
#     }

In [None]:
# from datetime import datetime
# from sklearn.metrics import roc_auc_score
# import numpy as np
# import torch
# import torch.nn.functional as F
# from tqdm import tqdm
# 
# def train_loop_two_tower(user_tower, item_tower, train_loader, test_loader, optimizer,
#                          movie_features_tensor, device, epochs=50, eval_every=5):
#     
#     movie_features_tensor = movie_features_tensor.to(device)
# 
#     def to_device(data, device):
#         if isinstance(data, dict):
#             return {k: to_device(v, device) for k, v in data.items()}
#         elif torch.is_tensor(data):
#             return data.to(device)
#         else:
#             return data
# 
#     for epoch in tqdm(range(1, epochs + 1), desc="Epochs"):
#         user_tower.train()
#         item_tower.train()
#         running_loss = 0.0
#         total = 0
# 
#         for batch in train_loader:
#             batch = to_device(batch, device)
# 
#             optimizer.zero_grad()
# 
#             user_vec = user_tower({
#                 "user_features": batch['user_features'],
#                 "movies": batch['movie_ids'],
#                 "ratings": batch['ratings'],
#                 "timestamps": batch['timestamps']
#             })
# 
#             pos_vec = item_tower(movie_features_tensor[batch['movie_ids']])
# 
#             # For simplicity, use shifted movieIds as negatives (in real use: sample negatives)
#             neg_ids = (batch['movie_ids'] + 1) % movie_features_tensor.shape[0]
#             neg_vec = item_tower(movie_features_tensor[neg_ids])
# 
#             pos_score = (user_vec * pos_vec).sum(dim=-1)
#             neg_score = (user_vec * neg_vec).sum(dim=-1)
#             loss = -F.logsigmoid(pos_score - neg_score).mean()
#             loss.backward()
#             optimizer.step()
# 
#             running_loss += loss.item()
#             total += 1
# 
#         avg_loss = running_loss / total
#         print(f"Epoch {epoch} loss: {avg_loss:.4f}")
# 
#         if epoch % eval_every == 0:
#             user_tower.eval()
#             item_tower.eval()
#             aucs = []
#             accs = []
# 
#             with torch.no_grad():
#                 for batch in test_loader:
#                     batch = to_device(batch, device)
#                     
#                     user_vec = user_tower({
#                         "user_features": batch['user_features'],
#                         "movies": batch['movie_ids'],
#                         "ratings": batch['ratings'],
#                         "timestamps": batch['timestamps']
#                     })
# 
#                     pos_vec = item_tower(movie_features_tensor[batch['movie_ids']])
#                     neg_ids = (batch['movie_ids'] + 1) % movie_features_tensor.shape[0]
#                     neg_vec = item_tower(movie_features_tensor[neg_ids])
# 
#                     pos_score = (user_vec * pos_vec).sum(dim=-1)
#                     neg_score = (user_vec * neg_vec).sum(dim=-1)
# 
#                     labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
#                     scores = torch.cat([pos_score, neg_score])
#                     auc = roc_auc_score(labels.cpu(), scores.cpu())
#                     aucs.append(auc)
# 
#                     acc = (pos_score > neg_score).float().mean().item()
#                     accs.append(acc)
# 
#             print(f"Epoch {epoch}. ROC AUC: {float(np.mean(aucs)):.4f}, Pairwise Acc: {float(np.mean(accs)):.4f}")


In [None]:
# from tqdm import tqdm
# from datetime import datetime
# 
# EPOCHS = 50
# EVAL_EVERY = 5
# 
# timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# 
# for epoch in tqdm(range(EPOCHS), desc="🚀 Training Two-Tower"):
#     avg_loss = train_one_epoch_two_tower(
#         user_tower=user_tower,
#         item_tower=item_tower,
#         data_loader=trainDataLoader,
#         optimizer=optimizer,
#         device=device,
#         movie_features=movie_features
#     )
#     
#     print(f"[Epoch {epoch + 1}] 🔧 Loss: {avg_loss:.4f}")
# 
#     if epoch % EVAL_EVERY == (EVAL_EVERY - 1):
#         # === punktowa ewaluacja
#         user_tower.eval()
#         item_tower.eval()
# 
#         with torch.no_grad():
#             item_emb = item_tower(movie_features.to(device)).cpu().numpy()
# 
#         rank_metrics = evaluate_model_from_df(
#             df_ratings=test_df,
#             item_embeddings=item_emb,
#             k=10,
#             similarity='cosine',
#             max_users=1000
#         )
# 
#         print(f"[Epoch {epoch + 1}] 📊 Ranking Eval:")
#         print(f"  Precision@K:   {rank_metrics['Precision@K']:.4f}")
#         print(f"  Recall@K:      {rank_metrics['Recall@K']:.4f}")
#         print(f"  MRR:           {rank_metrics['MRR']:.4f}")
#         print(f"  nDCG@K:        {rank_metrics['nDCG@K']:.4f}")


In [None]:
from tqdm import tqdm
from datetime import datetime
from sklearn.metrics import roc_auc_score

EPOCHS = 50
EVAL_EVERY = 5
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

for epoch in tqdm(range(EPOCHS), desc="Training Two-Tower"):
    avg_loss = train_one_epoch_two_tower(
        user_tower=user_tower,
        item_tower=item_tower,
        data_loader=trainDataLoader,
        optimizer=optimizer,
        device=device,
        movie_features = movie_features,
        actor_idx_bag=actor_idx_bag,
        actor_offsets=actor_offsets,
        director_idx_bag=director_idx_bag,
        director_offsets=director_offsets,
        genre_idx_bag=genre_idx_bag,
        genre_offsets=genre_offsets
    )
    
    print(f"[Epoch {epoch + 1}] | Loss: {avg_loss:.4f}")

    if epoch % EVAL_EVERY == (EVAL_EVERY - 1):
        user_tower.eval()
        item_tower.eval()

        aucs, pair_accs = [], []

        with torch.no_grad():
            item_emb = item_tower(movie_features.to(device)).cpu().detach().numpy()

            for batch in testDataLoader:
                batch = to_device(batch, device)
                optimizer.zero_grad()

                u = user_tower(batch['input'])

                actor_pos_bag, actor_pos_offsets = get_embedding_bag_inputs(batch['pos'], actor_idx_bag, actor_offsets)
                director_pos_bag, director_pos_offsets = get_embedding_bag_inputs(batch['pos'], director_idx_bag,director_offsets)
                genre_pos_bag, genre_pos_offsets = get_embedding_bag_inputs(batch['pos'], genre_idx_bag,genre_offsets)

                actor_neg_bag, actor_neg_offsets = get_embedding_bag_inputs(batch['neg'], actor_idx_bag,actor_offsets)
                director_neg_bag, director_neg_offsets = get_embedding_bag_inputs(batch['neg'], director_idx_bag, director_offsets)
                genre_neg_bag, genre_neg_offsets = get_embedding_bag_inputs(batch['neg'], genre_idx_bag, genre_offsets)

                pos_vec = item_tower(movie_features[batch['pos']], actor_pos_bag.to(device), actor_pos_offsets.to(device),
                                     director_pos_bag.to(device), director_pos_offsets.to(device),
                                     genre_pos_bag.to(device), genre_pos_offsets.to(device))
                
                neg_vec = item_tower(movie_features[batch['neg']], actor_neg_bag.to(device), actor_neg_offsets.to(device),
                                     director_neg_bag.to(device), director_neg_offsets.to(device),
                                     genre_neg_bag.to(device), genre_neg_offsets.to(device))

                pos_score = (u * pos_vec).sum(dim=-1)
                neg_score = (u * neg_vec).sum(dim=-1)

                labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
                scores = torch.cat([pos_score, neg_score])

                aucs.append(roc_auc_score(labels.cpu(), scores.cpu()))
                pair_accs.append((pos_score > neg_score).float().mean().item())

        print(f"[Epoch {epoch + 1}] Pointwise Eval:")
        print(f"  ROC AUC:       {np.mean(aucs):.4f}")
        print(f"  Pairwise Acc:  {np.mean(pair_accs):.4f}")

        rank_metrics = evaluate_model_from_df(
            df_ratings=test_df,
            item_embeddings=item_emb,
            k=10,
            similarity='cosine',
            max_users=1000
        )

        print(f"[Epoch {epoch + 1}] Ranking Eval:")
        print(f"  Precision@K:   {rank_metrics['Precision@K']:.4f}")
        print(f"  Recall@K:      {rank_metrics['Recall@K']:.4f}")
        print(f"  MRR:           {rank_metrics['MRR']:.4f}")
        print(f"  nDCG@K:        {rank_metrics['nDCG@K']:.4f}")


In [None]:
df_users.info()