In [15]:
import pandas as pd

movies = pd.read_hdf('Movies_clean_zlib.h5')

print(movies.info())
print(movies.head())

<class 'pandas.core.frame.DataFrame'>
Index: 86477 entries, 0 to 86476
Columns: 5001 entries, movieId to PC5000
dtypes: float32(5001)
memory usage: 1.6 GB
None
    movieId       PC1       PC2       PC3       PC4       PC5       PC6  \
0   79132.0  0.866303 -0.642829  0.281123 -0.803036 -0.493110  0.421818   
1  109487.0  0.861188 -0.806149 -0.407571 -0.395517  0.043821 -0.121446   
2   58559.0  0.883406 -1.049750 -0.664226 -0.505821 -0.593852  0.191181   
3   72998.0  0.981846 -0.575834  0.299759 -0.555580 -0.675494  0.074894   
4   89745.0  0.892868 -0.538894  0.351593 -0.502740 -0.644292  0.130559   

        PC7       PC8       PC9  ...    PC4991    PC4992    PC4993    PC4994  \
0 -0.442064 -0.070447  0.950790  ... -0.005973  0.005530  0.039145 -0.012905   
1 -0.135562 -0.076029  0.408026  ... -0.000813 -0.015787 -0.009785 -0.000321   
2  0.182367  0.074762  0.877157  ...  0.003361  0.024195 -0.043109 -0.014476   
3 -0.273699 -0.186075  0.620269  ... -0.061486 -0.025411 -0.024266 -0

In [16]:
# data_loading.py

import pandas as pd
import torch
from sklearn.model_selection import train_test_split

def load_data(pca_path: str, ratings_path: str):
    df_movies = pd.read_hdf(pca_path)  # zakładamy kolumny: movieId, PC1...PC5000
    df_ratings = pd.read_parquet(ratings_path, columns=['userId', 'movies_seq', 'ratings_seq'])
    return df_movies, df_ratings

def prepare_feature_tensor(df_movies: pd.DataFrame):
    df_movies = df_movies.set_index("movieId")
    movie_id_map = {mid: idx for idx, mid in enumerate(df_movies.index)}
    features_tensor = torch.tensor(df_movies.values, dtype=torch.float32)
    return features_tensor, movie_id_map

def split_data(df_ratings, movie_id_map, rating_threshold=4.0):
    user_dict = {}
    for _, row in df_ratings.iterrows():
        user = row['userId']
        movie_ids = row['movies_seq']
        ratings = row['ratings_seq']
        filtered = [movie_id_map[mid] for mid, r in zip(movie_ids, ratings)
                    if r >= rating_threshold and mid in movie_id_map]
        if len(filtered) >= 2:
            user_dict[user] = filtered

    users = list(user_dict.keys())
    train_users, test_users = train_test_split(users, test_size=0.2, random_state=42)
    val_users, test_users = train_test_split(test_users, test_size=0.5, random_state=42)

    train_dict = {u: user_dict[u] for u in train_users}
    val_dict = {u: user_dict[u] for u in val_users}
    test_dict = {u: user_dict[u] for u in test_users}
    return train_dict, val_dict, test_dict


In [17]:
import random
import numpy as np
from torch.utils.data import Dataset

class ItemTripletDataset(Dataset):
    def __init__(self, user_item_dict, item_pool, max_users=None, triplets_per_user=10):
        if max_users is not None:
            sampled_users = random.sample(list(user_item_dict.items()), k=min(max_users, len(user_item_dict)))
        else:
            sampled_users = list(user_item_dict.items())

        self.user_item_dict = sampled_users
        self.item_pool = np.array(item_pool)
        self.triplets_per_user = triplets_per_user

    def __len__(self):
        return len(self.user_item_dict) * self.triplets_per_user

    def __getitem__(self, idx):
        user_idx = idx // self.triplets_per_user
        user_id, items = self.user_item_dict[user_idx]
        items_set = set(items)

        anchor = random.choice(items)
        positive_pool = [i for i in items if i != anchor]
        pos = random.choice(positive_pool) if positive_pool else anchor

        while True:
            neg = np.random.choice(self.item_pool)
            if neg not in items_set:
                break

        return anchor, pos, neg


In [18]:
import torch
import torch.nn as nn

class ItemTower(nn.Module):
    def __init__(self, input_dim, embedding_dim=64):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, embedding_dim)
        )

    def forward(self, x):
        return self.model(x)

    def predict_embeddings(self, x):
        emb = self.model(x)
        return emb / emb.norm(dim=1, keepdim=True)


In [19]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

def train_model(model, dataset, features_tensor, loss_fn, optimizer, device, epochs=15,
                eval_fn=None, eval_data=None, eval_every=10):
    dataloader = DataLoader(dataset, batch_size=512, shuffle=True)
    model.to(device)
    features_tensor = features_tensor.to(device)

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        loop = tqdm(dataloader, desc=f"Epoch {epoch}")
        
        for anchor_ids, pos_ids, neg_ids in loop:
            anchor = features_tensor[anchor_ids].to(device)
            pos = features_tensor[pos_ids].to(device)
            neg = features_tensor[neg_ids].to(device)

            anchor_vec = model(anchor)
            pos_vec = model(pos)
            neg_vec = model(neg)

            loss = loss_fn(anchor_vec, pos_vec, neg_vec)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            loop.set_postfix(loss=total_loss / (loop.n + 1))

        print(f"✅ Epoch {epoch}: Total Loss = {total_loss:.4f}")

        if eval_fn is not None and eval_data is not None and epoch % eval_every == 0:
            print("🧪 Ewaluacja po epoce:", epoch)
            model.eval()
            with torch.no_grad():
                item_embeddings = model.predict_embeddings(features_tensor).cpu().numpy()
                metrics = eval_fn(**eval_data, item_embeddings=item_embeddings)
                for metric, value in metrics.items():
                    print(f"{metric}: {value:.4f}")

    return model


In [20]:
# ========= IMPORTY I USTAWIENIA =========
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

from Evaluation import evaluate_model_embeddings

# ========= KONFIGURACJA =========
PCA_PATH = "Movies_clean_zlib.h5"
RATINGS_PATH = "Ratings_clean.parquet"
BATCH_SIZE = 512
EPOCHS = 20
EMBEDDING_DIM = 64
LR = 1e-3
RATING_THRESHOLD = 4.0
MAX_USERS = 10000

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("🔧 Device:", device)

🔧 Device: cuda


In [21]:
# ========= WCZYTANIE FILMÓW (PCA + movieId) =========
df_pca = pd.read_hdf(PCA_PATH)
features_tensor, movie_id_map = prepare_feature_tensor(df_pca)
movie_ids = list(movie_id_map.keys())

In [22]:
# ========= WCZYTANIE OCEN =========
df_movies, df_ratings = load_data(PCA_PATH, RATINGS_PATH)
features_tensor, movie_id_map = prepare_feature_tensor(df_movies)
train_dict, val_dict, test_dict = split_data(df_ratings, movie_id_map, rating_threshold=RATING_THRESHOLD)

In [23]:
# ========= TWORZENIE DATASETU =========
dataset = ItemTripletDataset(
    user_item_dict=train_dict,
    item_pool=list(range(len(movie_ids))),
    max_users=MAX_USERS,
    triplets_per_user=10
)

In [24]:
# ====== INICJALIZACJA MODELU ======
model = ItemTower(input_dim=features_tensor.shape[1], embedding_dim=EMBEDDING_DIM).to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.TripletMarginLoss(margin=0.3)

In [25]:
# ========= TRENING =========
trained_model = train_model(
    model=model,
    dataset=dataset,
    features_tensor=features_tensor,
    loss_fn=loss_fn,
    optimizer=optimizer,
    device=device,
    epochs=EPOCHS,
    eval_fn=evaluate_model_embeddings,
    eval_data={"user_item_dict": val_dict, "k": 10, "similarity": "cosine"},
    eval_every=10
)

Epoch 1: 100%|██████████| 196/196 [00:05<00:00, 37.98it/s, loss=0.0511]


✅ Epoch 1: Total Loss = 9.9644


Epoch 2: 100%|██████████| 196/196 [00:04<00:00, 42.17it/s, loss=0.0251]


✅ Epoch 2: Total Loss = 4.8482


Epoch 3: 100%|██████████| 196/196 [00:04<00:00, 43.39it/s, loss=0.0198]


✅ Epoch 3: Total Loss = 3.8341


Epoch 4: 100%|██████████| 196/196 [00:04<00:00, 43.68it/s, loss=0.0188]


✅ Epoch 4: Total Loss = 3.6456


Epoch 5: 100%|██████████| 196/196 [00:04<00:00, 42.01it/s, loss=0.0171]


✅ Epoch 5: Total Loss = 3.3454


Epoch 6: 100%|██████████| 196/196 [00:04<00:00, 41.78it/s, loss=0.0163]


✅ Epoch 6: Total Loss = 3.1671


Epoch 7: 100%|██████████| 196/196 [00:04<00:00, 45.15it/s, loss=0.0165]


✅ Epoch 7: Total Loss = 3.1731


Epoch 8: 100%|██████████| 196/196 [00:04<00:00, 44.09it/s, loss=0.016] 


✅ Epoch 8: Total Loss = 3.0979


Epoch 9: 100%|██████████| 196/196 [00:04<00:00, 41.93it/s, loss=0.0154]


✅ Epoch 9: Total Loss = 2.9895


Epoch 10: 100%|██████████| 196/196 [00:04<00:00, 43.51it/s, loss=0.015] 


✅ Epoch 10: Total Loss = 2.9236
🧪 Ewaluacja po epoce: 10
Precision@K: 0.0012
Recall@K: 0.0120
MRR: 0.0038


Epoch 11: 100%|██████████| 196/196 [00:04<00:00, 41.35it/s, loss=0.0149]


✅ Epoch 11: Total Loss = 2.9022


Epoch 12: 100%|██████████| 196/196 [00:04<00:00, 43.74it/s, loss=0.015] 


✅ Epoch 12: Total Loss = 2.9044


Epoch 13: 100%|██████████| 196/196 [00:04<00:00, 43.27it/s, loss=0.0146]


✅ Epoch 13: Total Loss = 2.8213


Epoch 14: 100%|██████████| 196/196 [00:04<00:00, 43.93it/s, loss=0.0136]


✅ Epoch 14: Total Loss = 2.6726


Epoch 15: 100%|██████████| 196/196 [00:04<00:00, 43.17it/s, loss=0.0146]


✅ Epoch 15: Total Loss = 2.8356


Epoch 16: 100%|██████████| 196/196 [00:04<00:00, 42.11it/s, loss=0.014] 


✅ Epoch 16: Total Loss = 2.6969


Epoch 17: 100%|██████████| 196/196 [00:04<00:00, 44.07it/s, loss=0.0136]


✅ Epoch 17: Total Loss = 2.6726


Epoch 18: 100%|██████████| 196/196 [00:04<00:00, 44.08it/s, loss=0.0134]


✅ Epoch 18: Total Loss = 2.5980


Epoch 19: 100%|██████████| 196/196 [00:04<00:00, 44.00it/s, loss=0.0135]


✅ Epoch 19: Total Loss = 2.6094


Epoch 20: 100%|██████████| 196/196 [00:04<00:00, 42.32it/s, loss=0.0133]


✅ Epoch 20: Total Loss = 2.5734
🧪 Ewaluacja po epoce: 20
Precision@K: 0.0019
Recall@K: 0.0190
MRR: 0.0069


In [26]:
# ========= ZAPIS MODELU I EMBEDDINGÓW =========
torch.save(trained_model.state_dict(), "item_tower_trained.pt")
np.save("item_embeddings.npy", trained_model.predict_embeddings(features_tensor.to(device)).cpu().numpy())

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

In [ ]:
print("\n🎯 Finalna ewaluacja na testowym:")
final_embeddings = trained_model.predict_embeddings(features_tensor.to(device)).cpu().numpy()
final_metrics = evaluate_model_embeddings(test_dict, final_embeddings, k=10, similarity="cosine")
for k, v in final_metrics.items():
    print(f"{k}: {v:.4f}")
