In [19]:
import pandas as pd

movies = pd.read_parquet('Movies_clean.parquet')
ratings = pd.read_parquet('ratings_clean.parquet')

print(movies.info())
print(movies.head())

print(ratings.info())
print(ratings.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86477 entries, 0 to 86476
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   vote_average          86477 non-null  float64
 1   vote_count            86477 non-null  int64  
 2   release_date          86477 non-null  int64  
 3   revenue               86477 non-null  int64  
 4   runtime               86477 non-null  int64  
 5   budget                86477 non-null  int64  
 6   original_language     86477 non-null  object 
 7   popularity            86477 non-null  float64
 8   production_companies  86477 non-null  object 
 9   production_countries  86477 non-null  object 
 10  spoken_languages      86477 non-null  object 
 11  keywords              86477 non-null  object 
 12  movieId               86477 non-null  int64  
 13  Directors             86477 non-null  object 
 14  Cast                  86477 non-null  object 
 15  StarActors         

In [20]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
from tqdm import tqdm

# ======= KONFIGURACJA =======
MOVIE_PATH = 'Movies_clean.parquet'
RATINGS_PATH = 'Ratings_clean.parquet'
BATCH_SIZE = 512
EPOCHS = 10
EMBEDDING_DIM = 64
LR = 1e-3
RATING_THRESHOLD = 4.0

# ======= WCZYTANIE FILMÓW I ROZBICIE LIST =======
df_movies = pd.read_parquet(MOVIE_PATH)

# Kolumny zawierające listy embeddingowe (object -> list)
list_columns = [
    'original_language', 'production_companies',
    'production_countries', 'spoken_languages', 'keywords',
    'Directors', 'Cast', 'StarActors'
]

for col in list_columns:
    expanded = df_movies[col].apply(pd.Series)
    expanded.columns = [f"{col}_{i}" for i in expanded.columns]
    df_movies = pd.concat([df_movies.drop(columns=[col]), expanded], axis=1)

# Zachowanie movieId i przygotowanie mapowania
movie_ids = df_movies["movieId"].values
movie_id_map = {id_: idx for idx, id_ in enumerate(movie_ids)}

# Konwersja tylko numerycznych cech do tensora
df_numeric = df_movies.select_dtypes(include=[np.number]).drop(columns=["movieId"])
features_tensor = torch.tensor(df_numeric.values, dtype=torch.float32)
num_items = features_tensor.shape[0]

# ======= WCZYTANIE NOWEGO RATINGS Z SEKWENCJAMI =======
df_seq = pd.read_parquet(RATINGS_PATH, columns=['userId', 'movies_seq', 'ratings_seq'])

# ======= GENERACJA train_dict z ocenami >= 4.0 =======
train_dict = {}
test_dict = {}

for _, row in df_seq.iterrows():
    user = row['userId']
    movie_ids = row['movies_seq']
    ratings = row['ratings_seq']

    filtered_items = [movie_id_map[mid] for mid, r in zip(movie_ids, ratings)
                      if r >= RATING_THRESHOLD and mid in movie_id_map]

    if len(filtered_items) >= 2:
        train_dict[user] = filtered_items[:-1]
        test_dict[user] = filtered_items[-1]  # ostatni film jako cel testowy


# ======= DATASET Z NEGATIVE SAMPLING =======
class ItemTripletDataset(Dataset):
    def __init__(self, user_item_dict, item_pool, max_triplets_per_user=10):
        self.user_item_dict = user_item_dict
        self.item_pool = item_pool
        self.max_triplets_per_user = max_triplets_per_user
        self.triplets = self.generate_triplets()

    def generate_triplets(self):
        triplets = []
        for user, items in tqdm(self.user_item_dict.items(), desc="🔄 Generowanie tripletów"):
            if len(items) < 2:
                continue

            anchors = random.sample(items, min(len(items), self.max_triplets_per_user))
            for anchor in anchors:
                positive_pool = [i for i in items if i != anchor]
                if not positive_pool:
                    continue
                pos = random.choice(positive_pool)

                negative_pool = [i for i in self.item_pool if i not in items]
                if not negative_pool:
                    continue
                neg = random.choice(negative_pool)

                triplets.append((anchor, pos, neg))

        return triplets

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, idx):
        return self.triplets[idx]

# ======= MODEL ITEM TOWER =======
class ItemTower(nn.Module):
    def __init__(self, input_dim, embedding_dim=EMBEDDING_DIM):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, embedding_dim)
        )

    def forward(self, x):
        return self.model(x)

    def predict_embeddings(self, x):
        emb = self.model(x)
        return emb / emb.norm(dim=1, keepdim=True)  # normalizacja

# ======= CUDA SETUP =======
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("🔧 Device:", device)
features_tensor = features_tensor.to(device)

🔧 Device: cuda


In [21]:
# ======= PRZYGOTOWANIE TRENINGU =======
dataset = ItemTripletDataset(train_dict, list(range(num_items)))
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

🔄 Generowanie tripletów:   2%|▏         | 3422/195561 [05:33<5:12:25, 10.25it/s] 


KeyboardInterrupt: 

In [ ]:
# ====== INICJALIZACJA MODELU ======
model = ItemTower(input_dim=features_tensor.shape[1]).to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.TripletMarginLoss(margin=0.3)

In [ ]:
# ======= TRENING =======
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}")
    for anchor_ids, pos_ids, neg_ids in loop:
        anchor = features_tensor[anchor_ids]
        pos = features_tensor[pos_ids]
        neg = features_tensor[neg_ids]

        anchor_vec = model(anchor)
        pos_vec = model(pos)
        neg_vec = model(neg)

        loss = loss_fn(anchor_vec, pos_vec, neg_vec)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=total_loss / (loop.n + 1))

    print(f"✅ Epoch {epoch+1}: Loss = {total_loss:.4f}")

# ======= ZAPISZ MODEL I EMBEDDINGI =======
torch.save(model.state_dict(), "/mnt/data/item_tower_trained.pt")
embeddings = model.predict_embeddings(features_tensor).cpu().detach().numpy()
np.save("/mnt/data/item_embeddings.npy", embeddings)