In [1]:
import sys
import numpy as np
import pandas as pd
sys.path.append('../')

from config.paths import PROCESSED_DATA_PATH, CONFIG_PATH

from utils.data_split import temporal_train_test_split
from utils.metrics import get_top_n, precision_recall_at_k
from utils.config_loader import load_config

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load config and data

In [2]:
# Load config
config = load_config(CONFIG_PATH / "settings.yaml")
model_cfg = config["model"]
model_features_cfg = config["model_features"]
preproc_cfg = config["preprocessing"]
svd_hyperparams = config["svd_hyperparams"]

In [3]:
processed_data_path = PROCESSED_DATA_PATH / "processed_data.parquet"
data = pd.read_parquet(processed_data_path)
df = data.iloc[:int(len(data)*model_cfg['data_sample_fraction'])]
print(f"Using {model_cfg['data_sample_fraction']*100}% of samples ({len(df)} rows)")

Using 25.0% of samples (25098582 rows)


# Split train and test

In [4]:
train_df, test_df = temporal_train_test_split(df, test_size=model_cfg['test_size'])

In [6]:
df

Unnamed: 0,movie_id,customer_id,rating,date
0,1,1488844,3.0,2005-09-06
1,1,822109,5.0,2005-05-13
2,1,885013,4.0,2005-10-19
3,1,30878,4.0,2005-12-26
4,1,823519,3.0,2004-05-03
...,...,...,...,...
25118140,4675,977471,5.0,2005-10-04
25118141,4675,2011464,3.0,2005-11-21
25118142,4675,2057828,2.0,2005-08-30
25118143,4675,1565082,4.0,2004-04-30


# Model experiments

In [7]:
df

Unnamed: 0,movie_id,customer_id,rating,date
0,1,1488844,3.0,2005-09-06
1,1,822109,5.0,2005-05-13
2,1,885013,4.0,2005-10-19
3,1,30878,4.0,2005-12-26
4,1,823519,3.0,2004-05-03
...,...,...,...,...
25118140,4675,977471,5.0,2005-10-04
25118141,4675,2011464,3.0,2005-11-21
25118142,4675,2057828,2.0,2005-08-30
25118143,4675,1565082,4.0,2004-04-30


In [8]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader


# Mapear IDs a índices consecutivos
user2idx = {u: i for i, u in enumerate(df['customer_id'].unique())}
item2idx = {m: i for i, m in enumerate(df['movie_id'].unique())}
df['user_idx'] = df['customer_id'].map(user2idx)
df['item_idx'] = df['movie_id'].map(item2idx)

# Train/test split
train_df, test_df = temporal_train_test_split(df, test_size=0.2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['user_idx'] = df['customer_id'].map(user2idx)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['item_idx'] = df['movie_id'].map(item2idx)


In [9]:
class NCF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=50):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.mlp = nn.Sequential(
            nn.Linear(emb_size * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, user, item):
        user_vec = self.user_emb(user)
        item_vec = self.item_emb(item)
        x = torch.cat([user_vec, item_vec], dim=1)
        return self.mlp(x).squeeze()


In [None]:
class RatingsDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['user_idx'].values, dtype=torch.long)
        self.items = torch.tensor(df['item_idx'].values, dtype=torch.long)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

train_ds = RatingsDataset(train_df)
test_ds = RatingsDataset(test_df)

train_loader = DataLoader(train_ds, batch_size=1024, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=1024)

# Entrenar
model = NCF(len(user2idx), len(item2idx))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

for epoch in range(5):  # Puedes aumentar esto
    model.train()
    for u, i, r in train_loader:
        pred = model(u, i)
        loss = loss_fn(pred, r)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


In [None]:
model.eval()
with torch.no_grad():
    preds, actuals = [], []
    for u, i, r in test_loader:
        pred = model(u, i)
        preds.extend(pred.numpy())
        actuals.extend(r.numpy())

rmse = np.sqrt(np.mean((np.array(preds) - np.array(actuals)) ** 2))
print(f"Test RMSE: {rmse:.4f}")
