In [9]:
import sys
import numpy as np
import pandas as pd
sys.path.append('../')

from config.paths import PROCESSED_DATA_PATH, CONFIG_PATH

from utils.data_split import temporal_train_test_split
from utils.metrics import get_top_n, precision_recall_at_k
from utils.config_loader import load_config

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load config and data

In [10]:
# Read the settings file once, then pull out the sections this notebook uses.
config = load_config(CONFIG_PATH / "settings.yaml")
model_cfg, model_features_cfg, preproc_cfg, svd_hyperparams = (
    config[section]
    for section in ("model", "model_features", "preprocessing", "svd_hyperparams")
)

In [11]:
# Load the processed ratings, then keep only the leading fraction of rows.
# NOTE: this is a head slice (first N rows in file order), not a random sample.
processed_data_path = PROCESSED_DATA_PATH / "processed_data.parquet"
df = pd.read_parquet(processed_data_path)
n_rows_kept = int(len(df) * model_cfg['data_sample_fraction'])
df = df.head(n_rows_kept)
print(f"Using {model_cfg['data_sample_fraction']*100}% of samples ({len(df)} rows)")

Using 25.0% of samples (25098582 rows)


# Split train and test

In [12]:
#train_df, test_df = temporal_train_test_split(df, test_size=model_cfg['test_size'])

# Model experiments

In [13]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error


# Encode customer_id and movie_id as consecutive integer indices (0..n-1).
user_enc = LabelEncoder()
item_enc = LabelEncoder()
df["user_idx"] = user_enc.fit_transform(df["customer_id"])
df["item_idx"] = item_enc.fit_transform(df["movie_id"])

# Random (non-temporal) train/test split.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Features and target.
feature_cols = ["user_idx", "item_idx"]
y_train = train["rating"]
y_test = test["rating"]

# Use ONE categorical dtype per feature, built from the full index range, so
# that train and test share identical category sets and integer codes.
# Converting each frame with a bare ``astype("category")`` derives the
# categories from whatever values happen to be present in that frame, which
# yields different codes for the same id in train and test.
# NOTE(review): LightGBM's docs advise treating very high-cardinality
# categorical features as numeric; user/item ids may fall in that regime.
cat_dtypes = {
    "user_idx": pd.CategoricalDtype(np.arange(len(user_enc.classes_))),
    "item_idx": pd.CategoricalDtype(np.arange(len(item_enc.classes_))),
}
X_train = train[feature_cols].astype(cat_dtypes)
X_test = test[feature_cols].astype(cat_dtypes)

# Build the LightGBM datasets. The validation set is tied to the training set
# explicitly so that it is binned with the training set's feature mappers.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Train the model.
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.1,
    "num_leaves": 31
}

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=100,
)

# Predict on the held-out split and report RMSE.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.4f}")

RMSE: 1.0005


# Deep Learning

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader


# Map raw IDs to consecutive indices, numbered in order of first appearance.
unique_users = df['customer_id'].unique()
unique_items = df['movie_id'].unique()
user2idx = dict(zip(unique_users, range(len(unique_users))))
item2idx = dict(zip(unique_items, range(len(unique_items))))

# Overwrites any existing user_idx / item_idx columns on df.
df['user_idx'] = df['customer_id'].map(user2idx)
df['item_idx'] = df['movie_id'].map(item2idx)

# Train/test split using the project's temporal splitter.
train_df, test_df = temporal_train_test_split(df, test_size=0.2)


In [None]:
class NCF(nn.Module):
    """Embedding + MLP rating predictor.

    Each user and each item gets a learned embedding of size ``emb_size``.
    The two embeddings are concatenated and passed through a three-layer MLP
    (``2 * emb_size -> 128 -> 64 -> 1``) with ReLU activations.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        emb_size: Dimensionality of each embedding vector.
    """

    def __init__(self, num_users, num_items, emb_size=50):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.mlp = nn.Sequential(
            nn.Linear(emb_size * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, user, item):
        """Return one predicted rating per (user, item) pair.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, same shape as ``user``.

        Returns:
            Float tensor with the same shape as ``user``.
        """
        user_vec = self.user_emb(user)
        item_vec = self.item_emb(item)
        x = torch.cat([user_vec, item_vec], dim=-1)
        # Drop only the trailing size-1 output dimension. A bare ``squeeze()``
        # would also drop the batch dimension when the batch has one element,
        # returning a 0-d tensor that no longer matches the target's shape.
        return self.mlp(x).squeeze(-1)


In [None]:
class RatingsDataset(Dataset):
    """Tensor-backed dataset of (user index, item index, rating) triples.

    The ``user_idx``, ``item_idx`` and ``rating`` columns of ``df`` are
    converted to tensors once at construction time; indexing returns the three
    values stored at the requested position.
    """

    def __init__(self, df):
        def column(name, dtype):
            # Convert one column of the frame to a tensor of the given dtype.
            return torch.tensor(df[name].values, dtype=dtype)

        self.users = column('user_idx', torch.long)
        self.items = column('item_idx', torch.long)
        self.ratings = column('rating', torch.float32)

    def __len__(self):
        return self.users.shape[0]

    def __getitem__(self, idx):
        fields = (self.users, self.items, self.ratings)
        return tuple(field[idx] for field in fields)

# Wrap the train/test frames as datasets and batch them.
train_ds = RatingsDataset(train_df)
test_ds = RatingsDataset(test_df)

train_loader = DataLoader(train_ds, batch_size=1024, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=1024)

# Train: the embedding tables are sized from the ID-to-index mappings.
model = NCF(len(user2idx), len(item2idx))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

for epoch in range(5):  # You can increase this
    model.train()
    loss_sum, n_seen = 0.0, 0
    for u, i, r in train_loader:
        # Flatten so a trailing batch of one sample still matches r's shape.
        pred = model(u, i).reshape(-1)
        loss = loss_fn(pred, r)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Accumulate a sample-weighted sum so the report is the epoch mean,
        # not just the loss of whichever mini-batch happened to come last.
        loss_sum += loss.item() * r.size(0)
        n_seen += r.size(0)
    print(f"Epoch {epoch+1}, Loss: {loss_sum / n_seen:.4f}")


In [None]:
# Evaluate on the held-out loader: collect per-batch predictions and targets,
# then compute RMSE over all samples.
model.eval()
pred_batches, actual_batches = [], []
with torch.no_grad():
    for u, i, r in test_loader:
        # reshape(-1) keeps every batch 1-D, so a final batch with a single
        # sample (which the model may return as a 0-d tensor) is still
        # collected instead of failing on iteration over a 0-d array.
        pred_batches.append(model(u, i).reshape(-1).numpy())
        actual_batches.append(r.reshape(-1).numpy())

# One concatenation instead of extending a Python list element by element.
# np.concatenate raises ValueError if the loader produced no batches.
preds = np.concatenate(pred_batches)
actuals = np.concatenate(actual_batches)

rmse = np.sqrt(np.mean((preds - actuals) ** 2))
print(f"Test RMSE: {rmse:.4f}")
