In [1]:
import sys
import numpy as np
import pandas as pd
sys.path.append('../')

from config.paths import PROCESSED_DATA_PATH, CONFIG_PATH

from utils.data_split import temporal_train_test_split
from utils.metrics import get_top_n, precision_recall_at_k
from utils.config_loader import load_config

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load config and data

In [2]:
# Read the project settings file and pull out the sections this notebook uses.
config = load_config(CONFIG_PATH / "settings.yaml")
model_cfg, model_features_cfg, preproc_cfg, svd_hyperparams = (
    config[section]
    for section in ("model", "model_features", "preprocessing", "svd_hyperparams")
)

In [3]:
# Read the processed parquet file and keep only the configured fraction of rows.
processed_data_path = PROCESSED_DATA_PATH / "processed_data.parquet"
sample_fraction = model_cfg['data_sample_fraction']
df = pd.read_parquet(processed_data_path)
n_rows_kept = int(len(df) * sample_fraction)
# Positional slice: this keeps the FIRST rows in file order, not a random sample.
df = df.iloc[:n_rows_kept]
print(f"Using {sample_fraction*100}% of samples ({len(df)} rows)")

Using 25.0% of samples (25098582 rows)


# Split train and test

In [12]:
# NOTE(review): the temporal split is disabled; the LightGBM and NCF cells below
# build their own splits with sklearn's train_test_split instead.
#train_df, test_df = temporal_train_test_split(df, test_size=model_cfg['test_size'])

# LightGBM experiment

In [13]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error


# Encode customer_id and movie_id as dense integer indices (0..n-1).
user_enc = LabelEncoder()
item_enc = LabelEncoder()
df["user_idx"] = user_enc.fit_transform(df["customer_id"])
df["item_idx"] = item_enc.fit_transform(df["movie_id"])

# Random 80/20 row split into train/test.
# NOTE(review): this is a shuffled split, while the NCF cell uses an unshuffled
# one, so the two experiments are not evaluated on the same rows.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Features and target
feature_cols = ["user_idx", "item_idx"]
y_train = train["rating"]
y_test = test["rating"]

# Cast both frames to categoricals that share ONE category list per column.
# Calling .astype("category") separately on train and test infers the
# categories from each frame, so the same id can end up with a different
# category code whenever an id is missing from one side of the split.
feature_dtypes = {
    "user_idx": pd.CategoricalDtype(categories=np.arange(len(user_enc.classes_))),
    "item_idx": pd.CategoricalDtype(categories=np.arange(len(item_enc.classes_))),
}
X_train = train[feature_cols].astype(feature_dtypes)
X_test = test[feature_cols].astype(feature_dtypes)

# Build the LightGBM datasets; the validation set references the training set
# so that both are constructed with the same feature encoding.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Train the model
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.1,
    "num_leaves": 31
}

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=100,
)

# Prediction and evaluation: report RMSE on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.4f}")

RMSE: 1.0005


---

# NCF Models

In [9]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd
import numpy as np

# ==== HYPERPARAMETERS ====
EMB_SIZE = 32
BATCH_SIZE = 1024
EPOCHS = 10
LR = 0.001
DROPOUT_P = 0.5  # more aggressive dropout
WEIGHT_DECAY = 1e-4  # stronger L2 regularization

# ==== ID MAPPING AND SPLIT ====
from sklearn.model_selection import train_test_split

# Number each distinct customer / movie id in order of first appearance.
unique_users = df['customer_id'].unique()
unique_items = df['movie_id'].unique()
user2idx = dict(zip(unique_users, range(len(unique_users))))
item2idx = dict(zip(unique_items, range(len(unique_items))))

# Overwrites any existing user_idx / item_idx columns on df.
df['user_idx'] = df['customer_id'].map(user2idx)
df['item_idx'] = df['movie_id'].map(item2idx)

# shuffle=False keeps row order: the last 20% of rows become the test set.
train_df, test_df = train_test_split(df, test_size=0.2, shuffle=False)

# ==== DATASET ====
class RatingsDataset(Dataset):
    """Map-style dataset yielding (user index, item index, rating) triples.

    The `user_idx`, `item_idx` and `rating` columns of `df` are copied into
    tensors once at construction time; indexing returns the matching entries.
    """

    def __init__(self, df):
        def index_column(name):
            # Index columns become int64 tensors (embedding lookups need long).
            return torch.tensor(df[name].to_numpy(), dtype=torch.long)

        self.users = index_column('user_idx')
        self.items = index_column('item_idx')
        self.ratings = torch.tensor(df['rating'].to_numpy(), dtype=torch.float32)

    def __len__(self):
        return self.users.shape[0]

    def __getitem__(self, idx):
        sample = (self.users[idx], self.items[idx], self.ratings[idx])
        return sample

# Wrap both splits as datasets, then as batched loaders.
train_ds, test_ds = RatingsDataset(train_df), RatingsDataset(test_df)

# Settings common to both loaders; only the training loader reshuffles.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=0)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_ds, **loader_kwargs)

# ==== MODEL WITH DROPOUT AND REGULARIZATION ====

class NCF(nn.Module):
    """Embedding + MLP rating regressor.

    Each user and item index is looked up in its own embedding table; the two
    vectors are concatenated and passed through a small ReLU/Dropout MLP that
    ends in a single linear output.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        emb_size: width of each embedding vector (defaults to EMB_SIZE).
    """

    def __init__(self, num_users, num_items, emb_size=EMB_SIZE):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.mlp = nn.Sequential(
            nn.Linear(emb_size * 2, 64),  # smaller layers
            nn.ReLU(),
            nn.Dropout(DROPOUT_P),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(DROPOUT_P),
            nn.Linear(32, 1)
        )

    def forward(self, user, item):
        """Return one predicted rating per (user, item) pair, shape (batch,)."""
        user_vec = self.user_emb(user)
        item_vec = self.item_emb(item)
        x = torch.cat([user_vec, item_vec], dim=1)
        # Squeeze only the trailing output dimension. A bare .squeeze() would
        # also drop the batch dimension when a batch holds a single sample,
        # giving a 0-dim prediction that no longer matches the (1,) target.
        return self.mlp(x).squeeze(-1)

# ==== TRAINING ====
# Pick the device BEFORE building the model: the model is moved to `device`
# on construction, so the name must already exist on a fresh kernel.
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

# Build the model, optimizer and loss exactly once.
model = NCF(len(user2idx), len(item2idx)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
loss_fn = nn.MSELoss()

# Early-stopping state: stop after `patience` consecutive epochs without a
# new best validation loss.
best_val_loss = float('inf')
patience = 3
counter = 0

for epoch in range(EPOCHS):
    # ---- one pass over the training data ----
    model.train()
    total_loss = 0
    for u, i, r in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        u, i, r = u.to(device), i.to(device), r.to(device)
        pred = model(u, i)
        loss = loss_fn(pred, r)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by batch size so the epoch average is
        # a true per-sample mean even when the last batch is smaller.
        total_loss += loss.item() * len(r)
    avg_train_loss = total_loss / len(train_loader.dataset)

    # ---- validation pass (dropout disabled, no gradients) ----
    # NOTE(review): test_loader doubles as the validation set here, so the
    # early-stopping decision is made on the same rows reported as "test".
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for u, i, r in test_loader:
            u, i, r = u.to(device), i.to(device), r.to(device)
            pred = model(u, i)
            loss = loss_fn(pred, r)
            val_loss += loss.item() * len(r)
    avg_val_loss = val_loss / len(test_loader.dataset)

    print(f"Epoch {epoch+1}: Train MSE {avg_train_loss:.4f}, Val MSE {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
        torch.save(model.state_dict(), "best_model.pt")  # keep the best weights so far
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping activado")
            break


# ==== EVALUATION ====
# The training loop writes the lowest-validation-loss weights to
# "best_model.pt". Reload them so the reported score belongs to the best
# epoch rather than to whichever epoch happened to run last.
# A finite best_val_loss means a checkpoint was saved during this run.
if best_val_loss < float('inf'):
    model.load_state_dict(torch.load("best_model.pt", map_location=device))

model.eval()
with torch.no_grad():
    total_loss = 0
    for u, i, r in test_loader:
        u, i, r = u.to(device), i.to(device), r.to(device)
        pred = model(u, i)
        loss = loss_fn(pred, r)
        # Batch-size weighting gives a per-sample mean over the whole set.
        total_loss += loss.item() * len(r)
    test_mse = total_loss / len(test_loader.dataset)
    print(f"Test MSE: {test_mse:.4f}")


Usando dispositivo: mps


Epoch 1/10: 100%|██████████| 4903/4903 [02:55<00:00, 27.92it/s]


Epoch 1: Train MSE 1.5517, Val MSE 0.9780


Epoch 2/10: 100%|██████████| 4903/4903 [02:49<00:00, 28.93it/s]


Epoch 2: Train MSE 0.9900, Val MSE 0.9367


Epoch 3/10: 100%|██████████| 4903/4903 [02:50<00:00, 28.76it/s]


Epoch 3: Train MSE 0.9467, Val MSE 0.9216


Epoch 4/10: 100%|██████████| 4903/4903 [02:58<00:00, 27.44it/s]


Epoch 4: Train MSE 0.9292, Val MSE 0.9171


Epoch 5/10: 100%|██████████| 4903/4903 [02:50<00:00, 28.81it/s]


Epoch 5: Train MSE 0.9158, Val MSE 0.9133


Epoch 6/10: 100%|██████████| 4903/4903 [04:20<00:00, 18.81it/s]  


Epoch 6: Train MSE 0.9126, Val MSE 0.9094


Epoch 7/10: 100%|██████████| 4903/4903 [02:48<00:00, 29.09it/s]


Epoch 7: Train MSE 0.9109, Val MSE 0.9077


Epoch 8/10: 100%|██████████| 4903/4903 [02:49<00:00, 28.89it/s]


Epoch 8: Train MSE 0.9093, Val MSE 0.9044


Epoch 9/10: 100%|██████████| 4903/4903 [02:53<00:00, 28.20it/s]


Epoch 9: Train MSE 0.9063, Val MSE 0.8998


Epoch 10/10: 100%|██████████| 4903/4903 [02:51<00:00, 28.63it/s]


Epoch 10: Train MSE 0.9012, Val MSE 0.8960
Test MSE: 0.8960
