In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau, CyclicLR, StepLR, CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import os
from typing import Callable, Tuple
from sklearn.model_selection import train_test_split, ParameterGrid, KFold
from sklearn.metrics import root_mean_squared_error

In [None]:
# Fix the random state once, up front, so that re-running the notebook
# starts every seeded library from the same value.
SEED = 42
for _seed_rng in (torch.manual_seed, np.random.seed):
    _seed_rng(SEED)

In [None]:
class RatingDataset(Dataset):
    """Dataset of (sid, pid, rating) triples backed by NumPy arrays.

    The 'sid', 'pid' and 'rating' columns of the given DataFrame are copied
    into typed arrays at construction time (int64 ids, float32 ratings), so
    indexing never touches pandas again.
    """

    def __init__(self, df):
        """Extract the three required columns from ``df``."""

        def column_as(name, dtype):
            # One typed copy of a single DataFrame column.
            return df[name].values.astype(dtype)

        self.sids = column_as('sid', np.int64)
        self.pids = column_as('pid', np.int64)
        self.ratings = column_as('rating', np.float32)

    def __len__(self):
        """Number of rating rows held by the dataset."""
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        """Return the ``(sid, pid, rating)`` triple stored at position ``idx``."""
        sample = (self.sids[idx], self.pids[idx], self.ratings[idx])
        return sample
    
class EnhancedNeuMF(nn.Module):
    """Two-branch neural matrix factorisation model with user and item bias terms.

    A GMF branch multiplies user and item embeddings element-wise; an MLP branch
    feeds the concatenated user and item embeddings through
    Linear -> BatchNorm1d -> activation -> Dropout stages. Both branch outputs are
    concatenated, projected to a single score, and a learned per-user and
    per-item bias is added.

    Args:
        num_users: Number of rows in every user embedding table.
        num_items: Number of rows in every item embedding table.
        mf_dim: Embedding width of the GMF branch.
        mlp_layer_sizes: Sequence of layer widths for the MLP branch. The first
            entry is the width of each MLP embedding (the MLP input is twice
            that); each following entry is the output width of one hidden stage.
        dropout: Dropout probability applied after every MLP stage.
        activation_fn: Either an activation class / zero-argument factory
            (e.g. ``nn.ReLU``) that is instantiated once per stage, or an
            already constructed ``nn.Module`` that is deep-copied per stage.
        leaky_relu_slope: Negative slope used when ``activation_fn`` is the
            class ``nn.LeakyReLU``; ignored for every other activation.

    Raises:
        ValueError: If ``mlp_layer_sizes`` is empty.
    """

    def __init__(self, num_users, num_items, mf_dim=64, mlp_layer_sizes=(128, 64, 32), dropout=0.3,
                 activation_fn=nn.LeakyReLU, leaky_relu_slope=0.1):
        import copy  # only needed to clone activation instances

        super().__init__()
        mlp_layer_sizes = list(mlp_layer_sizes)
        if not mlp_layer_sizes:
            raise ValueError("mlp_layer_sizes must contain at least one layer size")

        def make_activation():
            # Build a fresh activation module for one MLP stage.
            if isinstance(activation_fn, nn.Module):
                # An instance cannot be called without an input; clone it instead.
                return copy.deepcopy(activation_fn)
            if activation_fn is nn.LeakyReLU:
                return activation_fn(negative_slope=leaky_relu_slope)
            return activation_fn()

        # GMF embeddings
        self.user_gmf = nn.Embedding(num_users, mf_dim)
        self.item_gmf = nn.Embedding(num_items, mf_dim)
        # MLP embeddings
        self.user_mlp = nn.Embedding(num_users, mlp_layer_sizes[0])
        self.item_mlp = nn.Embedding(num_items, mlp_layer_sizes[0])

        # User and item bias terms
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)

        # MLP stack with BatchNorm
        mlp_layers = []
        in_size = mlp_layer_sizes[0] * 2
        for out_size in mlp_layer_sizes[1:]:
            mlp_layers += [
                nn.Linear(in_size, out_size),
                nn.BatchNorm1d(out_size),
                make_activation(),
                nn.Dropout(dropout)
            ]
            in_size = out_size
        self.mlp = nn.Sequential(*mlp_layers)

        # Final prediction. `in_size` is the true MLP output width; it equals
        # mlp_layer_sizes[-1] whenever at least one hidden stage exists and the
        # concatenated embedding width when the MLP stack is empty.
        self.predict = nn.Linear(mf_dim + in_size, 1)

        # Initialization
        nn.init.normal_(self.user_gmf.weight, std=0.01)
        nn.init.normal_(self.item_gmf.weight, std=0.01)
        nn.init.normal_(self.user_mlp.weight, std=0.01)
        nn.init.normal_(self.item_mlp.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        for m in self.mlp:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
        nn.init.kaiming_uniform_(self.predict.weight, a=1, nonlinearity='sigmoid')

    def forward(self, u, i):
        """Score each (user, item) pair.

        Args:
            u: 1-D integer tensor of user indices.
            i: 1-D integer tensor of item indices, same length as ``u``.

        Returns:
            1-D tensor with one score per pair.
        """
        # GMF path
        gmf = self.user_gmf(u) * self.item_gmf(i)
        # MLP path
        mlp = self.mlp(torch.cat([self.user_mlp(u), self.item_mlp(i)], dim=1))
        # Combine paths
        x = torch.cat([gmf, mlp], dim=1)
        # squeeze(-1) drops only the trailing singleton dimension, so a batch of
        # one stays 1-D instead of collapsing to a 0-d tensor.
        base_pred = self.predict(x).squeeze(-1)
        # Add bias terms
        u_bias = self.user_bias(u).squeeze(-1)
        i_bias = self.item_bias(i).squeeze(-1)
        return base_pred + u_bias + i_bias

In [None]:
# Directory (relative to the working directory) from which read_data_df loads "train_ratings.csv".
DATA_DIR = "./data"

def read_data_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load ``train_ratings.csv`` from DATA_DIR and return ``(train_df, valid_df)``.

    The combined ``sid_pid`` column is replaced by two integer columns, ``sid``
    and ``pid``. The rows are then divided 75/25 into training and validation
    frames, seeded with SEED and stratified on ``sid``.
    """
    ratings_path = os.path.join(DATA_DIR, "train_ratings.csv")
    ratings = pd.read_csv(ratings_path)

    # "sid_pid" stores both ids joined by "_"; break it into separate integer columns.
    ratings[["sid", "pid"]] = ratings["sid_pid"].str.split("_", expand=True)
    ratings = ratings.drop(columns="sid_pid").astype({"sid": int, "pid": int})

    # Hold out a quarter of the rows for validation.
    splits = train_test_split(ratings, test_size=0.25, random_state=SEED, stratify=ratings["sid"])
    return splits[0], splits[1]

def evaluate(valid_df: pd.DataFrame, pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray]) -> float:
    """
    Score a prediction function against the ratings stored in a DataFrame.

    Inputs:
        valid_df: Frame with "sid", "pid" and "rating" columns, e.g. one of the
            frames returned by read_data_df.
        pred_fn: Callable receiving the sid array and the pid array and
            returning one predicted rating per pair.

    Outputs: RMSE between the stored ratings and the predictions.
    """

    user_ids = valid_df["sid"].values
    item_ids = valid_df["pid"].values
    predictions = pred_fn(user_ids, item_ids)

    targets = valid_df["rating"].values
    return root_mean_squared_error(targets, predictions)

def tune_enhanced_neumf_cv(
    df_norm,               # full normalized ratings DataFrame
    num_users,             # total # of users
    num_items,             # total # of items
    device,
    n_splits=5,
    random_state=42,
    param_grid=None
):
    """K-fold cross-validated grid search for EnhancedNeuMF.

    For every combination in the grid, a fresh model is trained on each of the
    ``n_splits`` folds with AdamW, MSE loss and a ReduceLROnPlateau schedule
    (up to 20 epochs, early-stopping patience 5). The combination with the
    lowest mean fold RMSE wins.

    Args:
        df_norm: Ratings DataFrame with 'sid', 'pid' and 'rating' columns.
        num_users: Size of the user embedding tables.
        num_items: Size of the item embedding tables.
        device: Torch device the models are trained on.
        n_splits: Number of cross-validation folds.
        random_state: Seed for the shuffled KFold split.
        param_grid: Optional dict mapping 'mf_dim', 'mlp_layer_sizes',
            'dropout', 'weight_decay' and 'lr' to lists of candidate values.
            When None, the built-in grid below is used
            (4 * 4 * 4 * 3 * 3 = 576 combinations).

    Returns:
        The parameter dict with the lowest average RMSE, or None if the grid
        yields no combinations.
    """
    # 1) Hyperparameters to tune
    if param_grid is None:
        param_grid = {
            'mf_dim':           [16, 32, 64, 128],
            'mlp_layer_sizes': [[64,32,16], [128,64,32], [128,64,32,16], [256,128,64]],
            'dropout':         [0.1, 0.2, 0.3, 0.4],
            'weight_decay':    [1e-4, 5e-3, 1e-3],
            'lr':              [1e-3, 5e-4, 1e-4],
        }

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    best_avg_rmse = float('inf')
    best_params   = None

    for params in ParameterGrid(param_grid):
        fold_rmses = []
        print(f"Evaluating params: {params}")

        # cross-val loop
        for fold_idx, (train_idx, val_idx) in enumerate(kf.split(df_norm), 1):
            train_df, val_df = df_norm.iloc[train_idx], df_norm.iloc[val_idx]

            # rebuild loader each fold
            train_loader = DataLoader(RatingDataset(train_df), batch_size=256, shuffle=True, num_workers=4)

            # fresh model per fold; the activation and its slope are fixed, not tuned
            model = EnhancedNeuMF(
                num_users=num_users,
                num_items=num_items,
                mf_dim=params['mf_dim'],
                mlp_layer_sizes=params['mlp_layer_sizes'],
                dropout=params['dropout'],
                activation_fn=nn.LeakyReLU,
                leaky_relu_slope=0.2
            ).to(device)

            # optimizer with tuned lr + weight_decay
            optimizer = optim.AdamW(
                model.parameters(),
                lr=params['lr'],
                weight_decay=params['weight_decay']
            )
            criterion = nn.MSELoss()

            # fixed scheduler (passed as a class; train_enhanced instantiates it)
            scheduler = ReduceLROnPlateau

            # train + eval
            rmse = train_enhanced(
                model=model,
                train_df=train_df,
                valid_df=val_df,
                loader=train_loader,
                optimizer=optimizer,
                criterion=criterion,
                device=device,
                scheduler=scheduler,
                scheduler_params={'mode': 'min', 'factor': 0.5, 'patience': 3, 'min_lr': 1e-6},
                epochs=20,
                patience=5
            )
            fold_rmses.append(rmse)
            print(f"  Fold {fold_idx} RMSE: {rmse:.4f}")

        avg_rmse = sum(fold_rmses) / len(fold_rmses)
        print(f" → Avg CV RMSE: {avg_rmse:.4f}\n")

        if avg_rmse < best_avg_rmse:
            best_avg_rmse = avg_rmse
            best_params   = params

    # Report the actual fold count instead of a hard-coded "5".
    print(f"Best {n_splits}-fold CV RMSE: {best_avg_rmse:.4f}")
    print(f"Best params: {best_params}")
    return best_params

def preprocess_data(df):
    """Remove each user's average rating offset from their ratings.

    Args:
        df: DataFrame with at least 'sid' and 'rating' columns. It is not modified.

    Returns:
        A tuple ``(df_norm, user_biases, global_mean)`` where
        ``df_norm`` is a copy of ``df`` whose 'rating' column has the user's
        bias subtracted, ``user_biases`` is a Series indexed by sid holding
        (user mean rating - global mean rating), and ``global_mean`` is the
        mean of all ratings in ``df``.
    """
    # Get global mean
    global_mean = df['rating'].mean()
    print(f"Global mean rating: {global_mean:.4f}")

    # Get user biases (average rating deviation from global mean)
    user_biases = df.groupby('sid')['rating'].mean() - global_mean

    # Create a copy to avoid modifying the original dataframe
    df_norm = df.copy()

    # Look up every row's user bias in one vectorised pass instead of a
    # Python-level call per row; users without a bias entry fall back to 0.
    per_row_bias = df_norm['sid'].map(user_biases).fillna(0)
    df_norm['rating'] = df_norm['rating'] - per_row_bias

    return df_norm, user_biases, global_mean

def train_enhanced(model, train_df, valid_df, loader, optimizer, criterion, device,
                  scheduler, scheduler_params, epochs=20, patience=5, clip_norm=1.0,
                  checkpoint_path='best_ncf.pth'):
    """Train ``model`` with gradient clipping, LR scheduling and early stopping.

    After every epoch the model is scored on ``train_df`` and ``valid_df`` via
    ``evaluate``. Whenever the validation RMSE improves, the model's
    ``state_dict`` is written to ``checkpoint_path``. Training stops once the
    validation RMSE has not improved for ``patience`` consecutive epochs.

    Args:
        model: Module called as ``model(sids, pids)``.
        train_df: DataFrame used for the per-epoch training RMSE.
        valid_df: DataFrame used for the per-epoch validation RMSE.
        loader: Iterable yielding ``(sids, pids, ratings)`` tensor batches.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(preds, ratings)``.
        device: Torch device batches are moved to.
        scheduler: Scheduler class / factory, called as
            ``scheduler(optimizer, **scheduler_params)``.
        scheduler_params: Keyword arguments for ``scheduler``.
        epochs: Maximum number of epochs.
        patience: Early-stopping patience in epochs.
        clip_norm: Max gradient norm passed to ``clip_grad_norm_``.
        checkpoint_path: File the best ``state_dict`` is saved to.

    Returns:
        The lowest validation RMSE observed.
    """
    best_rmse = float('inf')
    early_stop_counter = 0

    # Learning rate scheduler
    scheduler_instance = scheduler(optimizer, **scheduler_params)
    # Only ReduceLROnPlateau consumes a metric in step(); for the other
    # schedulers a positional argument to step() is the (deprecated) epoch
    # index, so they must be stepped without arguments.
    steps_on_metric = isinstance(scheduler_instance, torch.optim.lr_scheduler.ReduceLROnPlateau)

    # Prediction function for evaluation; defined once because it only closes
    # over `model` and `device`.
    def pred_fn(s, p):
        model.eval()
        with torch.no_grad():
            preds = model(
                torch.from_numpy(s).to(device),
                torch.from_numpy(p).to(device)
            ).detach().cpu().numpy()
        # NOTE(review): predictions are clipped to [1, 5] even though this
        # notebook passes user-normalised ratings as targets — confirm that
        # this range is intended for normalised data.
        return np.clip(preds, 1, 5)

    for epoch in range(1, epochs+1):
        # Training step (pred_fn leaves the model in eval mode, so re-enable train mode)
        model.train()

        for sids, pids, ratings in loader:
            sids, pids, ratings = sids.to(device), pids.to(device), ratings.to(device)
            optimizer.zero_grad()
            preds = model(sids, pids)
            loss = criterion(preds, ratings)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)

            optimizer.step()

        # Evaluate on both train and validation sets
        train_rmse = evaluate(train_df, pred_fn)
        valid_rmse = evaluate(valid_df, pred_fn)

        # Learning rate scheduling
        if steps_on_metric:
            scheduler_instance.step(valid_rmse)
        else:
            scheduler_instance.step()

        print(f"Epoch {epoch:02d} — Train RMSE: {train_rmse:.4f}, Valid RMSE: {valid_rmse:.4f}, "
              f"LR: {optimizer.param_groups[0]['lr']:.6f}")

        if valid_rmse < best_rmse:
            best_rmse = valid_rmse
            early_stop_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            early_stop_counter += 1

        # Early stopping
        if early_stop_counter >= patience:
            print(f"Early stopping triggered after {epoch} epochs")
            break

    print(f"\nBest Val RMSE: {best_rmse:.4f}")
    return best_rmse

In [None]:
# Load the ratings and split them into training and validation frames (75/25, see read_data_df)
train_df, valid_df = read_data_df()

In [None]:
# Apply preprocessing: user biases are estimated on the training split only
train_df_norm, user_biases, global_mean = preprocess_data(train_df)

# Subtract the training-split user bias from the validation ratings in one
# vectorised pass; users without a bias entry keep their raw rating.
valid_df_norm = valid_df.copy()
valid_df_norm['rating'] = valid_df['rating'] - valid_df['sid'].map(user_biases).fillna(0)

# Determine number of users and items.
# Size the embedding tables from both splits: an id that occurs only in the
# validation frame would otherwise index past the end of the table.
num_users = int(max(train_df['sid'].max(), valid_df['sid'].max())) + 1
num_items = int(max(train_df['pid'].max(), valid_df['pid'].max())) + 1
print(f"Num users: {num_users}, Num items: {num_items}")

# Prepare data loader with normalized data
train_loader = DataLoader(
    RatingDataset(train_df_norm),
    batch_size=1024,
    shuffle=True,
    num_workers=4
)

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

In [None]:
# Cross-validated grid search on the normalised training split.
# With the built-in grid this trains 576 parameter combinations x 5 folds, each for up to 20 epochs.
best_params = tune_enhanced_neumf_cv(train_df_norm, num_users, num_items, device)

In [None]:
# Rebuild the model with the best hyperparameters found by cross-validation.
# The activation is not part of the tuned grid (best_params has no
# 'activation_fn' key, and `params` exists only inside the tuning function),
# so pass the same fixed activation settings that tune_enhanced_neumf_cv uses.
final_model = EnhancedNeuMF(
    num_users, num_items,
    mf_dim=best_params['mf_dim'],
    mlp_layer_sizes=best_params['mlp_layer_sizes'],
    dropout=best_params['dropout'],
    activation_fn=nn.LeakyReLU,
    leaky_relu_slope=0.2
).to(device)