## Imports

In [None]:
from typing import Tuple, Callable
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import os

## Set seed for reproducibility

In [2]:
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# 1. Helper Functions

In [None]:
# DATA_DIR = "/cluster/courses/cil/collaborative_filtering/data"
DATA_DIR = "./data"


def read_data_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Reads in data and splits it into training and validation sets with a 75/25 split."""
    
    df = pd.read_csv(os.path.join(DATA_DIR, "train_ratings.csv"))

    # Split sid_pid into sid and pid columns
    df[["sid", "pid"]] = df["sid_pid"].str.split("_", expand=True)
    df = df.drop("sid_pid", axis=1)
    df["sid"] = df["sid"].astype(int)
    df["pid"] = df["pid"].astype(int)
    
    # Split into train and validation dataset
    train_df, valid_df = train_test_split(df, test_size=0.25, random_state=SEED, stratify=df["sid"])
    return train_df, valid_df


def evaluate(valid_df: pd.DataFrame, pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray]) -> float:
    """
    Inputs:
        valid_df: Validation data, returned from read_data_df for example.
        pred_fn: Function that takes in arrays of sid and pid and outputs their rating predictions.

    Outputs: Validation RMSE
    """
    
    preds = pred_fn(valid_df["sid"].values, valid_df["pid"].values)
    return root_mean_squared_error(valid_df["rating"].values, preds)


def make_submission(pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray], filename: os.PathLike):
    """Makes a submission CSV file that can be submitted to kaggle.

    Inputs:
        pred_fn: Function that takes in arrays of sid and pid and outputs a score.
        filename: File to save the submission to.
    """
    
    df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

    # Get sids and pids
    sid_pid = df["sid_pid"].str.split("_", expand=True)
    sids = sid_pid[0]
    pids = sid_pid[1]
    sids = sids.astype(int).values
    pids = pids.astype(int).values
    
    df["rating"] = pred_fn(sids, pids)
    df.to_csv(filename, index=False)

# 2. Load Data

In [4]:
train_df, valid_df = read_data_df()

In [5]:
num_users = train_df['sid'].max() + 1
num_items = train_df['pid'].max() + 1

global_mean = train_df['rating'].mean()
train_df['rating'] = train_df['rating'] - global_mean  # Center the ratings
valid_df['rating'] = valid_df['rating'] - global_mean  # Center validation ratings too

print(f"Global rating mean: {global_mean:.4f}")

def evaluate(valid_df: pd.DataFrame, pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray]) -> float:
    preds = pred_fn(valid_df["sid"].values, valid_df["pid"].values)
    # Add global mean to ratings for comparison with predictions
    actual_ratings = valid_df["rating"].values + global_mean
    return root_mean_squared_error(actual_ratings, preds)

Global rating mean: 3.8174


# 3. Create Data Loaders

In [6]:
class RatingsDataset(Dataset):
    def __init__(self, df):
        self.users   = torch.LongTensor(df['sid'].values)
        self.items   = torch.LongTensor(df['pid'].values)
        self.ratings = torch.FloatTensor(df['rating'].values)
    def __len__(self):
        return len(self.ratings)
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

train_ds = RatingsDataset(train_df)
valid_ds = RatingsDataset(valid_df)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=1024)

# 5. NCF Model (MLP-only)

In [27]:
class NCF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=32, mlp_layers=[64,32,16], dropout=0.2):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        
        mlp_input = emb_size * 2
        layers = []
        for units in mlp_layers:
            layers += [
                nn.Linear(mlp_input, units),
                nn.ReLU(),
                nn.Dropout(dropout)
            ]
            mlp_input = units
        self.mlp = nn.Sequential(*layers)
        self.out = nn.Linear(mlp_input, 1)
        
    def forward(self, user_idx, item_idx):
        u = self.user_emb(user_idx)
        v = self.item_emb(item_idx)
        x = torch.cat([u, v], dim=-1)
        x = self.mlp(x)
        return self.out(x).squeeze()

In [7]:
class NCF2(nn.Module):
    def __init__(
        self,
        num_users: int,
        num_items: int,
        emb_size: int = 32,
        mlp_layers: list = [64, 32, 16],
        dropout: float = 0.2
    ):
        super().__init__()
        # embeddings
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # bias terms
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)

        # MLP tower with BatchNorm
        layers = []
        input_dim = emb_size * 2
        for units in mlp_layers:
            layers += [
                nn.Linear(input_dim, units),
                nn.BatchNorm1d(units),
                nn.ReLU(),
                nn.Dropout(dropout)
            ]
            input_dim = units
        self.mlp = nn.Sequential(*layers)

        # final output
        self.out = nn.Linear(input_dim, 1)

    def forward(self, user_idx, item_idx):
        u = self.user_emb(user_idx)
        v = self.item_emb(item_idx)
        x = torch.cat([u, v], dim=1)
        x = self.mlp(x)
        base = self.out(x).squeeze()
        # bias terms
        b_u = self.user_bias(user_idx).squeeze()
        b_i = self.item_bias(item_idx).squeeze()
        return base + b_u + b_i

In [None]:
# GMF tower
class GMF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=32):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        self.out = nn.Linear(emb_size, 1)

    def forward(self, user_idx, item_idx):
        u = self.user_emb(user_idx)
        v = self.item_emb(item_idx)
        x = u * v                       # element-wise product
        base = self.out(x).squeeze()
        b_u = self.user_bias(user_idx).squeeze()
        b_i = self.item_bias(item_idx).squeeze()
        return base + b_u + b_i

# Fusion by weighted sum of predictions
class FusedNeuMF(nn.Module):
    def __init__(self, gmf: GMF, mlp: NCF2, alpha: float = 0.5):
        super().__init__()
        self.gmf = gmf
        self.mlp = mlp
        self.alpha = alpha

    def forward(self, user_idx, item_idx):
        pred_gmf = self.gmf(user_idx, item_idx)
        pred_mlp = self.mlp(user_idx, item_idx)
        return self.alpha * pred_gmf + (1 - self.alpha) * pred_mlp


In [8]:
# Full NeuMF with concat-fusion
class NeuMFConcat(nn.Module):
    def __init__(
        self,
        num_users: int,
        num_items: int,
        mf_emb_size: int = 32,
        mlp_emb_size: int = 32,
        mlp_layers: list = [64, 32, 16],
        dropout: float = 0.2
    ):
        super().__init__()
        # — MF (GMF) embeddings —
        self.user_emb_mf = nn.Embedding(num_users, mf_emb_size)
        self.item_emb_mf = nn.Embedding(num_items, mf_emb_size)
        # — MLP embeddings (can be same size or different) —
        self.user_emb_mlp = nn.Embedding(num_users, mlp_emb_size)
        self.item_emb_mlp = nn.Embedding(num_items, mlp_emb_size)

        # — MLP tower (over concatenated mlp_emb) —
        layers = []
        in_dim = mlp_emb_size * 2
        for h in mlp_layers:
            layers += [
                nn.Linear(in_dim, h),
                nn.BatchNorm1d(h),
                nn.ReLU(),
                nn.Dropout(dropout)
            ]
            in_dim = h
        self.mlp = nn.Sequential(*layers)

        # — Final fusion layer —
        # input dim = mf_emb_size + last-mlp-layer
        self.fusion = nn.Linear(mf_emb_size + in_dim, 1)

    def forward(self, u, i):
        # MF path
        u_mf = self.user_emb_mf(u)
        i_mf = self.item_emb_mf(i)
        mf_vec = u_mf * i_mf                  # elementwise

        # MLP path
        u_ml = self.user_emb_mlp(u)
        i_ml = self.item_emb_mlp(i)
        mlp_vec = torch.cat([u_ml, i_ml], dim=1)
        mlp_vec = self.mlp(mlp_vec)

        # fuse
        z = torch.cat([mf_vec, mlp_vec], dim=1)
        return self.fusion(z).squeeze()


# 6. Setup device, model, loss, optimizer

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# instantiate towers
gmf_model = GMF(num_users, num_items, emb_size=32).to(device)
mlp_model = NCF2(num_users, num_items, emb_size=32).to(device)

# fused model
model = FusedNeuMF(gmf_model, mlp_model, alpha=0.5).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3)

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = NeuMFConcat(
    num_users=num_users,
    num_items=num_items,
    mf_emb_size=32,
    mlp_emb_size=32,
    mlp_layers=[64,32,16],
    dropout=0.2
).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3)

# 7. Training loop with validation

In [10]:
def pred_fn(sids: np.ndarray, pids: np.ndarray) -> np.ndarray:
    model.eval()
    u_idx = torch.LongTensor(sids).to(device)
    i_idx = torch.LongTensor(pids).to(device)
    with torch.no_grad():
        raw_pred = model(u_idx, i_idx).cpu().numpy()
    # add global mean back
    raw_pred += global_mean
    # clamp to valid range
    return np.clip(raw_pred, train_df["rating"].min(), train_df["rating"].max())

In [36]:
best_valid_rmse = float('inf')
patience = 10  # Number of epochs to wait for improvement
epochs_without_improve = 0
best_model_state = None

for epoch in range(1, 51): 
    # Training step
    model.train()
    total_loss = 0
    num_batches = 0
    
    for u, i, r in train_loader:
        u, i, r = u.to(device), i.to(device), r.to(device)
        optimizer.zero_grad()
        pred = model(u, i)
        loss = criterion(pred, r)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    
    avg_loss = total_loss / num_batches
    
    # Evaluation
    train_rmse = evaluate(train_df, pred_fn)
    valid_rmse = evaluate(valid_df, pred_fn)
    
    # Learning rate scheduling
    scheduler.step(valid_rmse)
    
    # Early stopping check
    if valid_rmse < best_valid_rmse:
        best_valid_rmse = valid_rmse
        epochs_without_improve = 0
        best_model_state = model.state_dict().copy()
    else:
        epochs_without_improve += 1
    
    print(
        f"Epoch {epoch:02d} — "
        f"Train RMSE: {train_rmse:.4f}, "
        f"Valid RMSE: {valid_rmse:.4f}, "
        f"LR: {optimizer.param_groups[0]['lr']:.6f}"
    )
    
    # Early stopping
    if epochs_without_improve >= patience:
        print(f"\nEarly stopping triggered after {epoch} epochs")
        break

# Restore best model
if best_model_state is not None:
    model.load_state_dict(best_model_state)

Epoch 01 — Train RMSE: 0.9074, Valid RMSE: 0.9148, LR: 0.001000
Epoch 02 — Train RMSE: 0.8852, Valid RMSE: 0.8961, LR: 0.001000
Epoch 03 — Train RMSE: 0.8824, Valid RMSE: 0.8946, LR: 0.001000
Epoch 04 — Train RMSE: 0.8498, Valid RMSE: 0.8774, LR: 0.001000
Epoch 05 — Train RMSE: 0.8344, Valid RMSE: 0.8733, LR: 0.001000
Epoch 06 — Train RMSE: 0.8180, Valid RMSE: 0.8701, LR: 0.001000
Epoch 07 — Train RMSE: 0.8103, Valid RMSE: 0.8714, LR: 0.001000
Epoch 08 — Train RMSE: 0.8029, Valid RMSE: 0.8714, LR: 0.001000
Epoch 09 — Train RMSE: 0.7940, Valid RMSE: 0.8723, LR: 0.001000
Epoch 10 — Train RMSE: 0.7860, Valid RMSE: 0.8742, LR: 0.000100
Epoch 11 — Train RMSE: 0.7745, Valid RMSE: 0.8790, LR: 0.000100
Epoch 12 — Train RMSE: 0.7697, Valid RMSE: 0.8812, LR: 0.000100
Epoch 13 — Train RMSE: 0.7674, Valid RMSE: 0.8819, LR: 0.000100
Epoch 14 — Train RMSE: 0.7638, Valid RMSE: 0.8835, LR: 0.000010
Epoch 15 — Train RMSE: 0.7622, Valid RMSE: 0.8848, LR: 0.000010
Epoch 16 — Train RMSE: 0.7635, Valid RMS

# 8. RMSE valuation

In [33]:
rmse = evaluate(valid_df, pred_fn)
print(f"\nValidation RMSE: {rmse:.4f}")


Validation RMSE: 0.8859
