# Classic Neural Matrix Factorization (NeuMF) Training Pipeline

1. First train a Generalized Matrix Factorization (GMF) model
2. Then train a Multi-Layer Perceptron (MLP) model
3. Finally fuse these models together and fine-tune

## Setup Environment and Dependencies

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
from typing import Callable, Tuple
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import gc

# Set random seed for reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
torch.backends.cudnn.deterministic = True
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Check device availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


## Helper functions

In [2]:
DATA_DIR = "./data"

def read_data_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Reads in data and splits it into training and validation sets with a 75/25 split."""
    
    df = pd.read_csv(os.path.join(DATA_DIR, "train_ratings.csv"))

    # Split sid_pid into sid and pid columns
    df[["sid", "pid"]] = df["sid_pid"].str.split("_", expand=True)
    df = df.drop("sid_pid", axis=1)
    df["sid"] = df["sid"].astype(int)
    df["pid"] = df["pid"].astype(int)
    
    # Split into train and validation dataset
    train_df, valid_df = train_test_split(df, test_size=0.25)
    return train_df, valid_df

def evaluate(valid_df: pd.DataFrame, pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray]) -> float:
    """
    Inputs:
        valid_df: Validation data, returned from read_data_df for example.
        pred_fn: Function that takes in arrays of sid and pid and outputs their rating predictions.

    Outputs: Validation RMSE
    """
    
    preds = pred_fn(valid_df["sid"].values, valid_df["pid"].values)
    return root_mean_squared_error(valid_df["rating"].values, preds)

## Dataset class for PyTorch's DataLoader

In [3]:
class RatingDataset(Dataset):
    def __init__(self, df):
        self.sids = df['sid'].values.astype(np.int64)
        self.pids = df['pid'].values.astype(np.int64)
        self.ratings = df['rating'].values.astype(np.float32)
        
    def __len__(self):
        return len(self.ratings)
        
    def __getitem__(self, idx):
        return self.sids[idx], self.pids[idx], self.ratings[idx]

## Model architectures and training functions

In [4]:
class GMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.output_layer = nn.Linear(embedding_dim, 1)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        
        # Initialize weights
        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        nn.init.kaiming_uniform_(self.output_layer.weight)
        
    def forward(self, user_indices, item_indices):
        user_embed = self.user_embedding(user_indices)
        item_embed = self.item_embedding(item_indices)
        element_product = user_embed * item_embed
        
        prediction = self.output_layer(element_product).squeeze()
        prediction += self.user_bias(user_indices).squeeze()
        prediction += self.item_bias(item_indices).squeeze()
        
        return torch.clamp(prediction, 1.0, 5.0)

class MLP(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64, 
                 layers=[128, 64, 32], dropout=0.3):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        # MLP layers
        self.mlp_layers = []
        layer_sizes = [embedding_dim * 2] + layers
        
        mlp_modules = []
        for i in range(len(layer_sizes)-1):
            mlp_modules.append(nn.Linear(layer_sizes[i], layer_sizes[i+1]))
            mlp_modules.append(nn.BatchNorm1d(layer_sizes[i+1]))
            mlp_modules.append(nn.LeakyReLU(0.1))
            mlp_modules.append(nn.Dropout(dropout))
            
        self.mlp_layers = nn.Sequential(*mlp_modules)
        self.output_layer = nn.Linear(layer_sizes[-1], 1)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        
        # Initialize weights
        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        
        for m in self.mlp_layers:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
        nn.init.kaiming_uniform_(self.output_layer.weight)
        
    def forward(self, user_indices, item_indices):
        user_embed = self.user_embedding(user_indices)
        item_embed = self.item_embedding(item_indices)
        
        vector = torch.cat([user_embed, item_embed], dim=-1)
        mlp_output = self.mlp_layers(vector)
        
        prediction = self.output_layer(mlp_output).squeeze()
        prediction += self.user_bias(user_indices).squeeze()
        prediction += self.item_bias(item_indices).squeeze()
        
        return torch.clamp(prediction, 1.0, 5.0)
    
class NeuMFPretrainedFusion(nn.Module):
    def __init__(self, gmf_model, mlp_model, alpha=0.5):
        super().__init__()
        # GMF embeddings and output layers (copied from pretrained)
        self.gmf_user_embedding = gmf_model.user_embedding
        self.gmf_item_embedding = gmf_model.item_embedding
        self.gmf_output = gmf_model.output_layer
        
        # MLP embeddings and layers (copied from pretrained)
        self.mlp_user_embedding = mlp_model.user_embedding
        self.mlp_item_embedding = mlp_model.item_embedding
        self.mlp_layers = mlp_model.mlp_layers
        self.mlp_output = mlp_model.output_layer
        
        # Bias terms
        self.user_bias = gmf_model.user_bias
        self.item_bias = gmf_model.item_bias
        
        # Fusion parameter (trainable or fixed)
        self.alpha = nn.Parameter(torch.tensor(alpha)) if isinstance(alpha, float) else alpha
        
    def forward(self, user_indices, item_indices):
        # GMF path
        gmf_user_embed = self.gmf_user_embedding(user_indices)
        gmf_item_embed = self.gmf_item_embedding(item_indices)
        gmf_vector = gmf_user_embed * gmf_item_embed
        gmf_pred = self.gmf_output(gmf_vector)
        
        # MLP path
        mlp_user_embed = self.mlp_user_embedding(user_indices)
        mlp_item_embed = self.mlp_item_embedding(item_indices)
        mlp_vector = torch.cat([mlp_user_embed, mlp_item_embed], dim=-1)
        mlp_vector = self.mlp_layers(mlp_vector)
        mlp_pred = self.mlp_output(mlp_vector)
        
        # Combine predictions with alpha weighting
        prediction = self.alpha * gmf_pred + (1 - self.alpha) * mlp_pred
        prediction = prediction.squeeze()
        
        # Add bias terms
        prediction += self.user_bias(user_indices).squeeze()
        prediction += self.item_bias(item_indices).squeeze()
        
        return torch.clamp(prediction, 1.0, 5.0)

In [5]:
def train_enhanced(model, train_df, valid_df, loader, optimizer, criterion, device, 
                  epochs=20, patience=5, clip_norm=1.0):
    best_rmse = float('inf')
    early_stop_counter = 0
    
    # Learning rate scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-6
    )
    
    for epoch in range(1, epochs+1):
        # Training step
        model.train()
        total_loss = 0
        
        for sids, pids, ratings in loader:
            sids, pids, ratings = sids.to(device), pids.to(device), ratings.to(device)
            optimizer.zero_grad()
            preds = model(sids, pids)
            loss = criterion(preds, ratings)
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)
            
            optimizer.step()
            total_loss += loss.item() * len(ratings)
            
        # Create prediction function for evaluation
        def pred_fn(s, p):
            model.eval()
            with torch.no_grad():
                preds = model(
                    torch.from_numpy(s).to(device), 
                    torch.from_numpy(p).to(device)
                ).detach().cpu().numpy()
            return np.clip(preds, 1, 5)
        
        # Evaluate on both train and validation sets
        train_rmse = evaluate(train_df, pred_fn)
        valid_rmse = evaluate(valid_df, pred_fn)
        
        # Learning rate scheduling
        scheduler.step(valid_rmse)
        
        print(f"Epoch {epoch:02d} — Train RMSE: {train_rmse:.4f}, Valid RMSE: {valid_rmse:.4f}, "
              f"LR: {optimizer.param_groups[0]['lr']:.6f}")
        
        if valid_rmse < best_rmse:
            best_rmse = valid_rmse
            early_stop_counter = 0
            torch.save(model.state_dict(), 'best_ncf.pth')
        else:
            early_stop_counter += 1
            
        # Early stopping
        if early_stop_counter >= patience:
            print(f"Early stopping triggered after {epoch} epochs")
            break
    
    print(f"\nBest Val RMSE: {best_rmse:.4f}")
    return best_rmse

In [6]:
def train_classic_ncf(num_users, num_items, train_loader, 
                      train_df, valid_df, device, criterion):
    print("Step 1: Training GMF model...")
    gmf_model = GMF(num_users, num_items, embedding_dim=32).to(device)
    gmf_optimizer = torch.optim.Adam(gmf_model.parameters(), lr=1e-3)
    
    train_enhanced(
        model=gmf_model,
        train_df=train_df,
        valid_df=valid_df,
        loader=train_loader,
        optimizer=gmf_optimizer,
        criterion=criterion,
        device=device,
        epochs=20,
        patience=3
    )

    torch.cuda.empty_cache()
    gc.collect()
    
    print("\nStep 2: Training MLP model...")
    mlp_model = MLP(num_users, num_items, embedding_dim=32, layers=[64, 32, 16]).to(device)
    mlp_optimizer = torch.optim.Adam(mlp_model.parameters(), lr=1e-3)
    
    train_enhanced(
        model=mlp_model,
        train_df=train_df,
        valid_df=valid_df,
        loader=train_loader,
        optimizer=mlp_optimizer,
        criterion=criterion,
        device=device,
        epochs=20,
        patience=3
    )

    torch.cuda.empty_cache()
    gc.collect()
    
    # Try different alpha values or make it learnable
    print("\nStep 3: Fine-tuning combined model...")
    best_alpha = 0.5
    best_rmse = float('inf')
    
    # Option 1: Grid search alpha
    for alpha in [0.3, 0.5, 0.7]:
        print(f"Testing alpha = {alpha}")
        fusion_model = NeuMFPretrainedFusion(
            gmf_model, mlp_model, alpha=alpha).to(device)
        
        # Freeze embedding weights and only train output layers 
        for param in fusion_model.gmf_user_embedding.parameters():
            param.requires_grad = False
        for param in fusion_model.gmf_item_embedding.parameters():
            param.requires_grad = False
        for param in fusion_model.mlp_user_embedding.parameters():
            param.requires_grad = False
        for param in fusion_model.mlp_item_embedding.parameters():
            param.requires_grad = False
        
        fusion_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, fusion_model.parameters()), 
            lr=5e-4)
        
        rmse = train_enhanced(
            model=fusion_model,
            train_df=train_df,
            valid_df=valid_df,
            loader=train_loader,
            optimizer=fusion_optimizer,
            criterion=criterion,
            device=device,
            epochs=10,
            patience=3
        )
        
        if rmse < best_rmse:
            best_rmse = rmse
            best_alpha = alpha
    
    # Option 2: Learnable alpha
    print("\nFinal model with learnable alpha...")
    fusion_model = NeuMFPretrainedFusion(
        gmf_model, mlp_model, alpha=best_alpha).to(device)
    
    # Unfreeze everything for final training
    fusion_optimizer = torch.optim.Adam(fusion_model.parameters(), lr=1e-4)
    
    final_rmse = train_enhanced(
        model=fusion_model,
        train_df=train_df,
        valid_df=valid_df,
        loader=train_loader,
        optimizer=fusion_optimizer,
        criterion=criterion,
        device=device,
        epochs=15,
        patience=5
    )
    
    return fusion_model, final_rmse

## Load and Preprocess Data

Now let's load the training and validation data, preprocess it, and prepare the data loaders for training.

In [7]:
# Load data
train_df, valid_df = read_data_df()
print(f"Train set: {len(train_df)} rows, Validation set: {len(valid_df)} rows")
print(f"Sample of training data:\n{train_df.head()}")

# Apply preprocessing
def preprocess_data(df):
    """Apply user-specific normalization to ratings"""
    # Get global mean
    global_mean = df['rating'].mean()
    print(f"Global mean rating: {global_mean:.4f}")
    
    # Get user biases (average rating deviation from global mean)
    user_biases = df.groupby('sid')['rating'].mean() - global_mean
    
    # Create a copy to avoid modifying the original dataframe
    df_norm = df.copy()
    
    # Normalize ratings by user bias
    def normalize_rating(row):
        user_id = row['sid']
        return row['rating'] - user_biases.get(user_id, 0)
    
    df_norm['rating'] = df_norm.apply(normalize_rating, axis=1)
    
    return df_norm, user_biases, global_mean

# Apply preprocessing
train_df_norm, user_biases, global_mean = preprocess_data(train_df)
valid_df_norm = valid_df.copy()
valid_df_norm['rating'] = valid_df_norm.apply(
    lambda row: row['rating'] - user_biases.get(row['sid'], 0), 
    axis=1
)

# Determine number of users and items
num_users = train_df['sid'].max() + 1
num_items = train_df['pid'].max() + 1
print(f"Num users: {num_users}, Num items: {num_items}")

# Prepare data loader with normalized data
train_loader = DataLoader(
    RatingDataset(train_df_norm), 
    batch_size=256, 
    shuffle=True, 
    num_workers=2
)

Train set: 846140 rows, Validation set: 282047 rows
Sample of training data:
         rating   sid  pid
1039945       3  9135  892
185105        4  1564  291
888332        5  7733  340
866358        4  7529  407
44674         4   368  308
Global mean rating: 3.8179
Num users: 10000, Num items: 1000


## Call the train_classic_ncf Function

In [8]:
# Define loss function
criterion = nn.MSELoss()

# Run the classic NeuMF training pipeline
fusion_model, final_rmse = train_classic_ncf(
    num_users=num_users,
    num_items=num_items,
    train_loader=train_loader,
    train_df=train_df_norm,
    valid_df=valid_df_norm,
    device=device,
    criterion=criterion
)

print(f"Final fusion model RMSE: {final_rmse:.4f}")

Step 1: Training GMF model...
Epoch 01 — Train RMSE: 2.9695, Valid RMSE: 2.9711, LR: 0.001000
Epoch 02 — Train RMSE: 2.9695, Valid RMSE: 2.9711, LR: 0.001000
Epoch 03 — Train RMSE: 2.9695, Valid RMSE: 2.9711, LR: 0.001000
Epoch 04 — Train RMSE: 2.9695, Valid RMSE: 2.9711, LR: 0.001000
Early stopping triggered after 4 epochs

Best Val RMSE: 2.9711

Step 2: Training MLP model...
Epoch 01 — Train RMSE: 0.8912, Valid RMSE: 0.9037, LR: 0.001000
Epoch 02 — Train RMSE: 0.8762, Valid RMSE: 0.8918, LR: 0.001000
Epoch 03 — Train RMSE: 0.8690, Valid RMSE: 0.8871, LR: 0.001000
Epoch 04 — Train RMSE: 0.8545, Valid RMSE: 0.8791, LR: 0.001000
Epoch 05 — Train RMSE: 0.8445, Valid RMSE: 0.8747, LR: 0.001000
Epoch 06 — Train RMSE: 0.8386, Valid RMSE: 0.8731, LR: 0.001000
Epoch 07 — Train RMSE: 0.8326, Valid RMSE: 0.8715, LR: 0.001000
Epoch 08 — Train RMSE: 0.8283, Valid RMSE: 0.8701, LR: 0.001000
Epoch 09 — Train RMSE: 0.8232, Valid RMSE: 0.8693, LR: 0.001000
Epoch 10 — Train RMSE: 0.8178, Valid RMSE: 0

## Evaluate the Final Model

Let's evaluate the final fusion model on the validation set and compare with the best RMSE value.

In [None]:
# Load the best model
fusion_model.load_state_dict(torch.load('best_ncf.pth'))

# Create prediction function
def model_pred_fn(sids, pids):
    fusion_model.eval()
    with torch.no_grad():
        preds = fusion_model(
            torch.tensor(sids, dtype=torch.long).to(device),
            torch.tensor(pids, dtype=torch.long).to(device)
        ).cpu().numpy()
    return preds

# Evaluate on validation set
val_rmse = evaluate(valid_df_norm, model_pred_fn)
print(f"Validation RMSE (normalized ratings): {val_rmse:.4f}")

# Adjust predictions by adding back user biases
def adjusted_pred_fn(sids, pids):
    # Get base predictions
    base_preds = model_pred_fn(sids, pids)
    
    # Add back user biases
    adjusted_preds = np.array([
        pred + user_biases.get(sid, 0) 
        for pred, sid in zip(base_preds, sids)
    ])
    
    # Clip to valid rating range
    return np.clip(adjusted_preds, 1, 5)

# Evaluate on validation set with adjusted predictions
val_rmse_adj = evaluate(valid_df, adjusted_pred_fn)
print(f"Validation RMSE (adjusted for user bias): {val_rmse_adj:.4f}")