In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
from typing import Callable, Tuple
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [2]:
DATA_DIR = "./data"

def read_data_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Reads in data and splits it into training and validation sets with a 75/25 split."""
    
    df = pd.read_csv(os.path.join(DATA_DIR, "train_ratings.csv"))

    # Split sid_pid into sid and pid columns
    df[["sid", "pid"]] = df["sid_pid"].str.split("_", expand=True)
    df = df.drop("sid_pid", axis=1)
    df["sid"] = df["sid"].astype(int)
    df["pid"] = df["pid"].astype(int)
    
    # Split into train and validation dataset
    train_df, valid_df = train_test_split(df, test_size=0.25)
    return train_df, valid_df

# Read wishlist data
def read_tbr_data() -> pd.DataFrame:
    """Reads the to-be-read (wishlist) data."""
    df = pd.read_csv(os.path.join(DATA_DIR, "train_tbr.csv"))
    # No need to split sid_pid since columns are already separate
    df[["sid", "pid"]] = df[["sid", "pid"]].astype(int)
    return df

def evaluate(valid_df: pd.DataFrame, pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray]) -> float:
    """
    Inputs:
        valid_df: Validation data, returned from read_data_df for example.
        pred_fn: Function that takes in arrays of sid and pid and outputs their rating predictions.

    Outputs: Validation RMSE
    """
    
    preds = pred_fn(valid_df["sid"].values, valid_df["pid"].values)
    return root_mean_squared_error(valid_df["rating"].values, preds)

def evaluate_implicit_model(model, data_loader, device):
    """
    Evaluate the model on the validation set
    """
    model.eval()
    all_preds = []
    all_targets = []
    
    with torch.no_grad():
        for user_ids, item_ids, author_ids, venue_ids, ratings, _ in data_loader:
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            author_ids = author_ids.to(device)
            venue_ids = venue_ids.to(device)
            
            predictions = model(user_ids, item_ids, author_ids, venue_ids)
            
            all_preds.append(predictions.cpu().numpy())
            all_targets.append(ratings.cpu().numpy())
    
    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)
    
    rmse = root_mean_squared_error(all_targets, all_preds)
    return rmse

def predict_ratings(model, user_ids, item_ids, author_ids, venue_ids, device):
    """Function to make prediction for evaluation"""
    model.eval()
    with torch.no_grad():
        user_tensor = torch.from_numpy(user_ids).to(device)
        item_tensor = torch.from_numpy(item_ids).to(device)
        author_tensor = torch.from_numpy(author_ids).to(device)
        venue_tensor = torch.from_numpy(venue_ids).to(device)
        
        predictions = model(user_tensor, item_tensor, author_tensor, venue_tensor).cpu().numpy()
    
    return np.clip(predictions, 1, 5)

class RatingDataset(Dataset):
    def __init__(self, df):
        self.sids = df['sid'].values.astype(np.int64)
        self.pids = df['pid'].values.astype(np.int64)
        self.ratings = df['rating'].values.astype(np.float32)
        
    def __len__(self):
        return len(self.ratings)
        
    def __getitem__(self, idx):
        return self.sids[idx], self.pids[idx], self.ratings[idx]

    
class ImplicitFeedbackDataset(Dataset):
    """Dataset for both explicit ratings and implicit feedback"""
    def __init__(self, explicit_df, implicit_df=None, author_map=None, venue_map=None):
        self.sids = explicit_df['sid'].values.astype(np.int64)
        self.pids = explicit_df['pid'].values.astype(np.int64)
        
        # Get ratings from explicit feedback
        self.ratings = explicit_df['rating'].values.astype(np.float32)
        
        # Get implicit feedback (1 for items in wishlist, 0 otherwise)
        if implicit_df is not None:
            # Create mapping of (sid, pid) to implicit feedback
            implicit_map = {(row['sid'], row['pid']): 1 for _, row in implicit_df.iterrows()}
            
            # Get implicit feedback for each (sid, pid) pair in explicit_df
            self.implicit_feedback = np.array([
                implicit_map.get((sid, pid), 0) 
                for sid, pid in zip(self.sids, self.pids)
            ], dtype=np.float32)
        else:
            # If no implicit feedback, use zeros
            self.implicit_feedback = np.zeros_like(self.ratings)
            
        # Get author and venue information if provided
        if author_map is not None:
            self.author_ids = np.array([author_map.get(pid, 0) for pid in self.pids], dtype=np.int64)
        else:
            self.author_ids = np.zeros_like(self.pids)
            
        if venue_map is not None:
            self.venue_ids = np.array([venue_map.get(pid, 0) for pid in self.pids], dtype=np.int64)
        else:
            self.venue_ids = np.zeros_like(self.pids)
    
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, idx):
        return (
            self.sids[idx], 
            self.pids[idx], 
            self.author_ids[idx], 
            self.venue_ids[idx],
            self.ratings[idx], 
            self.implicit_feedback[idx]
        )

class NeuMF(nn.Module):
    def __init__(self, num_users, num_items, mf_dim=32, mlp_layer_sizes=[64,32,16,8], dropout=0.2):
        super().__init__()
        # GMF embeddings
        self.user_gmf = nn.Embedding(num_users, mf_dim)
        self.item_gmf = nn.Embedding(num_items, mf_dim)
        # MLP embeddings
        self.user_mlp = nn.Embedding(num_users, mlp_layer_sizes[0])
        self.item_mlp = nn.Embedding(num_items, mlp_layer_sizes[0])
        
        # MLP layers
        mlp_layers = []
        in_size = mlp_layer_sizes[0] * 2
        for out_size in mlp_layer_sizes[1:]:
            mlp_layers += [nn.Dropout(dropout), nn.Linear(in_size, out_size), nn.ReLU()]
            in_size = out_size
        self.mlp = nn.Sequential(*mlp_layers)
        
        # Final prediction
        self.predict = nn.Linear(mf_dim + mlp_layer_sizes[-1], 1)
        
        # Initialization
        nn.init.normal_(self.user_gmf.weight, std=0.01)
        nn.init.normal_(self.item_gmf.weight, std=0.01)
        nn.init.normal_(self.user_mlp.weight, std=0.01)
        nn.init.normal_(self.item_mlp.weight, std=0.01)
        for m in self.mlp:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
        nn.init.kaiming_uniform_(self.predict.weight, a=1, nonlinearity='sigmoid')

    def forward(self, u, i):
        gmf = self.user_gmf(u) * self.item_gmf(i)
        mlp = self.mlp(torch.cat([self.user_mlp(u), self.item_mlp(i)], dim=1))
        x = torch.cat([gmf, mlp], dim=1)
        return self.predict(x).squeeze()

class EnhancedNeuMF(nn.Module):
    def __init__(self, num_users, num_items, mf_dim=64, mlp_layer_sizes=[128,64,32], dropout=0.3):
        super().__init__()
        # GMF embeddings
        self.user_gmf = nn.Embedding(num_users, mf_dim)
        self.item_gmf = nn.Embedding(num_items, mf_dim)
        # MLP embeddings
        self.user_mlp = nn.Embedding(num_users, mlp_layer_sizes[0])
        self.item_mlp = nn.Embedding(num_items, mlp_layer_sizes[0])
        
        # Add user and item bias terms
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        
        # Improved MLP with BatchNorm
        mlp_layers = []
        in_size = mlp_layer_sizes[0] * 2
        for out_size in mlp_layer_sizes[1:]:
            mlp_layers += [
                nn.Linear(in_size, out_size),
                nn.BatchNorm1d(out_size),
                nn.LeakyReLU(0.1),
                nn.Dropout(dropout)
            ]
            in_size = out_size
        self.mlp = nn.Sequential(*mlp_layers)
        
        # Final prediction
        self.predict = nn.Linear(mf_dim + mlp_layer_sizes[-1], 1)
        
        # Initialization
        nn.init.normal_(self.user_gmf.weight, std=0.01)
        nn.init.normal_(self.item_gmf.weight, std=0.01)
        nn.init.normal_(self.user_mlp.weight, std=0.01)
        nn.init.normal_(self.item_mlp.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        for m in self.mlp:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
        nn.init.kaiming_uniform_(self.predict.weight, a=1, nonlinearity='sigmoid')

    def forward(self, u, i):
        # GMF path
        gmf = self.user_gmf(u) * self.item_gmf(i)
        # MLP path
        mlp = self.mlp(torch.cat([self.user_mlp(u), self.item_mlp(i)], dim=1))
        # Combine paths
        x = torch.cat([gmf, mlp], dim=1)
        base_pred = self.predict(x).squeeze()
        # Add bias terms
        u_bias = self.user_bias(u).squeeze()
        i_bias = self.item_bias(i).squeeze()
        return base_pred + u_bias + i_bias

class ImplicitEnhancedNeuMF(nn.Module):
    def __init__(
        self, 
        num_users, 
        num_items, 
        num_authors,
        num_venues, 
        embedding_dim=64, 
        mlp_dims=[128, 64, 32], 
        dropout_rate=0.3,
        implicit_weight=0.5  # Weight for implicit feedback contribution
    ):
        super(ImplicitEnhancedNeuMF, self).__init__()
        
        # Explicit feedback pathway
        self.user_gmf = nn.Embedding(num_users, embedding_dim)
        self.item_gmf = nn.Embedding(num_items, embedding_dim)
        self.user_mlp = nn.Embedding(num_users, mlp_dims[0])
        self.item_mlp = nn.Embedding(num_items, mlp_dims[0])
        
        # Bias terms
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        
        # Explicit MLP pathway
        explicit_mlp_layers = []
        in_size = mlp_dims[0] * 2
        for out_size in mlp_dims[1:]:
            explicit_mlp_layers += [
                nn.Linear(in_size, out_size),
                nn.BatchNorm1d(out_size),
                nn.LeakyReLU(0.1),
                nn.Dropout(dropout_rate)
            ]
            in_size = out_size
        self.explicit_mlp = nn.Sequential(*explicit_mlp_layers)
        
        # Final prediction for explicit pathway
        self.explicit_predict = nn.Linear(embedding_dim + mlp_dims[-1], 1)
        
        # Implicit feedback pathway
        self.user_implicit_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_implicit_embedding = nn.Embedding(num_items, embedding_dim)
        
        # Author and venue embeddings for metadata
        self.author_embedding = nn.Embedding(num_authors, embedding_dim)
        self.venue_embedding = nn.Embedding(num_venues, embedding_dim)
        
        # MLP for implicit feedback with metadata
        implicit_input_dim = embedding_dim * 4  # User, item, author, venue
        implicit_mlp_layers = []
        in_size = implicit_input_dim
        for out_size in mlp_dims:
            implicit_mlp_layers += [
                nn.Linear(in_size, out_size),
                nn.BatchNorm1d(out_size),
                nn.LeakyReLU(0.1),
                nn.Dropout(dropout_rate)
            ]
            in_size = out_size
        implicit_mlp_layers.append(nn.Linear(in_size, 1))
        self.implicit_mlp = nn.Sequential(*implicit_mlp_layers)
        
        # Integration layer
        self.integration_layer = nn.Linear(2, 1)
        
        # Weight for balancing explicit and implicit signals
        self.implicit_weight = implicit_weight
        
        # Initialization
        self._init_weights()
        
    def _init_weights(self):
        # Initialize embeddings
        for module in [self.user_gmf, self.item_gmf, self.user_mlp, self.item_mlp,
                     self.user_implicit_embedding, self.item_implicit_embedding,
                     self.author_embedding, self.venue_embedding]:
            nn.init.normal_(module.weight, mean=0, std=0.01)
        
        # Initialize bias
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        
        # Initialize linear layers
        for m in self.explicit_mlp:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                
        for i, m in enumerate(self.implicit_mlp):
            if isinstance(m, nn.Linear):
                if i == len(self.implicit_mlp) - 1:  # Output layer
                    nn.init.kaiming_uniform_(m.weight, a=1, nonlinearity='sigmoid')
                else:
                    nn.init.xavier_uniform_(m.weight)
        
        nn.init.xavier_uniform_(self.explicit_predict.weight)
        nn.init.xavier_uniform_(self.integration_layer.weight)
        nn.init.zeros_(self.integration_layer.bias)
        
    def forward(self, user_indices, item_indices, author_indices, venue_indices):
        # Explicit feedback pathway (similar to EnhancedNeuMF)
        gmf = self.user_gmf(user_indices) * self.item_gmf(item_indices)
        mlp = self.explicit_mlp(torch.cat([self.user_mlp(user_indices), self.item_mlp(item_indices)], dim=1))
        
        # Combine GMF and MLP for explicit prediction
        explicit_concat = torch.cat([gmf, mlp], dim=1)
        explicit_score = self.explicit_predict(explicit_concat).squeeze()
        
        # Add bias terms to explicit score
        u_bias = self.user_bias(user_indices).squeeze()
        i_bias = self.item_bias(item_indices).squeeze()
        explicit_score = explicit_score + u_bias + i_bias
        
        # Implicit feedback pathway
        user_implicit = self.user_implicit_embedding(user_indices)
        item_implicit = self.item_implicit_embedding(item_indices)
        author_embed = self.author_embedding(author_indices)
        venue_embed = self.venue_embedding(venue_indices)
        
        # Process implicit feedback with metadata
        implicit_concat = torch.cat([user_implicit, item_implicit, author_embed, venue_embed], dim=1)
        implicit_score = self.implicit_mlp(implicit_concat).squeeze()
        
        # Combine explicit and implicit scores
        combined_scores = torch.cat([
            explicit_score.unsqueeze(1), 
            implicit_score.unsqueeze(1)
        ], dim=1)
        
        final_score = self.integration_layer(combined_scores).squeeze()
        
        return torch.clamp(final_score, 1.0, 5.0)
    
    def predict(self, user_indices, item_indices, author_indices, venue_indices):
        return self.forward(user_indices, item_indices, author_indices, venue_indices)


class AttentionNeuMF(nn.Module):
    def __init__(self, num_users, num_items, mf_dim=64, mlp_layer_sizes=[128,64,32], dropout=0.3):
        super().__init__()
        # GMF embeddings
        self.user_gmf = nn.Embedding(num_users, mf_dim)
        self.item_gmf = nn.Embedding(num_items, mf_dim)
        # MLP embeddings
        self.user_mlp = nn.Embedding(num_users, mlp_layer_sizes[0])
        self.item_mlp = nn.Embedding(num_items, mlp_layer_sizes[0])
        
        # Add user and item bias terms
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        
        # Improved MLP with BatchNorm
        mlp_layers = []
        in_size = mlp_layer_sizes[0] * 2
        for out_size in mlp_layer_sizes[1:]:
            mlp_layers += [
                nn.Linear(in_size, out_size),
                nn.BatchNorm1d(out_size),
                nn.LeakyReLU(0.1),
                nn.Dropout(dropout)
            ]
            in_size = out_size
        self.mlp = nn.Sequential(*mlp_layers)
        
        # Attention mechanism
        self.attention = nn.Linear(mf_dim + mlp_layer_sizes[-1], 2)
        
        # Final prediction
        self.predict = nn.Linear(mf_dim + mlp_layer_sizes[-1], 1)
        
        # Initialize weights
        self._init_weights()
        
    def _init_weights(self):
        nn.init.normal_(self.user_gmf.weight, std=0.01)
        nn.init.normal_(self.item_gmf.weight, std=0.01)
        nn.init.normal_(self.user_mlp.weight, std=0.01)
        nn.init.normal_(self.item_mlp.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        nn.init.xavier_uniform_(self.attention.weight)
        nn.init.kaiming_uniform_(self.predict.weight, a=1, nonlinearity='sigmoid')
        for m in self.mlp:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)

    def forward(self, u, i):
        # GMF path
        gmf = self.user_gmf(u) * self.item_gmf(i)
        # MLP path
        mlp = self.mlp(torch.cat([self.user_mlp(u), self.item_mlp(i)], dim=1))
        
        # Concatenate for attention
        concat = torch.cat([gmf, mlp], dim=1)
        
        # Attention weights
        att_weights = F.softmax(self.attention(concat), dim=1)
        
        # Apply attention weights
        weighted_gmf = gmf * att_weights[:, 0:1]
        weighted_mlp = mlp * att_weights[:, 1:2]
        
        # Combine for prediction
        combined = torch.cat([weighted_gmf, weighted_mlp], dim=1)
        
        # Get prediction with bias terms
        base_pred = self.predict(combined).squeeze()
        u_bias = self.user_bias(u).squeeze()
        i_bias = self.item_bias(i).squeeze()
        
        return base_pred + u_bias + i_bias

class ImprovedNeuMF(nn.Module):
    def __init__(
        self,
        num_users: int,
        num_items: int,
        mf_dim: int = 64,
        mlp_dim: int = 128,
        mlp_layers: list = [128, 64, 32],
        dropout: float = 0.3
    ):
        super().__init__()
        # Separate embeddings for MF and MLP paths (from NeuMFConcat)
        self.user_emb_mf = nn.Embedding(num_users, mf_dim)
        self.item_emb_mf = nn.Embedding(num_items, mf_dim)
        self.user_emb_mlp = nn.Embedding(num_users, mlp_dim)
        self.item_emb_mlp = nn.Embedding(num_items, mlp_dim)
        
        # Bias terms (from EnhancedNeuMF)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        
        # MLP tower with BatchNorm and LeakyReLU
        layers = []
        in_size = mlp_dim * 2
        for out_size in mlp_layers:
            layers += [
                nn.Linear(in_size, out_size),
                nn.BatchNorm1d(out_size),
                nn.LeakyReLU(0.1),
                nn.Dropout(dropout)
            ]
            in_size = out_size
        self.mlp = nn.Sequential(*layers)
        
        # Final fusion layer
        self.fusion = nn.Linear(mf_dim + mlp_layers[-1], 1)
        
        # Proper initialization
        self._init_weights()
    
    def _init_weights(self):
        # Initialize embeddings
        nn.init.normal_(self.user_emb_mf.weight, std=0.01)
        nn.init.normal_(self.item_emb_mf.weight, std=0.01)
        nn.init.normal_(self.user_emb_mlp.weight, std=0.01)
        nn.init.normal_(self.item_emb_mlp.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        # Initialize MLP and fusion layers
        for m in self.mlp:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
        nn.init.kaiming_uniform_(self.fusion.weight, a=0.01, nonlinearity='sigmoid')
    
    def forward(self, user_idx, item_idx):
        # MF path
        mf_u = self.user_emb_mf(user_idx)
        mf_i = self.item_emb_mf(item_idx)
        mf_vector = mf_u * mf_i  # element-wise multiplication
        
        # MLP path
        mlp_u = self.user_emb_mlp(user_idx)
        mlp_i = self.item_emb_mlp(item_idx)
        mlp_vector = torch.cat([mlp_u, mlp_i], dim=1)
        mlp_vector = self.mlp(mlp_vector)
        
        # Fusion
        concat = torch.cat([mf_vector, mlp_vector], dim=1)
        prediction = self.fusion(concat).squeeze()
        
        # Add bias terms
        u_bias = self.user_bias(user_idx).squeeze()
        i_bias = self.item_bias(item_idx).squeeze()
        
        return prediction + u_bias + i_bias

class NeuMFRegressor(nn.Module):
    def __init__(self, num_users, num_items, mf_dim=32, mlp_sizes=[128,64,32],
                 gm_dropout=0.2, mlp_dropout=0.3, min_rating=1.0, max_rating=5.0):
        super().__init__()
        # GMF branch
        self.user_gmf      = nn.Embedding(num_users, mf_dim)
        self.item_gmf      = nn.Embedding(num_items, mf_dim)
        self.gmf_dropout   = nn.Dropout(gm_dropout)
        # MLP branch
        self.user_mlp      = nn.Embedding(num_users, mlp_sizes[0])
        self.item_mlp      = nn.Embedding(num_items, mlp_sizes[0])
        layers = []
        in_dim = mlp_sizes[0]*2
        for d in mlp_sizes[1:]:
            layers += [nn.Linear(in_dim, d),
                       nn.LayerNorm(d),
                       nn.LeakyReLU(0.1),
                       nn.Dropout(mlp_dropout)]
            in_dim = d
        self.mlp = nn.Sequential(*layers)
        # fusion gate, final predict, biases
        self.alpha         = nn.Parameter(torch.tensor(0.5))
        self.predict       = nn.Linear(mf_dim + mlp_sizes[-1], 1)
        self.user_bias     = nn.Embedding(num_users, 1)
        self.item_bias     = nn.Embedding(num_items, 1)
        self.global_bias   = nn.Parameter(torch.zeros(1))
        self.min_rating    = min_rating
        self.max_rating    = max_rating

        # Xavier init
        for emb in [self.user_gmf, self.item_gmf,
                    self.user_mlp, self.item_mlp]:
            nn.init.xavier_uniform_(emb.weight)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        nn.init.xavier_uniform_(self.predict.weight)

    def forward(self, u, i):
        g = self.gmf_dropout(self.user_gmf(u) * self.item_gmf(i))
        m = self.mlp(torch.cat([self.user_mlp(u), self.item_mlp(i)], 1))
        x = torch.cat([self.alpha*g, (1-self.alpha)*m], 1)
        base = self.predict(x).squeeze()
        out  = base \
             + self.user_bias(u).squeeze() \
             + self.item_bias(i).squeeze() \
             + self.global_bias
        return torch.clamp(out, self.min_rating, self.max_rating)


def preprocess_data(df):
    """Apply user-specific normalization to ratings"""
    # Get global mean
    global_mean = df['rating'].mean()
    print(f"Global mean rating: {global_mean:.4f}")
    
    # Get user biases (average rating deviation from global mean)
    user_biases = df.groupby('sid')['rating'].mean() - global_mean
    
    # Create a copy to avoid modifying the original dataframe
    df_norm = df.copy()
    
    # Normalize ratings by user bias
    def normalize_rating(row):
        user_id = row['sid']
        return row['rating'] - user_biases.get(user_id, 0)
    
    df_norm['rating'] = df_norm.apply(normalize_rating, axis=1)
    
    return df_norm, user_biases, global_mean

def prepare_implicit_feedback(ratings_df, tbr_df, threshold=3.5):
    """
    Prepare implicit feedback from both ratings and TBR datasets
    
    Args:
        ratings_df: DataFrame with explicit ratings
        tbr_df: DataFrame with to-be-read items
        threshold: Rating threshold to consider as positive feedback (default: 3.5)
        
    Returns:
        DataFrame with user_id, item_id, and implicit feedback
    """
    # Get positive implicit feedback from high ratings
    positive_ratings = ratings_df[ratings_df['rating'] >= threshold][['sid', 'pid']]
    positive_ratings['implicit'] = 1
    
    # Get implicit feedback from TBR items
    tbr_implicit = tbr_df[['sid', 'pid']].copy()
    tbr_implicit['implicit'] = 1
    
    # Combine both sources and remove duplicates
    implicit_feedback = pd.concat([positive_ratings, tbr_implicit]).drop_duplicates(['sid', 'pid'])
    
    return implicit_feedback

def get_metadata_maps(ratings_df):
    """
    Create author and venue mappings from item IDs
    This is a placeholder - in a real scenario, you'd load actual metadata
    
    Returns:
        author_map, venue_map: Dictionaries mapping item IDs to author/venue IDs
    """
    # In a real scenario, you'd load actual metadata
    # Here we're just creating placeholders based on item IDs
    items = ratings_df['pid'].unique()
    
    # Assign random author and venue IDs
    # In a real scenario, these would be actual mappings
    author_map = {pid: np.random.randint(1, 100) for pid in items}
    venue_map = {pid: np.random.randint(1, 50) for pid in items}
    
    return author_map, venue_map

def train(model, train_df, valid_df, loader, optimizer, criterion, device, epochs=20):
    best_rmse = float('inf')
    
    for epoch in range(1, epochs+1):
        # Training step
        model.train()
        total_loss = 0
        
        for sids, pids, ratings in loader:
            sids, pids, ratings = sids.to(device), pids.to(device), ratings.to(device)
            optimizer.zero_grad()
            preds = model(sids, pids)
            loss = criterion(preds, ratings)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * len(ratings)
            
        # Create prediction function for evaluation
        def pred_fn(s, p):
            model.eval()
            with torch.no_grad():
                preds = model(
                    torch.from_numpy(s).to(device), 
                    torch.from_numpy(p).to(device)
                ).detach().cpu().numpy()
            return np.clip(preds, 1, 5)
        
        # Evaluate on both train and validation sets
        train_rmse = evaluate(train_df, pred_fn)
        valid_rmse = evaluate(valid_df, pred_fn)
        
        # Learning rate scheduling (if you have a scheduler)
        # scheduler.step(valid_rmse)
        
        print(f"Epoch {epoch:02d} — Train RMSE: {train_rmse:.4f}, Valid RMSE: {valid_rmse:.4f}")
        
        if valid_rmse < best_rmse:
            best_rmse = valid_rmse
            torch.save(model.state_dict(), 'best_ncf.pth')
    
    print(f"\nBest Val RMSE: {best_rmse:.4f}")
    return best_rmse

def train_enhanced(model, train_df, valid_df, loader, optimizer, criterion, device, 
                  epochs=20, patience=5, clip_norm=1.0):
    best_rmse = float('inf')
    early_stop_counter = 0
    
    # Learning rate scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-6
    )
    
    for epoch in range(1, epochs+1):
        # Training step
        model.train()
        total_loss = 0
        
        for sids, pids, ratings in loader:
            sids, pids, ratings = sids.to(device), pids.to(device), ratings.to(device)
            optimizer.zero_grad()
            preds = model(sids, pids)
            loss = criterion(preds, ratings)
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)
            
            optimizer.step()
            total_loss += loss.item() * len(ratings)
            
        # Create prediction function for evaluation
        def pred_fn(s, p):
            model.eval()
            with torch.no_grad():
                preds = model(
                    torch.from_numpy(s).to(device), 
                    torch.from_numpy(p).to(device)
                ).detach().cpu().numpy()
            return np.clip(preds, 1, 5)
        
        # Evaluate on both train and validation sets
        train_rmse = evaluate(train_df, pred_fn)
        valid_rmse = evaluate(valid_df, pred_fn)
        
        # Learning rate scheduling
        scheduler.step(valid_rmse)
        
        print(f"Epoch {epoch:02d} — Train RMSE: {train_rmse:.4f}, Valid RMSE: {valid_rmse:.4f}, "
              f"LR: {optimizer.param_groups[0]['lr']:.6f}")
        
        if valid_rmse < best_rmse:
            best_rmse = valid_rmse
            early_stop_counter = 0
            torch.save(model.state_dict(), 'best_ncf.pth')
        else:
            early_stop_counter += 1
            
        # Early stopping
        if early_stop_counter >= patience:
            print(f"Early stopping triggered after {epoch} epochs")
            break
    
    print(f"\nBest Val RMSE: {best_rmse:.4f}")
    return best_rmse

def train_implicit_model(model, train_loader, valid_loader, optimizer, device, 
                        explicit_criterion=nn.MSELoss(), 
                        implicit_criterion=nn.BCEWithLogitsLoss(),
                        epochs=20, patience=5):
    """
    Training function for the implicit feedback enhanced model
    """
    best_rmse = float('inf')
    early_stop_counter = 0
    
    # Learning rate scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-6
    )
    
    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0
        num_batches = 0
        
        for user_ids, item_ids, author_ids, venue_ids, ratings, implicit_feedback in train_loader:
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            author_ids = author_ids.to(device)
            venue_ids = venue_ids.to(device)
            ratings = ratings.to(device)
            implicit_feedback = implicit_feedback.to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            predictions = model(user_ids, item_ids, author_ids, venue_ids)
            
            # Calculate explicit loss (MSE for ratings)
            explicit_loss = explicit_criterion(predictions, ratings)
            
            # Overall loss is primarily MSE rating loss
            loss = explicit_loss
            
            # Backpropagation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            total_loss += loss.item()
            num_batches += 1
        
        # Validation
        model.eval()
        val_rmse = evaluate_implicit_model(model, valid_loader, device)
        
        # Update learning rate
        scheduler.step(val_rmse)
        
        # Print progress
        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch}/{epochs} - Train Loss: {avg_loss:.4f}, Val RMSE: {val_rmse:.4f}, "
              f"LR: {optimizer.param_groups[0]['lr']:.6f}")
        
        # Check for improvement
        if val_rmse < best_rmse:
            best_rmse = val_rmse
            early_stop_counter = 0
            torch.save(model.state_dict(), 'best_implicit_model.pth')
            print(f"Model improved - saving checkpoint (RMSE: {best_rmse:.4f})")
        else:
            early_stop_counter += 1
            print(f"No improvement for {early_stop_counter} epochs")
        
        # Early stopping
        if early_stop_counter >= patience:
            print(f"Early stopping triggered after {epoch} epochs")
            break
    
    # Load best model
    model.load_state_dict(torch.load('best_implicit_model.pth'))
    return model, best_rmse

def create_ensemble(num_users, num_items, device):
    """Create an ensemble of models with different configurations"""
    models = []
    
    # Config 1: Default with more capacity
    model1 = EnhancedNeuMF(
        num_users, num_items, 
        mf_dim=64, 
        mlp_layer_sizes=[128, 64, 32], 
        dropout=0.3
    ).to(device)
    
    # Config 2: Smaller with less dropout
    model2 = EnhancedNeuMF(
        num_users, num_items, 
        mf_dim=32, 
        mlp_layer_sizes=[64, 32, 16], 
        dropout=0.2
    ).to(device)
    
    # Config 3: Attention-based model
    model3 = AttentionNeuMF(
        num_users, num_items,
        mf_dim=64,
        mlp_layer_sizes=[128, 64, 32],
        dropout=0.3
    ).to(device)
    
    return [model1, model2, model3]

def ensemble_predict(models, sids, pids, device):
    """Make predictions with an ensemble of models"""
    predictions = []
    
    for model in models:
        model.eval()
        with torch.no_grad():
            s_tensor = torch.from_numpy(sids).to(device) 
            p_tensor = torch.from_numpy(pids).to(device)
            preds = model(s_tensor, p_tensor).detach().cpu().numpy()
        predictions.append(preds)
        
    # Average predictions
    ensemble_preds = np.mean(predictions, axis=0)
    return np.clip(ensemble_preds, 1, 5)

In [3]:
# Set random seed for reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [4]:
# Load ratings data
train_df, valid_df = read_data_df()

In [6]:
# Load tbr data
tbr_data = read_tbr_data()

In [6]:
# Apply preprocessing
train_df_norm, user_biases, global_mean = preprocess_data(train_df)
valid_df_norm = valid_df.copy()
valid_df_norm['rating'] = valid_df_norm.apply(
    lambda row: row['rating'] - user_biases.get(row['sid'], 0), 
    axis=1
)

# Determine number of users and items
num_users = train_df['sid'].max() + 1
num_items = train_df['pid'].max() + 1
print(f"Num users: {num_users}, Num items: {num_items}")

# Prepare data loader with normalized data
train_loader = DataLoader(
    RatingDataset(train_df_norm), 
    batch_size=1024, 
    shuffle=True, 
    num_workers=4
)

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Global mean rating: 3.8179
Num users: 10000, Num items: 1000
Using device: cpu


In [None]:
# Use the enhanced model
model = EnhancedNeuMF(num_users, num_items).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-4)
criterion = nn.MSELoss()

# Train with the enhanced training function
best_rmse = train_enhanced(
    model=model,
    train_df=train_df_norm,
    valid_df=valid_df_norm,
    loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=20,
    patience=5
)

Epoch 01 — Train RMSE: 0.9083, Valid RMSE: 0.9246, LR: 0.000500
Epoch 02 — Train RMSE: 0.8742, Valid RMSE: 0.8956, LR: 0.000500
Epoch 03 — Train RMSE: 0.8457, Valid RMSE: 0.8806, LR: 0.000500
Epoch 04 — Train RMSE: 0.8222, Valid RMSE: 0.8748, LR: 0.000500
Epoch 05 — Train RMSE: 0.7979, Valid RMSE: 0.8710, LR: 0.000500
Epoch 06 — Train RMSE: 0.7726, Valid RMSE: 0.8696, LR: 0.000500
Epoch 07 — Train RMSE: 0.7460, Valid RMSE: 0.8707, LR: 0.000500
Epoch 08 — Train RMSE: 0.7171, Valid RMSE: 0.8733, LR: 0.000500
Epoch 09 — Train RMSE: 0.6878, Valid RMSE: 0.8786, LR: 0.000500
Epoch 10 — Train RMSE: 0.6550, Valid RMSE: 0.8850, LR: 0.000250
Epoch 11 — Train RMSE: 0.6279, Valid RMSE: 0.8889, LR: 0.000250
Early stopping triggered after 11 epochs

Best Val RMSE: 0.8696


In [None]:
model = NeuMFRegressor(num_users, num_items).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-4)
criterion = nn.MSELoss()

best_rmse = train_enhanced(
    model=model,
    train_df=train_df_norm,
    valid_df=valid_df_norm,
    loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=30,
    patience=5
)

Epoch 01 — Train RMSE: 0.8773, Valid RMSE: 0.8931, LR: 0.000500
Epoch 02 — Train RMSE: 0.8653, Valid RMSE: 0.8883, LR: 0.000500
Epoch 03 — Train RMSE: 0.8364, Valid RMSE: 0.8806, LR: 0.000500
Epoch 04 — Train RMSE: 0.7805, Valid RMSE: 0.8730, LR: 0.000500
Epoch 05 — Train RMSE: 0.7258, Valid RMSE: 0.8805, LR: 0.000500
Epoch 06 — Train RMSE: 0.6917, Valid RMSE: 0.8916, LR: 0.000500
Epoch 07 — Train RMSE: 0.6689, Valid RMSE: 0.9014, LR: 0.000500


: 

In [8]:
class GMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.output_layer = nn.Linear(embedding_dim, 1)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        
        # Initialize weights
        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        nn.init.kaiming_uniform_(self.output_layer.weight)
        
    def forward(self, user_indices, item_indices):
        user_embed = self.user_embedding(user_indices)
        item_embed = self.item_embedding(item_indices)
        element_product = user_embed * item_embed
        
        prediction = self.output_layer(element_product).squeeze()
        prediction += self.user_bias(user_indices).squeeze()
        prediction += self.item_bias(item_indices).squeeze()
        
        return torch.clamp(prediction, 1.0, 5.0)

class MLP(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64, 
                 layers=[128, 64, 32], dropout=0.3):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        # MLP layers
        self.mlp_layers = []
        layer_sizes = [embedding_dim * 2] + layers
        
        mlp_modules = []
        for i in range(len(layer_sizes)-1):
            mlp_modules.append(nn.Linear(layer_sizes[i], layer_sizes[i+1]))
            mlp_modules.append(nn.BatchNorm1d(layer_sizes[i+1]))
            mlp_modules.append(nn.LeakyReLU(0.1))
            mlp_modules.append(nn.Dropout(dropout))
            
        self.mlp_layers = nn.Sequential(*mlp_modules)
        self.output_layer = nn.Linear(layer_sizes[-1], 1)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        
        # Initialize weights
        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        
        for m in self.mlp_layers:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
        nn.init.kaiming_uniform_(self.output_layer.weight)
        
    def forward(self, user_indices, item_indices):
        user_embed = self.user_embedding(user_indices)
        item_embed = self.item_embedding(item_indices)
        
        vector = torch.cat([user_embed, item_embed], dim=-1)
        mlp_output = self.mlp_layers(vector)
        
        prediction = self.output_layer(mlp_output).squeeze()
        prediction += self.user_bias(user_indices).squeeze()
        prediction += self.item_bias(item_indices).squeeze()
        
        return torch.clamp(prediction, 1.0, 5.0)
    
class NeuMFPretrainedFusion(nn.Module):
    def __init__(self, gmf_model, mlp_model, alpha=0.5):
        super().__init__()
        # GMF embeddings and output layers (copied from pretrained)
        self.gmf_user_embedding = gmf_model.user_embedding
        self.gmf_item_embedding = gmf_model.item_embedding
        self.gmf_output = gmf_model.output_layer
        
        # MLP embeddings and layers (copied from pretrained)
        self.mlp_user_embedding = mlp_model.user_embedding
        self.mlp_item_embedding = mlp_model.item_embedding
        self.mlp_layers = mlp_model.mlp_layers
        self.mlp_output = mlp_model.output_layer
        
        # Bias terms
        self.user_bias = gmf_model.user_bias
        self.item_bias = gmf_model.item_bias
        
        # Fusion parameter (trainable or fixed)
        self.alpha = nn.Parameter(torch.tensor(alpha)) if isinstance(alpha, float) else alpha
        
    def forward(self, user_indices, item_indices):
        # GMF path
        gmf_user_embed = self.gmf_user_embedding(user_indices)
        gmf_item_embed = self.gmf_item_embedding(user_indices)
        gmf_vector = gmf_user_embed * gmf_item_embed
        gmf_pred = self.gmf_output(gmf_vector)
        
        # MLP path
        mlp_user_embed = self.mlp_user_embedding(user_indices)
        mlp_item_embed = self.mlp_item_embedding(item_indices)
        mlp_vector = torch.cat([mlp_user_embed, mlp_item_embed], dim=-1)
        mlp_vector = self.mlp_layers(mlp_vector)
        mlp_pred = self.mlp_output(mlp_vector)
        
        # Combine predictions with alpha weighting
        prediction = self.alpha * gmf_pred + (1 - self.alpha) * mlp_pred
        prediction = prediction.squeeze()
        
        # Add bias terms
        prediction += self.user_bias(user_indices).squeeze()
        prediction += self.item_bias(item_indices).squeeze()
        
        return torch.clamp(prediction, 1.0, 5.0)
    
def train_classic_ncf(num_users, num_items, train_loader, 
                      train_df, valid_df, device, criterion):
    print("Step 1: Training GMF model...")
    gmf_model = GMF(num_users, num_items).to(device)
    gmf_optimizer = torch.optim.Adam(gmf_model.parameters(), lr=1e-3)
    
    train_enhanced(
        model=gmf_model,
        train_df=train_df,
        valid_df=valid_df,
        loader=train_loader,
        optimizer=gmf_optimizer,
        criterion=criterion,
        device=device,
        epochs=20,
        patience=3
    )
    
    print("\nStep 2: Training MLP model...")
    mlp_model = MLP(num_users, num_items).to(device)
    mlp_optimizer = torch.optim.Adam(mlp_model.parameters(), lr=1e-3)
    
    train_enhanced(
        model=mlp_model,
        train_df=train_df,
        valid_df=valid_df,
        loader=train_loader,
        optimizer=mlp_optimizer,
        criterion=criterion,
        device=device,
        epochs=20,
        patience=3
    )
    
    # Try different alpha values or make it learnable
    print("\nStep 3: Fine-tuning combined model...")
    best_alpha = 0.5
    best_rmse = float('inf')
    
    # Option 1: Grid search alpha
    for alpha in [0.3, 0.5, 0.7]:
        print(f"Testing alpha = {alpha}")
        fusion_model = NeuMFPretrainedFusion(
            gmf_model, mlp_model, alpha=alpha).to(device)
        
        # Freeze embedding weights and only train output layers 
        for param in fusion_model.gmf_user_embedding.parameters():
            param.requires_grad = False
        for param in fusion_model.gmf_item_embedding.parameters():
            param.requires_grad = False
        for param in fusion_model.mlp_user_embedding.parameters():
            param.requires_grad = False
        for param in fusion_model.mlp_item_embedding.parameters():
            param.requires_grad = False
        
        fusion_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, fusion_model.parameters()), 
            lr=5e-4)
        
        rmse = train_enhanced(
            model=fusion_model,
            train_df=train_df,
            valid_df=valid_df,
            loader=train_loader,
            optimizer=fusion_optimizer,
            criterion=criterion,
            device=device,
            epochs=10,
            patience=3
        )
        
        if rmse < best_rmse:
            best_rmse = rmse
            best_alpha = alpha
    
    # Option 2: Learnable alpha
    print("\nFinal model with learnable alpha...")
    fusion_model = NeuMFPretrainedFusion(
        gmf_model, mlp_model, alpha=best_alpha).to(device)
    
    # Unfreeze everything for final training
    fusion_optimizer = torch.optim.Adam(fusion_model.parameters(), lr=1e-4)
    
    final_rmse = train_enhanced(
        model=fusion_model,
        train_df=train_df,
        valid_df=valid_df,
        loader=train_loader,
        optimizer=fusion_optimizer,
        criterion=criterion,
        device=device,
        epochs=15,
        patience=5
    )
    
    return fusion_model, final_rmse

In [7]:
# Prepare implicit feedback data
implicit_data = prepare_implicit_feedback(train_df, tbr_data, threshold=4.0)

# Get metadata maps (placeholder function - would use real metadata in production)
author_map, venue_map = get_metadata_maps(train_df)

# Determine dimensions
num_users = max(train_df['sid'].max(), tbr_data['sid'].max()) + 1
num_items = max(train_df['pid'].max(), tbr_data['pid'].max()) + 1
num_authors = max(author_map.values()) + 1
num_venues = max(venue_map.values()) + 1

# Print some info
print(f"Dataset dimensions:")
print(f"  Users: {num_users}")
print(f"  Items: {num_items}")
print(f"  Authors: {num_authors}")
print(f"  Venues: {num_venues}")
print(f"  Training ratings: {len(train_df)}")
print(f"  Validation ratings: {len(valid_df)}")
print(f"  TBR (wishlist) items: {len(tbr_data)}")
print(f"  Combined implicit feedback points: {len(implicit_data)}")

# Create datasets
train_dataset = ImplicitFeedbackDataset(
    train_df, 
    implicit_data, 
    author_map, 
    venue_map
)

valid_dataset = ImplicitFeedbackDataset(
    valid_df,
    implicit_data,
    author_map,
    venue_map
)

# Create data loaders
batch_size = 1024
train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size, 
    shuffle=True, 
    num_workers=4
)

valid_loader = DataLoader(
    valid_dataset, 
    batch_size=batch_size, 
    shuffle=False, 
    num_workers=4
)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize model
model = ImplicitEnhancedNeuMF(
    num_users=num_users,
    num_items=num_items,
    num_authors=num_authors,
    num_venues=num_venues,
    embedding_dim=64,
    mlp_dims=[128, 64, 32],
    dropout_rate=0.3,
    implicit_weight=0.5
).to(device)

# Setup optimizer and loss functions
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
explicit_criterion = nn.MSELoss()
implicit_criterion = nn.BCEWithLogitsLoss()

Dataset dimensions:
  Users: 10000
  Items: 1000
  Authors: 100
  Venues: 50
  Training ratings: 846140
  Validation ratings: 282047
  TBR (wishlist) items: 328839
  Combined implicit feedback points: 880622
Using device: cpu


In [8]:
# Train model
model, best_rmse = train_implicit_model(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer,
    device=device,
    explicit_criterion=explicit_criterion,
    implicit_criterion=implicit_criterion,
    epochs=20,
    patience=5
)

print(f"Training complete! Best RMSE: {best_rmse:.4f}")

Epoch 1/20 - Train Loss: 2.8684, Val RMSE: 1.3985, LR: 0.001000
Model improved - saving checkpoint (RMSE: 1.3985)
Epoch 2/20 - Train Loss: 1.7543, Val RMSE: 1.1522, LR: 0.001000
Model improved - saving checkpoint (RMSE: 1.1522)
Epoch 3/20 - Train Loss: 0.9190, Val RMSE: 1.1103, LR: 0.001000
Model improved - saving checkpoint (RMSE: 1.1103)
Epoch 4/20 - Train Loss: 0.4465, Val RMSE: 0.9542, LR: 0.001000
Model improved - saving checkpoint (RMSE: 0.9542)
Epoch 5/20 - Train Loss: 0.3090, Val RMSE: 0.9831, LR: 0.001000
No improvement for 1 epochs
Epoch 6/20 - Train Loss: 0.2627, Val RMSE: 1.0113, LR: 0.001000
No improvement for 2 epochs
Epoch 7/20 - Train Loss: 0.2300, Val RMSE: 1.0375, LR: 0.001000
No improvement for 3 epochs
Epoch 8/20 - Train Loss: 0.2048, Val RMSE: 1.0594, LR: 0.000500
No improvement for 4 epochs
Epoch 9/20 - Train Loss: 0.1559, Val RMSE: 1.0845, LR: 0.000500
No improvement for 5 epochs
Early stopping triggered after 9 epochs
Training complete! Best RMSE: 0.9542


  model.load_state_dict(torch.load('best_implicit_model.pth'))


In [None]:
models = create_ensemble(num_users, num_items, device)
        
# Train each model
for i, model in enumerate(models):
    print(f"\nTraining model {i+1}/{len(models)}")
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-4)
    train_enhanced(
        model=model,
        train_df=train_df_norm,
        valid_df=valid_df_norm,
        loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        epochs=15,
        patience=3
    )

# Create ensemble prediction function
def ensemble_pred_fn(sids, pids):
    return ensemble_predict(models, sids, pids, device)

# Evaluate ensemble
ensemble_rmse = evaluate(valid_df, ensemble_pred_fn)
print(f"Ensemble RMSE: {ensemble_rmse:.4f}")