In [11]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
from typing import Callable, Tuple
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from itertools import chain

In [12]:
class RatingDataset(Dataset):
    def __init__(self, df):
        self.sids = df['sid'].values.astype(np.int64)
        self.pids = df['pid'].values.astype(np.int64)
        self.ratings = df['rating'].values.astype(np.float32)
        
    def __len__(self):
        return len(self.ratings)
        
    def __getitem__(self, idx):
        return self.sids[idx], self.pids[idx], self.ratings[idx]

    
class ImplicitFeedbackDataset(Dataset):
    """Dataset for both explicit ratings and implicit feedback"""
    def __init__(self, explicit_df, implicit_df=None, author_map=None, venue_map=None):
        self.sids = explicit_df['sid'].values.astype(np.int64)
        self.pids = explicit_df['pid'].values.astype(np.int64)
        
        # Get ratings from explicit feedback
        self.ratings = explicit_df['rating'].values.astype(np.float32)
        
        # Get implicit feedback (1 for items in wishlist, 0 otherwise)
        if implicit_df is not None:
            # Create mapping of (sid, pid) to implicit feedback
            implicit_map = {(row['sid'], row['pid']): 1 for _, row in implicit_df.iterrows()}
            
            # Get implicit feedback for each (sid, pid) pair in explicit_df
            self.implicit_feedback = np.array([
                implicit_map.get((sid, pid), 0) 
                for sid, pid in zip(self.sids, self.pids)
            ], dtype=np.float32)
        else:
            # If no implicit feedback, use zeros
            self.implicit_feedback = np.zeros_like(self.ratings)
            
        # Get author and venue information if provided
        if author_map is not None:
            self.author_ids = np.array([author_map.get(pid, 0) for pid in self.pids], dtype=np.int64)
        else:
            self.author_ids = np.zeros_like(self.pids)
            
        if venue_map is not None:
            self.venue_ids = np.array([venue_map.get(pid, 0) for pid in self.pids], dtype=np.int64)
        else:
            self.venue_ids = np.zeros_like(self.pids)
    
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, idx):
        return (
            self.sids[idx], 
            self.pids[idx], 
            self.author_ids[idx], 
            self.venue_ids[idx],
            self.ratings[idx], 
            self.implicit_feedback[idx]
        )
    
class ImplicitNeuMF(nn.Module):
    def __init__(
        self, 
        num_users, 
        num_items, 
        embedding_dim=64, 
        mlp_dims=[128, 64, 32], 
        dropout_rate=0.3
    ):
        super(ImplicitNeuMF, self).__init__()
        
        # Base embeddings for both pathways
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        # Bias terms
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        self.global_bias = nn.Parameter(torch.zeros(1))
        
        # MLP pathway
        mlp_layers = []
        in_size = embedding_dim * 2
        for out_size in mlp_dims:
            mlp_layers += [
                nn.Linear(in_size, out_size),
                nn.BatchNorm1d(out_size),
                nn.LeakyReLU(0.1),
                nn.Dropout(dropout_rate)
            ]
            in_size = out_size
        self.mlp = nn.Sequential(*mlp_layers)
        
        # TBR-specific pathway (for incorporating wishlist data)
        self.tbr_user_embedding = nn.Embedding(num_users, embedding_dim)
        self.tbr_item_embedding = nn.Embedding(num_items, embedding_dim)
        
        # Final prediction layers
        self.explicit_predictor = nn.Linear(mlp_dims[-1], 1)
        self.tbr_predictor = nn.Linear(embedding_dim * 2, 1)
        
        # Learnable weight for balancing explicit and implicit signals
        self.implicit_weight = nn.Parameter(torch.tensor(0.2))
        
        self._init_weights()
        
    def _init_weights(self):
        # Initialize embeddings
        for module in [self.user_embedding, self.item_embedding, 
                      self.tbr_user_embedding, self.tbr_item_embedding]:
            nn.init.normal_(module.weight, std=0.01)
        
        # Initialize bias
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        
        # Initialize linear layers
        for m in self.mlp:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
        
        nn.init.xavier_uniform_(self.explicit_predictor.weight)
        nn.init.xavier_uniform_(self.tbr_predictor.weight)
    
    def forward(self, user_indices, item_indices, is_eval=False):
        # Base embeddings
        user_embed = self.user_embedding(user_indices)
        item_embed = self.item_embedding(item_indices)
        
        # MLP for explicit ratings
        concat = torch.cat([user_embed, item_embed], dim=1)
        mlp_output = self.mlp(concat)
        
        # Explicit score with bias terms
        explicit_score = self.explicit_predictor(mlp_output).squeeze()
        u_bias = self.user_bias(user_indices).squeeze()
        i_bias = self.item_bias(item_indices).squeeze()
        explicit_score = explicit_score + u_bias + i_bias + self.global_bias
        
        # During evaluation, just use the explicit pathway which should be more reliable
        if is_eval:
            return torch.clamp(explicit_score, 1.0, 5.0)
        
        # TBR pathway for implicit feedback
        tbr_user = self.tbr_user_embedding(user_indices)
        tbr_item = self.tbr_item_embedding(item_indices)
        tbr_concat = torch.cat([tbr_user, tbr_item], dim=1)
        tbr_score = self.tbr_predictor(tbr_concat).squeeze()
        
        # Combine with learned weight
        weight = torch.sigmoid(self.implicit_weight)  # between 0 and 1
        final_score = (1 - weight) * explicit_score + weight * tbr_score
        
        return torch.clamp(final_score, 1.0, 5.0)
    
    def predict(self, user_indices, item_indices):
        return self.forward(user_indices, item_indices, is_eval=True)

In [None]:
DATA_DIR = "./data"

def read_data_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Reads in data and splits it into training and validation sets with a 75/25 split."""
    
    df = pd.read_csv(os.path.join(DATA_DIR, "train_ratings.csv"))

    # Split sid_pid into sid and pid columns
    df[["sid", "pid"]] = df["sid_pid"].str.split("_", expand=True)
    df = df.drop("sid_pid", axis=1)
    df["sid"] = df["sid"].astype(int)
    df["pid"] = df["pid"].astype(int)
    
    # Split into train and validation dataset
    train_df, valid_df = train_test_split(df, test_size=0.25)
    return train_df, valid_df

# Read wishlist data
def read_tbr_data() -> pd.DataFrame:
    """Reads the to-be-read (wishlist) data."""
    df = pd.read_csv(os.path.join(DATA_DIR, "train_tbr.csv"))
    # No need to split sid_pid since columns are already separate
    df[["sid", "pid"]] = df[["sid", "pid"]].astype(int)
    return df

def evaluate_model(model, data_loader, device, use_explicit_only=False):
    """
    Evaluate the model on the validation set
    """
    model.eval()
    all_preds = []
    all_targets = []
    
    with torch.no_grad():
        for user_ids, item_ids, _, _, ratings, _ in data_loader:
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            
            predictions = model(user_ids, item_ids, is_eval=use_explicit_only)
            
            all_preds.append(predictions.cpu().numpy())
            all_targets.append(ratings.cpu().numpy())
    
    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)
    
    rmse = root_mean_squared_error(all_targets, all_preds)
    return rmse

def preprocess_data(df):
    """Apply user-specific normalization to ratings"""
    # Get global mean
    global_mean = df['rating'].mean()
    print(f"Global mean rating: {global_mean:.4f}")
    
    # Get user biases (average rating deviation from global mean)
    user_biases = df.groupby('sid')['rating'].mean() - global_mean
    
    # Create a copy to avoid modifying the original dataframe
    df_norm = df.copy()
    
    # Normalize ratings by user bias
    def normalize_rating(row):
        user_id = row['sid']
        return row['rating'] - user_biases.get(user_id, 0)
    
    df_norm['rating'] = df_norm.apply(normalize_rating, axis=1)
    
    return df_norm, user_biases, global_mean

def prepare_implicit_feedback(ratings_df, tbr_df, threshold=3.5):
    """
    Prepare implicit feedback from both ratings and TBR datasets
    
    Args:
        ratings_df: DataFrame with explicit ratings
        tbr_df: DataFrame with to-be-read items
        threshold: Rating threshold to consider as positive feedback (default: 3.5)
        
    Returns:
        DataFrame with user_id, item_id, and implicit feedback
    """
    # Get positive implicit feedback from high ratings
    positive_ratings = ratings_df[ratings_df['rating'] >= threshold][['sid', 'pid']]
    positive_ratings['implicit'] = 1
    
    # Get implicit feedback from TBR items
    tbr_implicit = tbr_df[['sid', 'pid']].copy()
    tbr_implicit['implicit'] = 1
    
    # Combine both sources and remove duplicates
    implicit_feedback = pd.concat([positive_ratings, tbr_implicit]).drop_duplicates(['sid', 'pid'])
    
    return implicit_feedback

def train_implicit_model(model, train_loader, valid_loader, optimizer, device, 
                         explicit_criterion=nn.MSELoss(),
                         implicit_criterion=nn.BCEWithLogitsLoss(),
                         implicit_weight=0.2,  # Weight for implicit loss
                         epochs=20, patience=5):
    """
    Training with staged approach, incorporating both explicit and implicit losses
    """
    best_rmse = float('inf')
    early_stop_counter = 0
    
    # Stage 1: Train only explicit pathway (5 epochs)
    print("Stage 1: Training explicit pathway only...")
    
    # Create optimizer that only updates explicit pathway parameters
    explicit_params = [
        model.user_embedding.parameters(),
        model.item_embedding.parameters(),
        model.user_bias.parameters(), 
        model.item_bias.parameters(),
        model.mlp.parameters(),
        model.explicit_predictor.parameters()
    ]
    explicit_optimizer = torch.optim.Adam(chain(*explicit_params), lr=0.001)
    
    for epoch in range(1, 6):
        model.train()
        total_loss = 0
        
        for batch in train_loader:
            user_ids, item_ids, _, _, ratings, _ = batch
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            ratings = ratings.to(device)
            
            explicit_optimizer.zero_grad()
            # Pass is_eval=True to only use explicit pathway
            predictions = model(user_ids, item_ids, is_eval=True)
            loss = explicit_criterion(predictions, ratings)
            loss.backward()
            explicit_optimizer.step()
            total_loss += loss.item()
        
        # Evaluate
        val_rmse = evaluate_model(model, valid_loader, device, use_explicit_only=True)
        print(f"Epoch {epoch}/5 (Explicit) - Val RMSE: {val_rmse:.4f}")
    
    # Stage 2: Now train both pathways together
    print("\nStage 2: Fine-tuning with implicit feedback...")
    for epoch in range(6, epochs+1):
        model.train()
        total_loss = 0
        
        for batch in train_loader:
            user_ids, item_ids, _, _, ratings, implicit_fbk = batch
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            ratings = ratings.to(device)
            implicit_fbk = implicit_fbk.to(device)
            
            optimizer.zero_grad()
            
            # Get full model output (includes both explicit and implicit paths)
            predictions = model(user_ids, item_ids)
            
            # Calculate explicit rating loss
            explicit_loss = explicit_criterion(predictions, ratings)
            
            # Extract the implicit predictions
            # We need to access the tbr_predictor output before sigmoid activation
            with torch.no_grad():  # We don't need gradients for this
                user_embed = model.tbr_user_embedding(user_ids)
                item_embed = model.tbr_item_embedding(item_ids)
                tbr_concat = torch.cat([user_embed, item_embed], dim=1)
                implicit_preds_raw = model.tbr_predictor(tbr_concat).squeeze()
            
            # Calculate implicit loss
            implicit_loss = implicit_criterion(implicit_preds_raw, implicit_fbk)
            
            # Combine losses
            loss = explicit_loss + implicit_weight * implicit_loss
            
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        # Evaluate
        val_rmse = evaluate_model(model, valid_loader, device)
        
        print(f"Epoch {epoch}/{epochs} - Val RMSE: {val_rmse:.4f}")
        
        # Check for improvement
        if val_rmse < best_rmse:
            best_rmse = val_rmse
            early_stop_counter = 0
            torch.save(model.state_dict(), 'best_implicit_model.pth')
        else:
            early_stop_counter += 1
        
        # Early stopping
        if early_stop_counter >= patience:
            print(f"Early stopping triggered after {epoch} epochs")
            break
    
    # Load best model
    model.load_state_dict(torch.load('best_implicit_model.pth', weight_only=True))
    return model, best_rmse

def train_implicit_model_enhanced(model, train_loader, valid_loader, optimizer, device, 
                         explicit_criterion=nn.MSELoss(),
                         implicit_criterion=nn.BCEWithLogitsLoss(),
                         implicit_weight=0.2,
                         stage1_epochs=20,      # Increased from 5
                         stage1_patience=5,     # New parameter
                         stage2_epochs=20,      # Total epochs (was 20)
                         stage2_patience=5):
    """
    Enhanced training with early stopping for both stages
    """
    # Stage 1: Train only explicit pathway with early stopping
    print("Stage 1: Training explicit pathway only...")
    
    explicit_params = [
        model.user_embedding.parameters(),
        model.item_embedding.parameters(),
        model.user_bias.parameters(), 
        model.item_bias.parameters(),
        model.mlp.parameters(),
        model.explicit_predictor.parameters()
    ]
    explicit_optimizer = torch.optim.Adam(chain(*explicit_params), lr=0.001)
    
    # Initialize tracking variables for stage 1
    best_stage1_rmse = float('inf')
    stage1_counter = 0
    
    for epoch in range(1, stage1_epochs + 1):
        model.train()
        total_loss = 0
        
        for batch in train_loader:
            user_ids, item_ids, _, _, ratings, _ = batch
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            ratings = ratings.to(device)
            
            explicit_optimizer.zero_grad()
            predictions = model(user_ids, item_ids, is_eval=True)
            loss = explicit_criterion(predictions, ratings)
            loss.backward()
            explicit_optimizer.step()
            total_loss += loss.item()
        
        # Evaluate
        val_rmse = evaluate_model(model, valid_loader, device, use_explicit_only=True)
        print(f"Epoch {epoch}/{stage1_epochs} (Explicit) - Val RMSE: {val_rmse:.4f}")
        
        # Check for improvement
        if val_rmse < best_stage1_rmse:
            best_stage1_rmse = val_rmse
            stage1_counter = 0
            # Save best stage 1 model
            torch.save(model.state_dict(), 'best_stage1_model.pth')
            print(f"  Stage 1 model improved - saving checkpoint")
        else:
            stage1_counter += 1
            print(f"  No improvement for {stage1_counter} epochs")
        
        # Early stopping for stage 1
        if stage1_counter >= stage1_patience:
            print(f"Early stopping stage 1 after {epoch} epochs")
            break
    
    # Load the best model from stage 1
    print("Loading best stage 1 model for fine-tuning...")
    model.load_state_dict(torch.load('best_stage1_model.pth', weight_only=True))
    
    # Initialize tracking variables for stage 2
    best_stage2_rmse = float('inf')
    stage2_counter = 0
    
    # Stage 2: Fine-tune with implicit feedback
    print("\nStage 2: Fine-tuning with implicit feedback...")
    
    # Learning rate scheduler for stage 2
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2, min_lr=1e-6
    )
    
    for epoch in range(1, stage2_epochs + 1):
        model.train()
        total_loss = 0
        
        for batch in train_loader:
            user_ids, item_ids, _, _, ratings, implicit_fbk = batch
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            ratings = ratings.to(device)
            implicit_fbk = implicit_fbk.to(device)
            
            optimizer.zero_grad()
            
            # Full forward pass
            predictions = model(user_ids, item_ids)
            
            # Calculate explicit rating loss
            explicit_loss = explicit_criterion(predictions, ratings)
            
            # Get implicit predictions for loss calculation
            tbr_user = model.tbr_user_embedding(user_ids)
            tbr_item = model.tbr_item_embedding(item_ids)
            tbr_concat = torch.cat([tbr_user, tbr_item], dim=1)
            implicit_preds = model.tbr_predictor(tbr_concat).squeeze()
            
            # Calculate implicit loss
            implicit_loss = implicit_criterion(implicit_preds, implicit_fbk)
            
            # Combined loss
            loss = explicit_loss + implicit_weight * implicit_loss
            
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        # Evaluate
        val_rmse = evaluate_model(model, valid_loader, device)
        
        # Update learning rate
        scheduler.step(val_rmse)
        
        print(f"Epoch {epoch}/{stage2_epochs} - Val RMSE: {val_rmse:.4f}, "
              f"LR: {optimizer.param_groups[0]['lr']:.6f}")
        
        # Check for improvement
        if val_rmse < best_stage2_rmse:
            best_stage2_rmse = val_rmse
            stage2_counter = 0
            torch.save(model.state_dict(), 'best_implicit_model.pth')
            print(f"  Model improved - saving checkpoint")
        else:
            stage2_counter += 1
            print(f"  No improvement for {stage2_counter} epochs")
        
        # Early stopping for stage 2
        if stage2_counter >= stage2_patience:
            print(f"Early stopping stage 2 after {epoch} epochs")
            break
    
    # Load best model
    model.load_state_dict(torch.load('best_implicit_model.pth', weight_only=True))
    return model, min(best_stage1_rmse, best_stage2_rmse)  # Return the best overall RMSE

In [13]:
# Load and prepare data
train_df, valid_df = read_data_df()
tbr_data = read_tbr_data()

# Prepare implicit feedback data
implicit_data = prepare_implicit_feedback(train_df, tbr_data, threshold=4.0)

In [14]:
# Determin dimensions
num_users = max(train_df['sid'].max(), tbr_data['sid'].max()) + 1
num_items = max(train_df['pid'].max(), tbr_data['pid'].max()) + 1

# Create datasets
train_dataset = ImplicitFeedbackDataset(
    train_df, 
    implicit_data
)

valid_dataset = ImplicitFeedbackDataset(
    valid_df,
    implicit_data
)

# Create data loaders
batch_size = 1024
train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size, 
    shuffle=True, 
    num_workers=4
)

valid_loader = DataLoader(
    valid_dataset, 
    batch_size=batch_size, 
    shuffle=False, 
    num_workers=4
)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize model
model = ImplicitNeuMF(
    num_users=num_users,
    num_items=num_items,
    embedding_dim=64,
    mlp_dims=[128, 64, 32],
    dropout_rate=0.3
).to(device)

# Setup optimizer and loss functions
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
explicit_criterion = nn.MSELoss()
implicit_criterion = nn.BCEWithLogitsLoss()

Using device: cpu


In [15]:
# Train model
model, best_rmse = train_implicit_model(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer,
    device=device,
    explicit_criterion=explicit_criterion,
    implicit_criterion=implicit_criterion,
    epochs=20,
    patience=5
)

print(f"Training complete! Best RMSE: {best_rmse:.4f}")

Stage 1: Training explicit pathway only...


Epoch 1/5 (Explicit) - Val RMSE: 1.1706
Epoch 2/5 (Explicit) - Val RMSE: 1.0575
Epoch 3/5 (Explicit) - Val RMSE: 0.9925
Epoch 4/5 (Explicit) - Val RMSE: 0.9353
Epoch 5/5 (Explicit) - Val RMSE: 0.9224

Stage 2: Fine-tuning with implicit feedback...
Epoch 6/20 - Val RMSE: 0.9113
Epoch 7/20 - Val RMSE: 0.9038
Epoch 8/20 - Val RMSE: 0.8886
Epoch 9/20 - Val RMSE: 0.8810
Epoch 10/20 - Val RMSE: 0.8720
Epoch 11/20 - Val RMSE: 0.8714
Epoch 12/20 - Val RMSE: 0.8706
Epoch 13/20 - Val RMSE: 0.8701
Epoch 14/20 - Val RMSE: 0.8702
Epoch 15/20 - Val RMSE: 0.8706
Epoch 16/20 - Val RMSE: 0.8698
Epoch 17/20 - Val RMSE: 0.8701
Epoch 18/20 - Val RMSE: 0.8699
Epoch 19/20 - Val RMSE: 0.8714
Epoch 20/20 - Val RMSE: 0.8723
Training complete! Best RMSE: 0.8698


  model.load_state_dict(torch.load('best_implicit_model.pth'))


In [16]:
# Setup optimizer and loss functions
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
explicit_criterion = nn.MSELoss()
implicit_criterion = nn.BCEWithLogitsLoss()

In [20]:
model, best_rmse = train_implicit_model_enhanced(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer,
    device=device,
    explicit_criterion=explicit_criterion,
    implicit_criterion=implicit_criterion
)

print(f"Training complete! Best RMSE: {best_rmse:.4f}")

Stage 1: Training explicit pathway only...
Epoch 1/20 (Explicit) - Val RMSE: 0.8721
  Stage 1 model improved - saving checkpoint
Epoch 2/20 (Explicit) - Val RMSE: 0.8714
  Stage 1 model improved - saving checkpoint
Epoch 3/20 (Explicit) - Val RMSE: 0.8724
  No improvement for 1 epochs
Epoch 4/20 (Explicit) - Val RMSE: 0.8743
  No improvement for 2 epochs
Epoch 5/20 (Explicit) - Val RMSE: 0.8731
  No improvement for 3 epochs
Epoch 6/20 (Explicit) - Val RMSE: 0.8743
  No improvement for 4 epochs
Epoch 7/20 (Explicit) - Val RMSE: 0.8741
  No improvement for 5 epochs
Early stopping stage 1 after 7 epochs
Loading best stage 1 model for fine-tuning...

Stage 2: Fine-tuning with implicit feedback...


  model.load_state_dict(torch.load('best_stage1_model.pth'))


Epoch 1/20 - Val RMSE: 0.8749, LR: 0.001000
  Model improved - saving checkpoint
Epoch 2/20 - Val RMSE: 0.8732, LR: 0.001000
  Model improved - saving checkpoint
Epoch 3/20 - Val RMSE: 0.8742, LR: 0.001000
  No improvement for 1 epochs
Epoch 4/20 - Val RMSE: 0.8758, LR: 0.001000
  No improvement for 2 epochs
Epoch 5/20 - Val RMSE: 0.8769, LR: 0.000500
  No improvement for 3 epochs
Epoch 6/20 - Val RMSE: 0.8788, LR: 0.000500
  No improvement for 4 epochs
Epoch 7/20 - Val RMSE: 0.8787, LR: 0.000500
  No improvement for 5 epochs
Early stopping stage 2 after 7 epochs
Training complete! Best RMSE: 0.8714


  model.load_state_dict(torch.load('best_implicit_model.pth'))
