In [1]:
import torch
import torch.nn as nn
import os
import torch.nn.functional as F
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import pickle
from scipy.sparse import load_npz
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import optuna
import wandb

  from .autonotebook import tqdm as notebook_tqdm


# Recommenders - MF

In [None]:
class MFWithBiasesFreeze(nn.Module):
    """Matrix factorization whose bias terms are computed from data and frozen.

    The prediction is ``global_bias + user_bias[u] + item_bias[i] + <p_u, q_i>``.
    ``global_bias`` is the mean rating of ``df2``; each user/item bias is that
    user's/item's mean rating minus the global mean (0.0 for an index that
    never occurs in ``df2``).  All three bias tensors have ``requires_grad``
    switched off, so only the two latent embedding tables are trainable.

    Args:
        df2: DataFrame with ``user_idx``, ``item_idx`` and ``rating`` columns.
        emb_dim: Width of the user and item latent vectors.
        device: Device the module is moved to at the end of construction.
    """

    def __init__(self, df2, emb_dim, device):
        super().__init__()
        self.device = device

        # Table sizes: one row for every index from 0 up to the largest seen.
        self.num_users = df2['user_idx'].max() + 1
        self.num_items = df2['item_idx'].max() + 1

        mu = df2['rating'].mean()

        # Mean-centred per-user / per-item ratings; indices without any rating
        # are filled with 0.0 so the series covers the full index range.
        per_user = (df2.groupby('user_idx')['rating'].mean() - mu).reindex(
            range(self.num_users), fill_value=0.0
        )
        per_item = (df2.groupby('item_idx')['rating'].mean() - mu).reindex(
            range(self.num_items), fill_value=0.0
        )
        self.user_bias_init = torch.tensor(per_user.values, dtype=torch.float32)
        self.item_bias_init = torch.tensor(per_item.values, dtype=torch.float32)

        # NOTE: the creation order below is kept on purpose; every nn.Embedding
        # draws from the global RNG when it is constructed.
        self.global_bias = nn.Parameter(torch.tensor([mu], dtype=torch.float32))
        self.user_bias = nn.Embedding(self.num_users, 1)
        self.item_bias = nn.Embedding(self.num_items, 1)
        self.user_embedding = nn.Embedding(self.num_users, emb_dim)
        self.item_embedding = nn.Embedding(self.num_items, emb_dim)

        self._init_parameters()
        self.to(self.device)

    def _init_parameters(self):
        """Initialise the embeddings, load the precomputed biases and freeze them."""
        # Latent factors start as small Gaussian noise.
        for table in (self.user_embedding, self.item_embedding):
            nn.init.normal_(table.weight, std=0.01)

        # Overwrite the randomly initialised bias tables with the data statistics.
        with torch.no_grad():
            self.user_bias.weight.copy_(self.user_bias_init.view(-1, 1))
            self.item_bias.weight.copy_(self.item_bias_init.view(-1, 1))

        # Biases are treated as constants during training.
        for frozen in (self.user_bias.weight, self.item_bias.weight, self.global_bias):
            frozen.requires_grad = False

    def forward(self, user_idx, item_idx):
        """Return the predicted rating for each (user_idx, item_idx) pair.

        Both arguments are index tensors consumed by ``nn.Embedding``; the
        dot product is reduced over ``dim=1`` of the embedding lookups.
        """
        user_offset = self.user_bias(user_idx).squeeze()
        item_offset = self.item_bias(item_idx).squeeze()

        interaction = (self.user_embedding(user_idx) * self.item_embedding(item_idx)).sum(dim=1)

        return self.global_bias + user_offset + item_offset + interaction

## MFWithBiasesFreeze extended with the encoded item metadata

In [None]:
class MFWithBiasesFreezeANDmetadata(nn.Module):
    """MF with frozen data-derived biases whose item side also uses metadata.

    For every item the learned embedding is concatenated with a fixed metadata
    encoding and projected back to ``emb_dim`` by a linear layer; the result is
    dotted with the user embedding.  The prediction is
    ``global_bias + user_bias[u] + item_bias[i] + <p_u, proj([q_i ; enc_i])>``.

    The metadata encodings are copied once, at construction time, into a dense
    lookup table (``item_encoding_table``, a non-persistent buffer, so the
    ``state_dict`` layout is unchanged).  Items in ``[0, num_items)`` that have
    no entry in ``compressed_items_encodings`` use an all-zero encoding instead
    of failing inside ``forward``.  Changes made to the dictionary after the
    model has been built are not picked up.

    Args:
        df2: DataFrame with ``user_idx``, ``item_idx`` and ``rating`` columns.
        emb_dim: Width of the user and item latent vectors.
        compressed_items_encodings: Mapping ``item_idx -> encoding vector``;
            all vectors are expected to have the same length.
        device: Device the module is moved to at the end of construction.
    """

    def __init__(self, df2, emb_dim, compressed_items_encodings, device):
        super().__init__()

        self.device = device

        # 1) Table sizes: one row per index from 0 to the largest index seen.
        self.num_users = int(df2['user_idx'].max()) + 1
        self.num_items = int(df2['item_idx'].max()) + 1

        # 2) Global mean rating
        global_mean = float(df2['rating'].mean())

        # 3) / 4) Mean-centred user and item biases, 0.0 for unseen indices.
        user_means = df2.groupby('user_idx')['rating'].mean()
        user_bias_series = (user_means - global_mean).reindex(range(self.num_users), fill_value=0.0)
        item_means = df2.groupby('item_idx')['rating'].mean()
        item_bias_series = (item_means - global_mean).reindex(range(self.num_items), fill_value=0.0)

        self.user_bias_init = torch.tensor(user_bias_series.values, dtype=torch.float32)
        self.item_bias_init = torch.tensor(item_bias_series.values, dtype=torch.float32)

        # 5) Model parameters
        self.global_bias = nn.Parameter(torch.tensor([global_mean], dtype=torch.float32))
        self.user_bias = nn.Embedding(self.num_users, 1)
        self.item_bias = nn.Embedding(self.num_items, 1)
        self.user_embedding = nn.Embedding(self.num_users, emb_dim)
        self.item_embedding = nn.Embedding(self.num_items, emb_dim)

        # 6) Metadata encodings: keep the mapping for callers, and a dense
        #    tensor copy so forward() is a single indexing operation.
        self.item_encodings = compressed_items_encodings  # {item_idx: encoding_vector}
        encoding_dim = len(next(iter(compressed_items_encodings.values())))
        self.register_buffer(
            "item_encoding_table",
            self._build_encoding_table(compressed_items_encodings, self.num_items, encoding_dim),
            persistent=False,  # derived data: keep it out of the state_dict
        )

        # 7) Projection of [item embedding ; metadata encoding] back to emb_dim
        self.item_proj = nn.Linear(encoding_dim + emb_dim, emb_dim)

        # 8) Initialise and move to the target device
        self._init_parameters()
        self.to(self.device)

    @staticmethod
    def _build_encoding_table(encodings, num_rows, encoding_dim):
        """Return a ``[num_rows, encoding_dim]`` float tensor of item encodings.

        Row ``i`` holds the encoding of item ``i``; rows without an entry in
        ``encodings`` stay zero.  Keys outside ``[0, num_rows)`` are ignored
        because such items cannot be looked up in the item embedding either.
        """
        table = torch.zeros(num_rows, encoding_dim, dtype=torch.float32)
        for key, vector in encodings.items():
            row = int(key)
            if 0 <= row < num_rows:
                table[row] = torch.as_tensor(vector, dtype=torch.float32)
        return table

    def _init_parameters(self):
        """Initialise the embeddings, load the precomputed biases and freeze them."""
        # a) user/item embeddings: small random Normal
        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)

        # b) user/item biases from data
        with torch.no_grad():
            self.user_bias.weight.copy_(self.user_bias_init.view(-1, 1))
            self.item_bias.weight.copy_(self.item_bias_init.view(-1, 1))

        # c) Freeze biases
        self.user_bias.weight.requires_grad = False
        self.item_bias.weight.requires_grad = False
        self.global_bias.requires_grad = False

    def forward(self, user_idx, item_idx):
        """Return the predicted rating for each (user_idx, item_idx) pair.

        Args:
            user_idx: 1-D integer tensor of user indices.
            item_idx: 1-D integer tensor of item indices (same length).

        Returns:
            1-D float tensor with one prediction per pair.
        """
        # 1) Bias terms; squeeze only the trailing singleton so a batch of
        #    size one stays one-dimensional.
        b_u = self.user_bias(user_idx).squeeze(-1)
        b_i = self.item_bias(item_idx).squeeze(-1)

        # 2) Latent embeddings
        u_emb = self.user_embedding(user_idx)
        i_emb = self.item_embedding(item_idx)

        # 3) Metadata encodings via one vectorised lookup (zeros if missing)
        item_vectors = self.item_encoding_table[item_idx]
        combined_item_rep = torch.cat([i_emb, item_vectors], dim=1)

        # 4) Project the concatenation back to emb_dim
        i_emb_projected = self.item_proj(combined_item_rep)

        # 5) Interaction term and final prediction
        dot = (u_emb * i_emb_projected).sum(dim=1)
        return self.global_bias + b_u + b_i + dot

## Renamed variant of the MF model that clamps the prediction at the end of `forward`

In [None]:
class MFWithBiasClamp(nn.Module):
    """Matrix factorization with data-initialised biases and a clamped output.

    The prediction is ``global_bias + user_bias[u] + item_bias[i] + <p_u, q_i>``
    clamped to ``[min_rating, max_rating]``.  The biases are initialised from
    the rating statistics of ``df2`` but, unlike in a frozen-bias model, they
    are left trainable here.

    Because the clamp is applied inside ``forward``, a prediction that falls
    outside the range receives a zero gradient from that sample.

    Args:
        df2: DataFrame with ``user_idx``, ``item_idx`` and ``rating`` columns.
        emb_dim: Width of the user and item latent vectors.
        device: Device the module is moved to at the end of construction.
        min_rating: Lower clamp bound (default 1.0, the previous fixed value).
        max_rating: Upper clamp bound (default 5.0, the previous fixed value).

    Raises:
        ValueError: If ``min_rating`` is greater than ``max_rating``.
    """

    def __init__(self, df2, emb_dim, device, min_rating=1.0, max_rating=5.0):
        super().__init__()

        if min_rating > max_rating:
            raise ValueError(
                f"min_rating ({min_rating}) must not exceed max_rating ({max_rating})"
            )

        self.device = device
        self.min_rating = float(min_rating)
        self.max_rating = float(max_rating)

        # 1) Table sizes: one row per index from 0 to the largest index seen.
        self.num_users = int(df2['user_idx'].max()) + 1
        self.num_items = int(df2['item_idx'].max()) + 1

        # 2) Global mean rating
        global_mean = float(df2['rating'].mean())

        # 3) / 4) Mean-centred user and item biases, 0.0 for unseen indices.
        user_means = df2.groupby('user_idx')['rating'].mean()
        user_bias_series = (user_means - global_mean).reindex(range(self.num_users), fill_value=0.0)
        item_means = df2.groupby('item_idx')['rating'].mean()
        item_bias_series = (item_means - global_mean).reindex(range(self.num_items), fill_value=0.0)

        self.user_bias_init = torch.tensor(user_bias_series.values, dtype=torch.float32)
        self.item_bias_init = torch.tensor(item_bias_series.values, dtype=torch.float32)

        # 5) Model parameters
        self.global_bias = nn.Parameter(torch.tensor([global_mean], dtype=torch.float32))
        self.user_bias = nn.Embedding(self.num_users, 1)
        self.item_bias = nn.Embedding(self.num_items, 1)
        self.user_embedding = nn.Embedding(self.num_users, emb_dim)
        self.item_embedding = nn.Embedding(self.num_items, emb_dim)

        # 6) Initialise and move to the target device
        self._init_parameters()
        self.to(self.device)

    def _init_parameters(self):
        """Initialise the embeddings and load the precomputed biases."""
        # a) user/item embeddings: small random Normal
        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)

        # b) user/item biases from data (they remain trainable afterwards)
        with torch.no_grad():
            self.user_bias.weight.copy_(self.user_bias_init.view(-1, 1))
            self.item_bias.weight.copy_(self.item_bias_init.view(-1, 1))

    def forward(self, user_idx, item_idx):
        """Return the clamped predicted rating for each (user, item) pair.

        Args:
            user_idx: 1-D integer tensor of user indices.
            item_idx: 1-D integer tensor of item indices (same length).

        Returns:
            1-D float tensor with values in ``[min_rating, max_rating]``.
        """
        # Squeeze only the trailing singleton so a batch of one stays 1-D.
        b_u = self.user_bias(user_idx).squeeze(-1)
        b_i = self.item_bias(item_idx).squeeze(-1)

        u_emb = self.user_embedding(user_idx)
        i_emb = self.item_embedding(item_idx)
        dot = (u_emb * i_emb).sum(dim=1)

        rating_pred = self.global_bias + b_u + b_i + dot
        return torch.clamp(rating_pred, min=self.min_rating, max=self.max_rating)

In [None]:
class MFNoItemEmbedding(nn.Module):
    """MF variant whose item side is built only from fixed metadata encodings.

    There is no learned per-item embedding: the item vector is a linear
    projection of the item's precomputed encoding.  The prediction is
    ``global_bias + user_bias[u] + item_bias[i] + <p_u, proj(enc_i)>`` with
    all bias terms computed from ``df2`` and frozen.

    Both the item-bias table and the encoding table are indexed directly by
    ``item_idx`` and have ``max(encoding keys) + 1`` rows, so the result does
    not depend on the iteration order of ``compressed_items_encodings``.  When
    the keys are exactly ``0..N-1`` the bias table keeps its previous shape.
    The encodings are copied at construction time; later changes to the
    dictionary are not picked up.

    Args:
        df2: DataFrame with ``user_idx``, ``item_idx`` and ``rating`` columns.
        emb_dim: Width of the user latent vectors and of the projected item
            vectors.
        compressed_items_encodings: Non-empty mapping
            ``item_idx -> encoding vector`` with non-negative integer keys and
            vectors of equal length.
        device: Device the module is moved to at the end of construction.

    Raises:
        ValueError: If ``compressed_items_encodings`` is empty.
    """

    def __init__(self, df2, emb_dim, compressed_items_encodings, device):
        super().__init__()

        if not compressed_items_encodings:
            raise ValueError("compressed_items_encodings must contain at least one item")

        self.device = device

        # 1) Table sizes
        self.num_users = int(df2['user_idx'].max()) + 1
        self.num_items = max(int(key) for key in compressed_items_encodings) + 1

        # 2) Global mean rating
        global_mean = float(df2['rating'].mean())

        # 3) User biases, 0.0 for users without ratings
        user_means = df2.groupby('user_idx')['rating'].mean()
        user_bias_series = (user_means - global_mean).reindex(range(self.num_users), fill_value=0.0)

        # 4) Item biases aligned to item_idx: row i belongs to item i, and
        #    items without ratings in df2 get 0.0.
        item_means = df2.groupby('item_idx')['rating'].mean()
        item_bias_series = (item_means - global_mean).reindex(range(self.num_items), fill_value=0.0)

        self.user_bias_init = torch.tensor(user_bias_series.values, dtype=torch.float32)
        self.item_bias_init = torch.tensor(item_bias_series.values, dtype=torch.float32)

        # 5) Model parameters
        self.global_bias = nn.Parameter(torch.tensor([global_mean], dtype=torch.float32))
        self.user_bias = nn.Embedding(self.num_users, 1)
        self.item_bias = nn.Embedding(self.num_items, 1)
        self.user_embedding = nn.Embedding(self.num_users, emb_dim)

        # 6) Item encodings: keep the mapping for callers, plus dense copies
        #    (values and a presence mask) for vectorised lookup in forward().
        self.item_encodings = compressed_items_encodings
        encoding_dim = len(next(iter(compressed_items_encodings.values())))
        table = torch.zeros(self.num_items, encoding_dim, dtype=torch.float32)
        present = torch.zeros(self.num_items, dtype=torch.bool)
        for key, vector in compressed_items_encodings.items():
            table[int(key)] = torch.as_tensor(vector, dtype=torch.float32)
            present[int(key)] = True
        # Derived data: non-persistent so the state_dict keys do not change.
        self.register_buffer("item_encoding_table", table, persistent=False)
        self.register_buffer("item_has_encoding", present, persistent=False)

        # 7) Projection of the encoding to emb_dim
        self.item_proj = nn.Linear(encoding_dim, emb_dim)

        # 8) Initialise and move to the target device
        self._init_parameters()
        self.to(self.device)

    def _init_parameters(self):
        """Initialise the user embedding, load the biases and freeze them."""
        # a) User embeddings: small random Normal
        nn.init.normal_(self.user_embedding.weight, std=0.01)

        # b) User/item biases from data
        with torch.no_grad():
            self.user_bias.weight.copy_(self.user_bias_init.view(-1, 1))
            self.item_bias.weight.copy_(self.item_bias_init.view(-1, 1))

        # c) Freeze biases
        self.user_bias.weight.requires_grad = False
        self.item_bias.weight.requires_grad = False
        self.global_bias.requires_grad = False

    def _lookup_encodings(self, item_idx):
        """Return the encodings for ``item_idx``; KeyError if any is unknown."""
        num_rows = self.item_has_encoding.numel()
        in_range = (item_idx >= 0) & (item_idx < num_rows)
        # Clamp before indexing so out-of-range ids are reported as a KeyError
        # below instead of failing inside the tensor indexing.
        known = in_range & self.item_has_encoding[item_idx.clamp(0, num_rows - 1)]
        if not bool(known.all()):
            missing = sorted(set(item_idx[~known].tolist()))
            raise KeyError(f"No metadata encoding for item_idx {missing}")
        return self.item_encoding_table[item_idx]

    def forward(self, user_idx, item_idx):
        """Return the predicted rating for each (user_idx, item_idx) pair.

        Args:
            user_idx: 1-D integer tensor of user indices.
            item_idx: 1-D integer tensor of item indices (same length).

        Returns:
            1-D float tensor with one prediction per pair.

        Raises:
            KeyError: If an item index has no metadata encoding.
        """
        # 1) Item encodings (validated first so unknown items fail clearly)
        item_vectors = self._lookup_encodings(item_idx)

        # 2) Bias terms, both indexed directly by the raw indices
        b_u = self.user_bias(user_idx).squeeze(-1)
        b_i = self.item_bias(item_idx).squeeze(-1)

        # 3) User embedding and projected item representation
        u_emb = self.user_embedding(user_idx)
        i_emb_projected = self.item_proj(item_vectors)

        # 4) Interaction term and final prediction
        dot = (u_emb * i_emb_projected).sum(dim=1)
        return self.global_bias + b_u + b_i + dot

# Recommenders NCF

In [None]:
class NCFRecommender(nn.Module):
    """Neural collaborative filtering (GMF + MLP) with frozen rating biases.

    A GMF branch (element-wise product of user/item embeddings followed by a
    linear layer) and an MLP branch (concatenated user/item embeddings through
    ``hidden_units``) are fused by a final linear layer.  The global mean
    rating and the per-user / per-item mean-centred ratings of ``df2`` are
    added to that score, and the result is clamped to ``[0, 5]``.

    The bias vectors always have ``num_users`` / ``num_items`` entries and are
    aligned with the index values: entry ``k`` belongs to index ``k``, and
    indices without ratings in ``df2`` get 0.0.

    Args:
        num_users: Number of rows of the user embedding tables.
        num_items: Number of rows of the item embedding tables.
        embedding_dim: Width of every embedding.
        hidden_units: Non-empty sequence with the MLP layer widths.
        dropout: Dropout probability applied after each MLP layer.
        alpha: Stored on the module as ``self.alpha``; not used in ``forward``.
        df2: DataFrame with ``user_idx``, ``item_idx`` and ``rating`` columns.
    """

    def __init__(self, num_users, num_items, embedding_dim, hidden_units, dropout, alpha, df2):
        super().__init__()

        # User and item embeddings for GMF & MLP
        self.user_embedding_gmf = nn.Embedding(num_users, embedding_dim)
        self.item_embedding_gmf = nn.Embedding(num_items, embedding_dim)

        self.user_embedding_mlp = nn.Embedding(num_users, embedding_dim)
        self.item_embedding_mlp = nn.Embedding(num_items, embedding_dim)

        # GMF layer applied to the element-wise product
        self.gmf_layer = nn.Linear(embedding_dim, 1)

        # MLP layers
        layers = []
        input_dim = embedding_dim * 2
        for units in hidden_units:
            layers.append(nn.Linear(input_dim, units))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            input_dim = units
        self.mlp = nn.Sequential(*layers)

        # Final fusion layer (GMF score + last MLP layer)
        self.final_layer = nn.Linear(hidden_units[-1] + 1, 1)
        self.alpha = alpha

        # Frozen biases from the dataset.  A plain float keeps the global
        # bias independent of numpy scalar semantics and of the device.
        self.global_bias = float(df2['rating'].mean())
        self.user_bias = nn.Parameter(
            self._aligned_bias(df2, 'user_idx', num_users, self.global_bias), requires_grad=False
        )
        self.item_bias = nn.Parameter(
            self._aligned_bias(df2, 'item_idx', num_items, self.global_bias), requires_grad=False
        )

    @staticmethod
    def _aligned_bias(df2, key, size, global_mean):
        """Return mean-centred ratings per ``key`` as a tensor of length ``size``.

        The groupby result only contains indices that occur in ``df2``;
        reindexing to ``range(size)`` makes position ``k`` correspond to index
        ``k`` and fills indices without ratings with 0.0.
        """
        offsets = df2.groupby(key)['rating'].mean() - global_mean
        offsets = offsets.reindex(range(size), fill_value=0.0)
        return torch.tensor(offsets.values, dtype=torch.float32)

    def forward(self, user_idx, item_idx):
        """Return predicted ratings in ``[0, 5]`` for each (user, item) pair.

        Args:
            user_idx: 1-D integer tensor of user indices.
            item_idx: 1-D integer tensor of item indices (same length).

        Returns:
            1-D float tensor with one prediction per pair.
        """
        # GMF branch
        user_emb_gmf = self.user_embedding_gmf(user_idx)
        item_emb_gmf = self.item_embedding_gmf(item_idx)
        gmf_output = self.gmf_layer(user_emb_gmf * item_emb_gmf)

        # MLP branch
        user_emb_mlp = self.user_embedding_mlp(user_idx)
        item_emb_mlp = self.item_embedding_mlp(item_idx)
        mlp_output = self.mlp(torch.cat([user_emb_mlp, item_emb_mlp], dim=-1))

        # Fusion of both branches
        output = self.final_layer(torch.cat([gmf_output, mlp_output], dim=-1))

        # Add global, user and item biases (unsqueeze to match [batch, 1])
        output = (
            output
            + self.global_bias
            + self.user_bias[user_idx].unsqueeze(1)
            + self.item_bias[item_idx].unsqueeze(1)
        )

        # Keep the prediction in the range [0, 5]
        return torch.clamp(output, 0, 5).squeeze(1)

In [None]:
class NCFRecommenderNoBias(nn.Module):
    """Neural collaborative filtering (GMF + MLP) without any bias terms.

    The GMF branch scores the element-wise product of user and item
    embeddings with a linear layer; the MLP branch feeds the concatenated
    user/item embeddings through ``hidden_units``.  A final linear layer fuses
    both, and the result is clamped to ``[0, 5]``.

    Args:
        num_users: Number of rows of the user embedding tables.
        num_items: Number of rows of the item embedding tables.
        embedding_dim: Width of every embedding.
        hidden_units: Sequence with the MLP layer widths.
        dropout: Dropout probability applied after each MLP layer.
        alpha: Stored on the module as ``self.alpha``; not used in ``forward``.
        df2: Accepted for signature parity with the biased model; unused here.
    """

    def __init__(self, num_users, num_items, embedding_dim, hidden_units, dropout, alpha, df2):
        super().__init__()

        # Separate embedding tables for the two branches.
        self.user_embedding_gmf = nn.Embedding(num_users, embedding_dim)
        self.item_embedding_gmf = nn.Embedding(num_items, embedding_dim)
        self.user_embedding_mlp = nn.Embedding(num_users, embedding_dim)
        self.item_embedding_mlp = nn.Embedding(num_items, embedding_dim)

        # Scores the element-wise user*item product.
        self.gmf_layer = nn.Linear(embedding_dim, 1)

        # MLP tower: Linear -> ReLU -> Dropout for every consecutive width pair.
        widths = [embedding_dim * 2, *hidden_units]
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]
        self.mlp = nn.Sequential(*stack)

        # Fuses the single GMF score with the last MLP activation vector.
        self.final_layer = nn.Linear(hidden_units[-1] + 1, 1)
        self.alpha = alpha

    def forward(self, user_idx, item_idx):
        """Return predictions clamped to ``[0, 5]`` for each (user, item) pair."""
        gmf_score = self.gmf_layer(
            self.user_embedding_gmf(user_idx) * self.item_embedding_gmf(item_idx)
        )

        mlp_features = self.mlp(
            torch.cat((self.user_embedding_mlp(user_idx), self.item_embedding_mlp(item_idx)), dim=-1)
        )

        fused = self.final_layer(torch.cat((gmf_score, mlp_features), dim=-1))

        return torch.clamp(fused, 0, 5).squeeze(1)

In [None]:
class NCFWithMetadata(nn.Module):
    """NCF (GMF + MLP) whose item representations also use metadata encodings.

    In each branch the learned item embedding is concatenated with the item's
    fixed metadata encoding and projected back to ``embedding_dim``.  The
    fused score gets the global mean rating plus frozen, data-derived user and
    item biases added, and is clamped to ``[0, 5]``.

    Lookup tables are built once at construction time:

    * ``user_bias`` / ``item_bias`` have ``num_users`` / ``num_items`` entries
      and are aligned with the index values (0.0 for indices without ratings
      in ``df2``).
    * ``item_encoding_table`` (non-persistent buffer) holds the encoding of
      item ``i`` in row ``i``; items without an entry in
      ``compressed_items_encodings`` use an all-zero vector.

    Changes to the dictionary after construction are not picked up.

    Args:
        num_users: Number of rows of the user embedding tables.
        num_items: Number of rows of the item embedding tables.
        embedding_dim: Width of every embedding.
        hidden_units: Non-empty sequence with the MLP layer widths.
        dropout: Dropout probability applied after each MLP layer.
        alpha: Stored on the module as ``self.alpha``; not used in ``forward``.
        df2: DataFrame with ``user_idx``, ``item_idx`` and ``rating`` columns.
        compressed_items_encodings: Non-empty mapping
            ``item_idx -> encoding vector`` with vectors of equal length.
        device: Device the module is moved to at the end of construction.
    """

    def __init__(self, num_users, num_items, embedding_dim, hidden_units, dropout, alpha, df2, compressed_items_encodings, device):
        super().__init__()

        self.device = device
        self.compressed_items_encodings = compressed_items_encodings
        encoding_dim = len(next(iter(compressed_items_encodings.values())))

        # User and item embeddings
        self.user_embedding_gmf = nn.Embedding(num_users, embedding_dim)
        self.item_embedding_gmf = nn.Embedding(num_items, embedding_dim)

        self.user_embedding_mlp = nn.Embedding(num_users, embedding_dim)
        self.item_embedding_mlp = nn.Embedding(num_items, embedding_dim)

        # Projections of [item embedding ; metadata encoding] to embedding_dim
        self.item_proj_gmf = nn.Linear(embedding_dim + encoding_dim, embedding_dim)
        self.item_proj_mlp = nn.Linear(embedding_dim + encoding_dim, embedding_dim)

        # GMF layer applied to the element-wise product
        self.gmf_layer = nn.Linear(embedding_dim, 1)

        # MLP layers
        layers = []
        input_dim = embedding_dim * 2
        for units in hidden_units:
            layers.append(nn.Linear(input_dim, units))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            input_dim = units
        self.mlp = nn.Sequential(*layers)

        # Final fusion layer (GMF score + last MLP layer)
        self.final_layer = nn.Linear(hidden_units[-1] + 1, 1)
        self.alpha = alpha

        # Frozen biases from the dataset, aligned with the index values.
        self.global_bias = float(df2['rating'].mean())
        self.user_bias = nn.Parameter(
            self._aligned_bias(df2, 'user_idx', num_users, self.global_bias), requires_grad=False
        )
        self.item_bias = nn.Parameter(
            self._aligned_bias(df2, 'item_idx', num_items, self.global_bias), requires_grad=False
        )

        # Dense copy of the metadata encodings; zero rows for cold items that
        # have no encoding.  Non-persistent: it is derived from the dictionary.
        table = torch.zeros(num_items, encoding_dim, dtype=torch.float32)
        for key, vector in compressed_items_encodings.items():
            row = int(key)
            if 0 <= row < num_items:
                table[row] = torch.as_tensor(vector, dtype=torch.float32)
        self.register_buffer("item_encoding_table", table, persistent=False)

        # Move to device
        self.to(self.device)

    @staticmethod
    def _aligned_bias(df2, key, size, global_mean):
        """Return mean-centred ratings per ``key`` as a tensor of length ``size``.

        Position ``k`` corresponds to index ``k``; indices without ratings in
        ``df2`` are filled with 0.0.
        """
        offsets = df2.groupby(key)['rating'].mean() - global_mean
        offsets = offsets.reindex(range(size), fill_value=0.0)
        return torch.tensor(offsets.values, dtype=torch.float32)

    def forward(self, user_idx, item_idx):
        """Return predicted ratings in ``[0, 5]`` for each (user, item) pair.

        Args:
            user_idx: 1-D integer tensor of user indices.
            item_idx: 1-D integer tensor of item indices (same length).

        Returns:
            1-D float tensor with one prediction per pair.
        """
        # Metadata encodings in one vectorised lookup (zeros when missing)
        item_metadata = self.item_encoding_table[item_idx]

        # Item representations: [embedding ; metadata] projected to embedding_dim
        item_emb_gmf_projected = self.item_proj_gmf(
            torch.cat([self.item_embedding_gmf(item_idx), item_metadata], dim=1)
        )
        item_emb_mlp_projected = self.item_proj_mlp(
            torch.cat([self.item_embedding_mlp(item_idx), item_metadata], dim=1)
        )

        # GMF branch
        user_emb_gmf = self.user_embedding_gmf(user_idx)
        gmf_output = self.gmf_layer(user_emb_gmf * item_emb_gmf_projected)

        # MLP branch
        user_emb_mlp = self.user_embedding_mlp(user_idx)
        mlp_output = self.mlp(torch.cat([user_emb_mlp, item_emb_mlp_projected], dim=-1))

        # Fusion of both branches
        output = self.final_layer(torch.cat([gmf_output, mlp_output], dim=-1))

        # Add global, user and item biases (unsqueeze to match [batch, 1])
        output = (
            output
            + self.global_bias
            + self.user_bias[user_idx].unsqueeze(1)
            + self.item_bias[item_idx].unsqueeze(1)
        )

        return torch.clamp(output, 0, 5).squeeze(1)

## NCF for cold items - no item embeddings

In [None]:
class NCFNoItemEmbedding(nn.Module):
    """NCF (GMF + MLP) whose item side is built only from metadata encodings.

    There is no learned per-item embedding: each branch projects the item's
    fixed encoding to ``embedding_dim``.  The fused score gets the global mean
    rating plus frozen, data-derived user and item biases added, and is
    clamped to ``[0, 5]``.

    Lookup tables are built once at construction time and indexed directly by
    the raw indices:

    * ``user_bias`` has ``num_users`` entries; ``item_bias`` has one entry per
      index up to the largest item index found in ``df2`` or in the encoding
      keys.  Indices without ratings in ``df2`` get 0.0.
    * ``item_encoding_table`` / ``item_has_encoding`` (non-persistent buffers)
      hold the encodings and a presence mask.

    Changes to the dictionary after construction are not picked up.

    Args:
        num_users: Number of rows of the user embedding tables.
        embedding_dim: Width of the user embeddings and projected item vectors.
        hidden_units: Non-empty sequence with the MLP layer widths.
        dropout: Dropout probability applied after each MLP layer.
        alpha: Stored on the module as ``self.alpha``; not used in ``forward``.
        df2: DataFrame with ``user_idx``, ``item_idx`` and ``rating`` columns.
        compressed_items_encodings: Non-empty mapping
            ``item_idx -> encoding vector`` with non-negative integer keys and
            vectors of equal length.
        device: Device the module is moved to at the end of construction.
    """

    def __init__(self, num_users, embedding_dim, hidden_units, dropout, alpha, df2, compressed_items_encodings, device):
        super().__init__()

        self.device = device
        self.compressed_items_encodings = compressed_items_encodings
        encoding_dim = len(next(iter(compressed_items_encodings.values())))

        # User embeddings for GMF & MLP
        self.user_embedding_gmf = nn.Embedding(num_users, embedding_dim)
        self.user_embedding_mlp = nn.Embedding(num_users, embedding_dim)

        # Projections of the item encoding to embedding_dim
        self.item_proj_gmf = nn.Linear(encoding_dim, embedding_dim)
        self.item_proj_mlp = nn.Linear(encoding_dim, embedding_dim)

        # GMF layer applied to the element-wise product
        self.gmf_layer = nn.Linear(embedding_dim, 1)

        # MLP layers
        layers = []
        input_dim = embedding_dim * 2
        for units in hidden_units:
            layers.append(nn.Linear(input_dim, units))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            input_dim = units
        self.mlp = nn.Sequential(*layers)

        # Final fusion layer (GMF score + last MLP layer)
        self.final_layer = nn.Linear(hidden_units[-1] + 1, 1)
        self.alpha = alpha

        # Item index space: everything that is rated or has an encoding.
        num_items = max(
            int(df2['item_idx'].max()),
            max(int(key) for key in compressed_items_encodings),
        ) + 1

        # Frozen biases from the dataset, aligned with the index values.
        self.global_bias = float(df2['rating'].mean())
        self.user_bias = nn.Parameter(
            self._aligned_bias(df2, 'user_idx', num_users, self.global_bias), requires_grad=False
        )
        self.item_bias = nn.Parameter(
            self._aligned_bias(df2, 'item_idx', num_items, self.global_bias), requires_grad=False
        )

        # Dense copy of the encodings plus a presence mask, so forward() can
        # look items up without a Python loop and still reject unknown items.
        table = torch.zeros(num_items, encoding_dim, dtype=torch.float32)
        present = torch.zeros(num_items, dtype=torch.bool)
        for key, vector in compressed_items_encodings.items():
            table[int(key)] = torch.as_tensor(vector, dtype=torch.float32)
            present[int(key)] = True
        self.register_buffer("item_encoding_table", table, persistent=False)
        self.register_buffer("item_has_encoding", present, persistent=False)

        # Move to device
        self.to(self.device)

    @staticmethod
    def _aligned_bias(df2, key, size, global_mean):
        """Return mean-centred ratings per ``key`` as a tensor of length ``size``.

        Position ``k`` corresponds to index ``k``; indices without ratings in
        ``df2`` are filled with 0.0.
        """
        offsets = df2.groupby(key)['rating'].mean() - global_mean
        offsets = offsets.reindex(range(size), fill_value=0.0)
        return torch.tensor(offsets.values, dtype=torch.float32)

    def _lookup_encodings(self, item_idx):
        """Return the encodings for ``item_idx``; KeyError if any is unknown."""
        num_rows = self.item_has_encoding.numel()
        in_range = (item_idx >= 0) & (item_idx < num_rows)
        # Clamp before indexing so out-of-range ids are reported as a KeyError
        # below instead of failing inside the tensor indexing.
        known = in_range & self.item_has_encoding[item_idx.clamp(0, num_rows - 1)]
        if not bool(known.all()):
            missing = sorted(set(item_idx[~known].tolist()))
            raise KeyError(f"No metadata encoding for item_idx {missing}")
        return self.item_encoding_table[item_idx]

    def forward(self, user_idx, item_idx):
        """Return predicted ratings in ``[0, 5]`` for each (user, item) pair.

        Args:
            user_idx: 1-D integer tensor of user indices.
            item_idx: 1-D integer tensor of item indices (same length).

        Returns:
            1-D float tensor with one prediction per pair.

        Raises:
            KeyError: If an item index has no metadata encoding.
        """
        # Precomputed item encodings, (batch_size, encoding_dim)
        item_encodings = self._lookup_encodings(item_idx)

        # GMF branch
        user_emb_gmf = self.user_embedding_gmf(user_idx)
        item_emb_gmf = self.item_proj_gmf(item_encodings)
        gmf_output = self.gmf_layer(user_emb_gmf * item_emb_gmf)

        # MLP branch
        user_emb_mlp = self.user_embedding_mlp(user_idx)
        item_emb_mlp = self.item_proj_mlp(item_encodings)
        mlp_output = self.mlp(torch.cat([user_emb_mlp, item_emb_mlp], dim=-1))

        # Fusion of both branches
        output = self.final_layer(torch.cat([gmf_output, mlp_output], dim=-1))

        # Add global, user and item biases (unsqueeze to match [batch, 1])
        output = (
            output
            + self.global_bias
            + self.user_bias[user_idx].unsqueeze(1)
            + self.item_bias[item_idx].unsqueeze(1)
        )

        # Keep the prediction in the range [0, 5]
        return torch.clamp(output, 0, 5).squeeze(1)

# Pairwise Models

### Pairwise Warm - Experiment #1: Two-Tower Model with Randomized Init

In [None]:
# ======= Two-Tower Model (User & Item Networks) =======

class TwoTowerModelRandomizedInit(nn.Module):
    """Two-tower model for pairwise ranking with randomly initialised embeddings.

    The user tower is an embedding lookup.  The item tower combines two views
    of an item: an MLP over the item's metadata features (``item_fc``) and an
    ID embedding; both are concatenated and passed through a second MLP
    (``item_fc2``).

    Args:
        num_users: Number of rows of the user embedding table.
        num_items: Number of rows of the item embedding table.
        embedding_dim: Width of the user and final item representations.
        item_metadata_dim: Number of metadata features per item.
        item_metadata: Optional ``[num_items, item_metadata_dim]`` feature
            matrix.  When given, it is stored on the module (as a
            non-persistent buffer) and used in ``forward``.  When omitted, the
            notebook-level global ``item_metadata`` is used, as before.
    """

    def __init__(self, num_users, num_items, embedding_dim, item_metadata_dim, item_metadata=None):
        # The zero-argument form cannot go stale when the class is renamed
        # (the previous call referenced a non-existent ``TwoTowerModel``).
        super().__init__()

        # User tower and item ID embedding
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # Item tower over the metadata features
        self.item_fc = nn.Sequential(
            nn.Linear(item_metadata_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embedding_dim),
        )

        # Second-level combination of the metadata-based and ID-based views
        self.item_fc2 = nn.Sequential(
            nn.Linear(2 * embedding_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embedding_dim),
        )

        # Optional module-owned copy of the metadata matrix; it follows the
        # module across devices and is kept out of the state_dict.
        table = None if item_metadata is None else torch.as_tensor(item_metadata, dtype=torch.float32)
        self.register_buffer("item_metadata_table", table, persistent=False)

    def _item_features(self, item_ids):
        """Return the metadata rows for ``item_ids`` on the module's device."""
        table = self.item_metadata_table
        if table is None:
            # Fallback for existing notebook code that relies on the global.
            table = item_metadata
        return table[item_ids.to(table.device)].to(self.user_embedding.weight.device)

    def _encode_items(self, item_ids):
        """Return the second-level item representation for ``item_ids``."""
        meta_embed = self.item_fc(self._item_features(item_ids))  # (batch, embedding_dim)
        id_embed = self.item_embedding(item_ids)                  # (batch, embedding_dim)
        combined = torch.cat([meta_embed, id_embed], dim=1)       # (batch, 2*embedding_dim)
        return self.item_fc2(combined)                            # (batch, embedding_dim)

    def forward(self, user_ids, item1_ids, item2_ids):
        """Embed a batch of users and the two candidate items of each pair.

        Args:
            user_ids: 1-D integer tensor of user indices.
            item1_ids: 1-D integer tensor with the first item of each pair.
            item2_ids: 1-D integer tensor with the second item of each pair.

        Returns:
            Tuple ``(user_embed, item1_embed, item2_embed)``, each of shape
            ``(batch, embedding_dim)``.
        """
        # Follow the module's own device instead of a notebook-level global.
        target = self.user_embedding.weight.device
        user_ids = user_ids.to(target)
        item1_ids = item1_ids.to(target)
        item2_ids = item2_ids.to(target)

        user_embed = self.user_embedding(user_ids)  # (batch, embedding_dim)
        item1_embed_level2 = self._encode_items(item1_ids)
        item2_embed_level2 = self._encode_items(item2_ids)

        return user_embed, item1_embed_level2, item2_embed_level2



class BPRLoss(nn.Module):
    """Bayesian Personalized Ranking loss over (user, item1, item2) triples."""

    def __init__(self):
        super().__init__()

    def forward(self, user_embed, item1_ids, item1_embed, item2_ids, item2_embed, labels):
        """
        Compute Bayesian Personalized Ranking (BPR) loss.

        Args:
        - user_embed: Tensor of shape (batch_size, embed_dim), user embeddings.
        - item1_ids: Tensor of shape (batch_size,), IDs of item1.
        - item1_embed: Tensor of shape (batch_size, embed_dim), embeddings for item1.
        - item2_ids: Tensor of shape (batch_size,), IDs of item2. Not read:
          whenever the label differs from item1's ID, item2 is treated as the
          positive item.
        - item2_embed: Tensor of shape (batch_size, embed_dim), embeddings for item2.
        - labels: Tensor of shape (batch_size,), IDs of the correct (positive) item.

        Returns:
        - loss: Scalar tensor, mean of -log(sigmoid(pos_score - neg_score)).
        """
        # True where item1 is the positive item of the pair.
        item1_is_positive = labels == item1_ids

        # Affinity scores: dot product between user and item embeddings.
        score1 = (user_embed * item1_embed).sum(dim=1)
        score2 = (user_embed * item2_embed).sum(dim=1)

        # Positive-minus-negative margin for every pair.
        margin = torch.where(item1_is_positive, score1 - score2, score2 - score1)

        # logsigmoid is numerically stable; log(sigmoid(x)) underflows to
        # -inf for large negative margins and makes the whole loss infinite.
        return -F.logsigmoid(margin).mean()


### Pairwise Warm - Experiment #2: Two-Tower Model initialized with user and item embeddings previously trained by MF

In [None]:
# ======= Two-Tower Model (User & Item Networks) =======

class TwoTowerModelPrevEmbedInit(nn.Module):
    """Two-tower pairwise model whose ID embeddings start from pretrained weights.

    Reads these notebook-level globals (they are not constructor arguments):
    - ``initial_user_embed`` / ``initial_item_embed``: modules whose ``.weight``
      is copied into the user/item embedding tables in ``__init__``.
    - ``item_metadata``: tensor indexed by item ID in ``forward``
      (presumably (num_items, item_metadata_dim) -- confirm against the caller).
    - ``device``: target device for the ID tensors in ``forward``.
    """

    def __init__(self, num_users, num_items, embedding_dim, item_metadata_dim):
        # Zero-argument super() binds to this class; naming a different class
        # here makes construction fail with a TypeError.
        super().__init__()

        # ID embedding tables, overwritten with the pretrained weights below.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.user_embedding.weight.data.copy_(initial_user_embed.weight.data)
        self.item_embedding.weight.data.copy_(initial_item_embed.weight.data)

        # Item tower: projects raw item metadata to embedding_dim.
        self.item_fc = nn.Sequential(
            nn.Linear(item_metadata_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embedding_dim),
        )

        # Second level: fuses the metadata-based and ID-based item embeddings.
        self.item_fc2 = nn.Sequential(
            nn.Linear(2 * embedding_dim, 512),  # input is the two embeddings concatenated
            nn.ReLU(),
            nn.Linear(512, embedding_dim),
        )

    def forward(self, user_ids, item1_ids, item2_ids):
        """Return (user_embed, item1_embed, item2_embed), each (batch, embedding_dim)."""
        user_ids = user_ids.to(device)
        item1_ids = item1_ids.to(device)
        item2_ids = item2_ids.to(device)
        user_embed = self.user_embedding(user_ids)  # (batch, embedding_dim)

        # Item metadata-based embedding
        item1_meta_embed = self.item_fc(item_metadata[item1_ids])  # (batch, embedding_dim)
        item2_meta_embed = self.item_fc(item_metadata[item2_ids])  # (batch, embedding_dim)

        # Item ID-based embedding (pretrained)
        item1_id_embed = self.item_embedding(item1_ids)  # (batch, embedding_dim)
        item2_id_embed = self.item_embedding(item2_ids)  # (batch, embedding_dim)

        # Concatenate metadata-based and ID-based embeddings
        item1_combined = torch.cat([item1_meta_embed, item1_id_embed], dim=1)  # (batch, 2*embedding_dim)
        item2_combined = torch.cat([item2_meta_embed, item2_id_embed], dim=1)  # (batch, 2*embedding_dim)

        # Second-level representation learning
        item1_embed_level2 = self.item_fc2(item1_combined)  # (batch, embedding_dim)
        item2_embed_level2 = self.item_fc2(item2_combined)  # (batch, embedding_dim)

        return user_embed, item1_embed_level2, item2_embed_level2



class BPRLoss(nn.Module):
    """Pairwise Bayesian Personalized Ranking (BPR) loss on dot-product scores."""

    def __init__(self):
        super().__init__()

    def forward(self, user_embed, item1_ids, item1_embed, item2_ids, item2_embed, labels):
        """
        Compute the BPR loss for a batch of (user, item1, item2) triples.

        Args:
        - user_embed: Tensor of shape (batch_size, embed_dim), user embeddings.
        - item1_ids: Tensor of shape (batch_size,), IDs of item1.
        - item1_embed: Tensor of shape (batch_size, embed_dim), embeddings for item1.
        - item2_ids: Tensor of shape (batch_size,), IDs of item2 (not read here;
          any row whose label differs from item1 treats item2 as the positive).
        - item2_embed: Tensor of shape (batch_size, embed_dim), embeddings for item2.
        - labels: Tensor of shape (batch_size,), IDs of the correct (positive) item.

        Returns:
        - loss: Scalar tensor, mean of -log(sigmoid(pos_score - neg_score)).
        """
        # True where item1 is the positive item of the pair.
        item1_is_positive = labels == item1_ids

        # Dot-product affinity between the user and each candidate.
        score1 = (user_embed * item1_embed).sum(dim=1)
        score2 = (user_embed * item2_embed).sum(dim=1)

        pos_score = torch.where(item1_is_positive, score1, score2)
        neg_score = torch.where(item1_is_positive, score2, score1)

        # logsigmoid is the numerically stable form of log(sigmoid(x)):
        # sigmoid underflows to 0 for very negative margins, which would make
        # torch.log return -inf and the loss infinite.
        return -F.logsigmoid(pos_score - neg_score).mean()


### Pairwise warm - Experiment #3 Metadata to pretrained item embeddings

In [None]:
class MetadataToEmbedding(nn.Module):
    """Feed-forward regressor from an item-metadata vector to an embedding.

    The network is a chain of Linear layers of widths
    ``input_size -> hidden_sizes... -> output_size`` with a ReLU after every
    Linear layer except the last one.
    """

    def __init__(self, input_size=3075, output_size=24, hidden_sizes=[512, 256]):
        super().__init__()
        widths = [input_size] + hidden_sizes + [output_size]
        final_idx = len(widths) - 2  # index of the last Linear layer

        modules = []
        for idx, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            modules.append(nn.Linear(fan_in, fan_out))
            # The output layer stays linear; every earlier layer gets a ReLU.
            if idx < final_idx:
                modules.append(nn.ReLU())

        self.model = nn.Sequential(*modules)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result."""
        return self.model(x)

def train_model(metadata, warm_item_embed, val_split=0.2, output_size=24, epochs=10, batch_size=64, lr=1e-3, hidden_sizes=[512, 256]):
    """Train a MetadataToEmbedding regressor to reproduce warm item embeddings.

    The last ``int(len(metadata) * val_split)`` rows are held out for
    validation; the remaining leading rows are used for training. The model is
    optimised with Adam on an MSE loss, and the weights with the lowest mean
    validation loss are saved to ``best_metadata_to_embedding.pth``.

    Args:
        metadata: Row-sliceable 2-D item metadata (needs ``len``, slicing and
            ``.shape[1]``); row i is assumed to describe the same item as row i
            of the warm embedding table -- confirm against the caller.
        warm_item_embed: Module whose ``.weight`` holds the target embeddings.
        val_split: Fraction of rows (taken from the tail) used for validation.
        output_size: Width of the predicted embedding.
        epochs: Number of training epochs.
        batch_size: Mini-batch size for both loaders.
        lr: Adam learning rate.
        hidden_sizes: Hidden layer widths passed to MetadataToEmbedding.

    Returns:
        The model with its last-epoch weights (not necessarily the best
        checkpoint, which is only written to disk).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Regression targets: the warm embedding table, detached from its graph.
    warm_embeddings = warm_item_embed.weight.detach().to(device)

    val_size = int(len(metadata) * val_split)

    def _tail_split(rows):
        # Use an explicit cut index: ``rows[:-0]`` is empty, so negative
        # slicing would leave no training data when val_size is 0.
        cut = max(len(rows) - val_size, 0)
        return rows[:cut], rows[cut:]

    train_meta, val_meta = _tail_split(metadata)
    train_emb, val_emb = _tail_split(warm_embeddings)

    # ItemDataset is defined elsewhere in the notebook; it is assumed to pair
    # each metadata row with its embedding row.
    train_dataset = ItemDataset(train_meta, train_emb)
    val_dataset = ItemDataset(val_meta, val_emb)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    has_validation = len(val_loader) > 0

    model = MetadataToEmbedding(input_size=metadata.shape[1], output_size=output_size, hidden_sizes=hidden_sizes).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

    best_val_loss = float("inf")

    for epoch in range(epochs):

        model.train()
        train_loss = 0.0

        for meta, emb in train_loader:
            meta, emb = meta.to(device), emb.to(device)

            optimizer.zero_grad()
            outputs = model(meta)
            loss = criterion(outputs, emb)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)

        if not has_validation:
            # Nothing held out: report the training loss only and skip
            # checkpoint selection, which needs a validation score.
            print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.6f}")
            continue

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for meta, emb in val_loader:
                meta, emb = meta.to(device), emb.to(device)
                outputs = model(meta)
                val_loss += criterion(outputs, emb).item()

        # Track the per-batch mean so the reported best value is the same
        # quantity as the one printed for every epoch.
        avg_val_loss = val_loss / len(val_loader)

        print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_val_loss:.6f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "best_metadata_to_embedding.pth")

    print("Training complete. Best validation loss:", best_val_loss)
    return model

### Pairwise Warm Experiment #4 uses the model described as the NFC model above

### Pairwise Cold Experiment #1 - Randomized weights

In [None]:
# ======= Two-Tower Model (User & Item Networks) =======
class TwoTowerModelPretrainedRandomInit(nn.Module):
    """Two-tower pairwise model with a randomly initialised user embedding table.

    ``forward`` reads two notebook-level globals: ``device`` and
    ``item_embeddings_tensor`` (indexed by item ID; presumably
    (num_items, item_metadata_dim) -- confirm against the caller).
    ``num_items`` is accepted for a uniform constructor signature but not used.
    """

    def __init__(self, num_users, num_items, embedding_dim, item_metadata_dim):
        # Zero-argument super() binds to this class; naming a different class
        # here makes construction fail with a TypeError.
        super().__init__()

        # User tower: a learnable embedding per user with default initialisation.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)

        # Item tower: projects item features to embedding_dim.
        self.item_fc = nn.Sequential(
            nn.Linear(item_metadata_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embedding_dim),
        )

    def forward(self, user_ids, item1_ids, item2_ids):
        """Return (user_embed, item1_embed, item2_embed), each (batch, embedding_dim)."""
        # Move every ID tensor to the model's device, as the sibling models do;
        # leaving user_ids behind breaks the embedding lookup on GPU.
        user_ids = user_ids.to(device)
        item1_ids = item1_ids.to(device)
        item2_ids = item2_ids.to(device)

        user_embed = self.user_embedding(user_ids)  # (batch, embedding_dim)
        item1_embed = self.item_fc(item_embeddings_tensor[item1_ids])  # (batch, embedding_dim)
        item2_embed = self.item_fc(item_embeddings_tensor[item2_ids])  # (batch, embedding_dim)

        return user_embed, item1_embed, item2_embed

# ======= Pairwise BPR Loss =======


class BPRLoss(nn.Module):
    """Pairwise Bayesian Personalized Ranking (BPR) loss on dot-product scores."""

    def __init__(self):
        super().__init__()

    def forward(self, user_embed, item1_ids, item1_embed, item2_ids, item2_embed, labels):
        """
        Compute the BPR loss for a batch of (user, item1, item2) triples.

        Args:
        - user_embed: Tensor of shape (batch_size, embed_dim), user embeddings.
        - item1_ids: Tensor of shape (batch_size,), IDs of item1.
        - item1_embed: Tensor of shape (batch_size, embed_dim), embeddings for item1.
        - item2_ids: Tensor of shape (batch_size,), IDs of item2 (not read here;
          any row whose label differs from item1 treats item2 as the positive).
        - item2_embed: Tensor of shape (batch_size, embed_dim), embeddings for item2.
        - labels: Tensor of shape (batch_size,), IDs of the correct (positive) item.

        Returns:
        - loss: Scalar tensor, mean of -log(sigmoid(pos_score - neg_score)).
        """
        # True where item1 is the positive item of the pair.
        item1_is_positive = labels == item1_ids

        # Dot-product affinity between the user and each candidate.
        score1 = (user_embed * item1_embed).sum(dim=1)
        score2 = (user_embed * item2_embed).sum(dim=1)

        pos_score = torch.where(item1_is_positive, score1, score2)
        neg_score = torch.where(item1_is_positive, score2, score1)

        # logsigmoid is the numerically stable form of log(sigmoid(x)):
        # sigmoid underflows to 0 for very negative margins, which would make
        # torch.log return -inf and the loss infinite.
        return -F.logsigmoid(pos_score - neg_score).mean()


### Pairwise Cold Experiment #2 - pretrained on MF embeddings

In [None]:
# ======= Two-Tower Model (User & Item Networks) =======
class TwoTowerModelPretrainedUserEmbeddings(nn.Module):
    """Two-tower pairwise model whose user embeddings start from pretrained weights.

    Reads these notebook-level globals (they are not constructor arguments):
    - ``initial_user_embed``: module whose ``.weight`` is copied into the user
      embedding table in ``__init__``.
    - ``item_embeddings_tensor``: tensor indexed by item ID in ``forward``
      (presumably (num_items, item_metadata_dim) -- confirm against the caller).
    - ``device``: target device for the ID tensors in ``forward``.
    ``num_items`` is accepted for a uniform constructor signature but not used.
    """

    def __init__(self, num_users, num_items, embedding_dim, item_metadata_dim):
        # Zero-argument super() binds to this class; naming a different class
        # here makes construction fail with a TypeError.
        super().__init__()

        # User tower: embedding table overwritten with the pretrained weights.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.user_embedding.weight.data.copy_(initial_user_embed.weight.data)

        # Item tower: projects item features to embedding_dim.
        self.item_fc = nn.Sequential(
            nn.Linear(item_metadata_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embedding_dim),
        )

    def forward(self, user_ids, item1_ids, item2_ids):
        """Return (user_embed, item1_embed, item2_embed), each (batch, embedding_dim)."""
        # Move every ID tensor to the model's device, as the sibling models do;
        # leaving user_ids behind breaks the embedding lookup on GPU.
        user_ids = user_ids.to(device)
        item1_ids = item1_ids.to(device)
        item2_ids = item2_ids.to(device)

        user_embed = self.user_embedding(user_ids)  # (batch, embedding_dim)
        item1_embed = self.item_fc(item_embeddings_tensor[item1_ids])  # (batch, embedding_dim)
        item2_embed = self.item_fc(item_embeddings_tensor[item2_ids])  # (batch, embedding_dim)

        return user_embed, item1_embed, item2_embed

# ======= Pairwise BPR Loss =======


class BPRLoss(nn.Module):
    """Pairwise Bayesian Personalized Ranking (BPR) loss on dot-product scores."""

    def __init__(self):
        super().__init__()

    def forward(self, user_embed, item1_ids, item1_embed, item2_ids, item2_embed, labels):
        """
        Compute the BPR loss for a batch of (user, item1, item2) triples.

        Args:
        - user_embed: Tensor of shape (batch_size, embed_dim), user embeddings.
        - item1_ids: Tensor of shape (batch_size,), IDs of item1.
        - item1_embed: Tensor of shape (batch_size, embed_dim), embeddings for item1.
        - item2_ids: Tensor of shape (batch_size,), IDs of item2 (not read here;
          any row whose label differs from item1 treats item2 as the positive).
        - item2_embed: Tensor of shape (batch_size, embed_dim), embeddings for item2.
        - labels: Tensor of shape (batch_size,), IDs of the correct (positive) item.

        Returns:
        - loss: Scalar tensor, mean of -log(sigmoid(pos_score - neg_score)).
        """
        # True where item1 is the positive item of the pair.
        item1_is_positive = labels == item1_ids

        # Dot-product affinity between the user and each candidate.
        score1 = (user_embed * item1_embed).sum(dim=1)
        score2 = (user_embed * item2_embed).sum(dim=1)

        pos_score = torch.where(item1_is_positive, score1, score2)
        neg_score = torch.where(item1_is_positive, score2, score1)

        # logsigmoid is the numerically stable form of log(sigmoid(x)):
        # sigmoid underflows to 0 for very negative margins, which would make
        # torch.log return -inf and the loss infinite.
        return -F.logsigmoid(pos_score - neg_score).mean()


### Pairwise Cold Experiment #3 - Meta Data

In [None]:
# ======= Two-Tower Model (User & Item Networks) =======
class TwoTowerModel(nn.Module):
    """Two-tower pairwise model with feature-based user and item towers.

    ``forward`` reads three notebook-level globals: ``device``,
    ``user_embeddings`` (indexed by user ID; rows must have
    ``user_feature_dim`` features) and ``item_embeddings_tensor`` (indexed by
    item ID; rows must have ``item_metadata_dim`` features).
    ``num_users`` and ``num_items`` are accepted for a uniform constructor
    signature but not used.
    """

    def __init__(self, num_users, num_items, embedding_dim, item_metadata_dim, user_feature_dim=1027):
        """
        Args:
        - num_users, num_items: unused; kept for signature compatibility.
        - embedding_dim: output width of both towers.
        - item_metadata_dim: input width of the item tower.
        - user_feature_dim: input width of the user tower (defaults to 1027,
          the width this notebook's user features use).
        """
        super().__init__()

        # User tower: user feature vector -> 512 -> embedding_dim.
        self.user_fc = nn.Sequential(
            nn.Linear(user_feature_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embedding_dim),
        )
        # Item tower: item feature vector -> 512 -> embedding_dim.
        self.item_fc = nn.Sequential(
            nn.Linear(item_metadata_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embedding_dim),
        )

    def forward(self, user_ids, item1_ids, item2_ids):
        """Return (user_embed, item1_embed, item2_embed), each (batch, embedding_dim)."""
        # IDs are cast to long because they are used as row indices below.
        user_ids = user_ids.long().to(device)
        item1_ids = item1_ids.to(device)
        item2_ids = item2_ids.to(device)

        # Look up each entity's feature row, then project it with its tower.
        user_embed = self.user_fc(user_embeddings[user_ids])  # (batch_size, embedding_dim)
        item1_embed = self.item_fc(item_embeddings_tensor[item1_ids])  # (batch, embedding_dim)
        item2_embed = self.item_fc(item_embeddings_tensor[item2_ids])  # (batch, embedding_dim)

        return user_embed, item1_embed, item2_embed

# ======= Pairwise BPR Loss =======


class BPRLoss(nn.Module):
    """Bayesian Personalized Ranking loss for (user, item1, item2) comparisons."""

    def __init__(self):
        super().__init__()

    def forward(self, user_embed, item1_ids, item1_embed, item2_ids, item2_embed, labels):
        """
        Mean negative log-sigmoid of the positive-minus-negative score margin.

        Args:
        - user_embed: (batch_size, embed_dim) user embeddings.
        - item1_ids: (batch_size,) IDs of the first candidate.
        - item1_embed: (batch_size, embed_dim) embeddings of the first candidate.
        - item2_ids: (batch_size,) IDs of the second candidate (not read here).
        - item2_embed: (batch_size, embed_dim) embeddings of the second candidate.
        - labels: (batch_size,) ID of the positive item in each pair.

        Returns:
        - Scalar loss tensor.
        """
        # Dot-product affinity of the user with each candidate.
        affinity1 = torch.sum(user_embed * item1_embed, dim=1)
        affinity2 = torch.sum(user_embed * item2_embed, dim=1)

        # Where the label names item1 it is the positive; otherwise item2 is.
        first_is_positive = labels.eq(item1_ids)
        margin = torch.where(first_is_positive, affinity1 - affinity2, affinity2 - affinity1)

        return -F.logsigmoid(margin).mean()


### Pairwise cold - Experiment 4 - Bert features

In [None]:
class TwoTowerModelBertFeatures(nn.Module):
    """Two-tower pairwise model whose user vector is pooled from interaction history.

    ``forward`` reads these notebook-level globals:
    - ``device``: target device for the ID tensors.
    - ``padded_user_interactions``: per-user item indices, padded with negative
      values (the notebook comments give the shape as (num_users, 256)).
    - ``user_interaction_mask``: same shape; multiplied into the projected
      history so padded slots contribute nothing (presumably 1 = real
      interaction, 0 = padding -- confirm where it is built).
    - ``item_embeddings_tensor``: (num_items, item_metadata_dim) item features.
    ``num_users`` and ``num_items`` are accepted for a uniform constructor
    signature but not used.
    """

    def __init__(self, num_users, num_items, embedding_dim, item_metadata_dim):
        # Zero-argument super() binds to this class; naming a different class
        # here makes construction fail with a TypeError.
        super().__init__()

        # User tower: applied to the features of every item in the user's history.
        self.user_fc = nn.Sequential(
            nn.Linear(item_metadata_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embedding_dim),
        )
        # Item tower: applied to the features of the candidate items.
        self.item_fc = nn.Sequential(
            nn.Linear(item_metadata_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embedding_dim),
        )

    def forward(self, user_ids, item1_ids, item2_ids):
        """Return (user_embed, item1_embed, item2_embed), each (batch, embedding_dim)."""
        user_ids = user_ids.long().to(device)
        item1_ids = item1_ids.to(device)
        item2_ids = item2_ids.to(device)

        # Retrieve the precomputed interactions and mask for each user in the batch
        batch_interactions = padded_user_interactions[user_ids]  # (batch_size, 256)
        batch_mask = user_interaction_mask[user_ids]            # (batch_size, 256)

        # Padded slots hold negative indices; point them at row 0 so the lookup
        # is valid. Their contribution is removed by the mask below.
        valid_batch_interactions = batch_interactions.clone()
        valid_batch_interactions[valid_batch_interactions < 0] = 0

        # Gather item features for every history slot.
        batch_item_metadata = item_embeddings_tensor[valid_batch_interactions]  # (batch_size, 256, item_metadata_dim)

        # Project each history item with the user tower.
        processed = self.user_fc(batch_item_metadata)  # (batch_size, 256, embedding_dim)

        # Zero out padded positions using the mask.
        batch_mask_expanded = batch_mask.unsqueeze(-1)  # (batch_size, 256, 1)
        processed = processed * batch_mask_expanded

        # Masked mean over the history; the epsilon avoids division by zero for
        # users whose mask sums to zero.
        user_embed = processed.sum(dim=1) / (batch_mask.sum(dim=1, keepdim=True) + 1e-8)

        # Candidate items go through the item tower.
        item1_embed = self.item_fc(item_embeddings_tensor[item1_ids])
        item2_embed = self.item_fc(item_embeddings_tensor[item2_ids])

        return user_embed, item1_embed, item2_embed
# ======= Pairwise BPR Loss =======


class BPRLoss(nn.Module):
    """Bayesian Personalized Ranking loss over embedding dot products."""

    def __init__(self):
        super().__init__()

    def forward(self, user_embed, item1_ids, item1_embed, item2_ids, item2_embed, labels):
        """
        Score both candidates against the user and penalise the wrong ordering.

        Args:
        - user_embed: (batch_size, embed_dim) user embeddings.
        - item1_ids: (batch_size,) IDs of candidate 1.
        - item1_embed: (batch_size, embed_dim) embeddings of candidate 1.
        - item2_ids: (batch_size,) IDs of candidate 2 (not read here).
        - item2_embed: (batch_size, embed_dim) embeddings of candidate 2.
        - labels: (batch_size,) ID of the positive item in each pair.

        Returns:
        - Scalar tensor: mean of -logsigmoid(positive score - negative score).
        """
        scores = [
            (user_embed * candidate).sum(dim=1)
            for candidate in (item1_embed, item2_embed)
        ]
        first_score, second_score = scores

        # Candidate 1 is the positive exactly where its ID matches the label.
        pick_first = labels == item1_ids
        positive = torch.where(pick_first, first_score, second_score)
        negative = torch.where(pick_first, second_score, first_score)

        log_likelihood = F.logsigmoid(positive - negative)
        return -log_likelihood.mean()


### Pairwise cold -  Experiment 5 - Bert features using similarity

In [None]:
class TwoTowerModelBertFeaturesSimilarity(nn.Module):
    """Pairwise model scoring a candidate by its best match in the user's history.

    Each candidate's score is the maximum cosine similarity between its
    item-tower embedding and the user-tower embeddings of the user's past
    items, ignoring padded slots and slots equal to either candidate.

    ``forward`` reads these notebook-level globals:
    - ``device``: target device for the ID tensors.
    - ``padded_user_interactions``: per-user item indices, padded with negative
      values (the notebook comments give the shape as (num_users, 256)).
    - ``user_interaction_mask``: same shape; presumably 1 = real interaction,
      0 = padding -- confirm where it is built.
    - ``item_embeddings_tensor``: item features indexed by item ID.
    ``num_users`` and ``num_items`` are accepted for a uniform constructor
    signature but not used.
    """

    def __init__(self, num_users, num_items, embedding_dim, item_metadata_dim):
        # Zero-argument super() binds to this class; naming a different class
        # here makes construction fail with a TypeError.
        super().__init__()

        # User tower: applied to the features of every item in the user's history.
        self.user_fc = nn.Sequential(
            nn.Linear(item_metadata_dim, 512),
            nn.LeakyReLU(0.1),
            nn.Linear(512, embedding_dim),
        )
        # Item tower: applied to the features of the candidate items.
        self.item_fc = nn.Sequential(
            nn.Linear(item_metadata_dim, 512),
            nn.LeakyReLU(0.1),
            nn.Linear(512, embedding_dim),
        )

    @staticmethod
    def _best_history_similarity(history_embed, candidate_embed, valid_mask):
        """Max cosine similarity between a candidate and the valid history slots.

        Args:
        - history_embed: (batch, history_len, dim) history embeddings.
        - candidate_embed: (batch, dim) candidate embeddings.
        - valid_mask: (batch, history_len); entries > 0 mark usable slots.

        Returns:
        - (batch,) scores; a row with no usable slot scores 0.0.
        """
        sims = F.cosine_similarity(history_embed, candidate_embed.unsqueeze(1), dim=2)
        usable = valid_mask > 0
        # Excluded slots must not take part in the max. Zeroing their
        # embeddings is not enough: a zero vector has cosine similarity 0,
        # which beats every genuinely negative similarity.
        sims = sims.masked_fill(~usable, float("-inf"))
        best = sims.max(dim=1).values
        # A row without any usable slot would otherwise be -inf.
        return torch.where(usable.any(dim=1), best, torch.zeros_like(best))

    def forward(self, user_ids, item1_ids, item2_ids):
        """Return (user_score1, user_score2), each of shape (batch_size,)."""
        user_ids = user_ids.long().to(device)
        item1_ids = item1_ids.to(device)
        item2_ids = item2_ids.to(device)

        # Retrieve precomputed interactions and mask for each user in the batch
        batch_interactions = padded_user_interactions[user_ids]  # (batch_size, 256)
        batch_mask = user_interaction_mask[user_ids]             # (batch_size, 256)

        # Exclude history slots equal to either candidate, so a candidate is
        # never scored against its own occurrence in the history.
        exclusion_mask = (
            (batch_interactions != item1_ids.unsqueeze(1)) &
            (batch_interactions != item2_ids.unsqueeze(1))
        ).float()  # (batch_size, 256)

        # A slot is usable only if it is real history and not excluded.
        adjusted_mask = batch_mask * exclusion_mask  # (batch_size, 256)

        # Padded slots hold negative indices; point them at row 0 so the lookup
        # is valid. They are removed from the scores via adjusted_mask.
        valid_batch_interactions = batch_interactions.clone()
        valid_batch_interactions[valid_batch_interactions < 0] = 0

        # Gather item features for every history slot and project them.
        batch_item_metadata = item_embeddings_tensor[valid_batch_interactions]  # (batch_size, 256, item_metadata_dim)
        processed = self.user_fc(batch_item_metadata)  # (batch_size, 256, embedding_dim)

        # Candidate 1:
        item1_embed = self.item_fc(item_embeddings_tensor[item1_ids])  # (batch_size, embedding_dim)
        user_score1 = self._best_history_similarity(processed, item1_embed, adjusted_mask)

        # Candidate 2:
        item2_embed = self.item_fc(item_embeddings_tensor[item2_ids])
        user_score2 = self._best_history_similarity(processed, item2_embed, adjusted_mask)

        return user_score1, user_score2
class BPRLoss(nn.Module):
    """Bayesian Personalized Ranking loss over precomputed candidate scores."""

    def __init__(self):
        super().__init__()

    def forward(self, score1, score2, item1_ids, labels):
        """
        Mean negative log-sigmoid of the positive-minus-negative score margin.

        Args:
        - score1: (batch_size,) score of candidate item1.
        - score2: (batch_size,) score of candidate item2.
        - item1_ids: (batch_size,) IDs of candidate item1.
        - labels: (batch_size,) ID of the positive item in each pair.

        Returns:
        - Scalar loss tensor.
        """
        # Candidate 1 is the positive exactly where its ID matches the label;
        # everywhere else candidate 2 is treated as the positive.
        candidate1_wins = labels.eq(item1_ids)
        margin = torch.where(candidate1_wins, score1 - score2, score2 - score1)
        return -F.logsigmoid(margin).mean()


# First models - were too computationally expensive

In [1]:
class AdvancedMFModel(nn.Module):
    """Matrix-factorisation scorer whose item vector mixes ID and content features.

    An item's learnable ID embedding is concatenated with its image and title
    embeddings and passed through a feed-forward network; the score is the dot
    product of that result with the user's embedding.

    Args:
        image_embeddings: dict mapping item ID -> 1-D image embedding tensor.
        title_embeddings: dict mapping item ID -> 1-D title embedding tensor.
        embedding_dim: width of the user/item embeddings and of the FFN output.
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        device: device that all tensors and layers are moved to.
        num_hidden_layers: number of hidden blocks in the FFN.
        hidden_dim_list: width of each hidden block; defaults to
            ``embedding_dim * 2`` for every block.
        activation_fn: activation class instantiated once per hidden block.
        dropout_rate: dropout probability; no Dropout layer is added when 0.
        batch_norm: whether to add BatchNorm1d after each hidden Linear.
    """

    def __init__(self,
        image_embeddings,
        title_embeddings,
        embedding_dim,
        num_users,
        num_items,
        device,
        num_hidden_layers=2,
        hidden_dim_list=None,
        activation_fn=nn.ReLU,
        dropout_rate=0.2,
        batch_norm=False):

        super().__init__()

        self.device = device

        # Keep per-item content embeddings as plain dicts on the target device.
        self.image_embeddings = {item: vec.to(device) for item, vec in image_embeddings.items()}
        self.title_embeddings = {item: vec.to(device) for item, vec in title_embeddings.items()}

        # Learnable collaborative embeddings with default initialisation.
        self.user_embeddings = nn.Embedding(num_users, embedding_dim).to(device)
        self.item_embeddings = nn.Embedding(num_items, embedding_dim).to(device)

        if hidden_dim_list is None:
            hidden_dim_list = [embedding_dim * 2] * num_hidden_layers

        # FFN input = ID embedding + image embedding + title embedding widths,
        # read off the first entry of each dict.
        image_width = next(iter(image_embeddings.values())).shape[0]
        title_width = next(iter(title_embeddings.values())).shape[0]
        in_features = embedding_dim + image_width + title_width

        stack = []
        for depth in range(num_hidden_layers):
            out_features = hidden_dim_list[depth]
            stack.append(nn.Linear(in_features, out_features).to(device))
            if batch_norm:
                stack.append(nn.BatchNorm1d(out_features).to(device))
            stack.append(activation_fn())
            if dropout_rate > 0:
                stack.append(nn.Dropout(dropout_rate))
            in_features = out_features
        # Final projection back to embedding_dim so the hidden widths are free.
        stack.append(nn.Linear(in_features, embedding_dim).to(device))

        self.ffn = nn.Sequential(*stack)

    def forward(self, user_ids, item_ids):
        """Return one score per (user, item) pair, shape (batch,)."""
        user_ids = user_ids.to(self.device)
        user_vecs = self.user_embeddings(user_ids).to(self.device)

        # The content embeddings live in dicts, so they are looked up per item.
        keys = [int(item_id) for item_id in item_ids]
        image_vecs = torch.stack([self.image_embeddings[key] for key in keys]).to(self.device)
        title_vecs = torch.stack([self.title_embeddings[key] for key in keys]).to(self.device)

        item_index = item_ids.to(torch.long).to(self.device)
        id_vecs = self.item_embeddings(item_index).to(self.device)

        item_features = torch.cat((id_vecs, image_vecs, title_vecs), dim=1)
        item_vecs = self.ffn(item_features)

        return (user_vecs * item_vecs).sum(dim=1)

In [None]:
class AdvancedMFModel2(nn.Module):
    """Content-aware matrix-factorisation scorer with uniform embedding init.

    An item's learnable ID embedding is concatenated with its image and title
    embeddings and passed through a feed-forward network; the score is the dot
    product of that result with the user's embedding. Both embedding tables
    are initialised uniformly in [0, 0.5].

    Args:
        image_embeddings: dict mapping item ID -> 1-D image embedding tensor.
        title_embeddings: dict mapping item ID -> 1-D title embedding tensor.
        embedding_dim: width of the user/item embeddings and of the FFN output.
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        device: device that all tensors and layers are moved to.
        num_hidden_layers: number of hidden blocks in the FFN.
        hidden_dim_list: width of each hidden block; defaults to
            ``embedding_dim * 2`` for every block.
        activation_fn: activation class instantiated once per hidden block.
        dropout_rate: dropout probability; no Dropout layer is added when 0.
        batch_norm: whether to add BatchNorm1d after each hidden Linear.
    """

    def __init__(self,
        image_embeddings,
        title_embeddings,
        embedding_dim,
        num_users,
        num_items,
        device,
        num_hidden_layers=2,
        hidden_dim_list=None,
        activation_fn=nn.ReLU,
        dropout_rate=0.2,
        batch_norm=False):

        super().__init__()

        self.device = device

        # Keep per-item content embeddings as plain dicts on the target device.
        self.image_embeddings = {item: vec.to(device) for item, vec in image_embeddings.items()}
        self.title_embeddings = {item: vec.to(device) for item, vec in title_embeddings.items()}

        # Learnable collaborative embeddings, drawn uniformly from [0, 0.5].
        self.user_embeddings = nn.Embedding(num_users, embedding_dim).to(device)
        nn.init.uniform_(self.user_embeddings.weight, 0, 0.5)
        self.item_embeddings = nn.Embedding(num_items, embedding_dim).to(device)
        nn.init.uniform_(self.item_embeddings.weight, 0, 0.5)

        if hidden_dim_list is None:
            hidden_dim_list = [embedding_dim * 2] * num_hidden_layers

        # FFN input = ID embedding + image embedding + title embedding widths,
        # read off the first entry of each dict.
        image_width = next(iter(image_embeddings.values())).shape[0]
        title_width = next(iter(title_embeddings.values())).shape[0]
        in_features = embedding_dim + image_width + title_width

        stack = []
        for depth in range(num_hidden_layers):
            out_features = hidden_dim_list[depth]
            stack.append(nn.Linear(in_features, out_features).to(device))
            if batch_norm:
                stack.append(nn.BatchNorm1d(out_features).to(device))
            stack.append(activation_fn())
            if dropout_rate > 0:
                stack.append(nn.Dropout(dropout_rate))
            in_features = out_features
        # Final projection back to embedding_dim so the hidden widths are free.
        stack.append(nn.Linear(in_features, embedding_dim).to(device))

        self.ffn = nn.Sequential(*stack)

    def forward(self, user_ids, item_ids):
        """Return one score per (user, item) pair, shape (batch,)."""
        user_ids = user_ids.to(self.device)
        user_vecs = self.user_embeddings(user_ids).to(self.device)

        # The content embeddings live in dicts, so they are looked up per item.
        keys = [int(item_id) for item_id in item_ids]
        image_vecs = torch.stack([self.image_embeddings[key] for key in keys]).to(self.device)
        title_vecs = torch.stack([self.title_embeddings[key] for key in keys]).to(self.device)

        item_index = item_ids.to(torch.long).to(self.device)
        id_vecs = self.item_embeddings(item_index).to(self.device)

        item_features = torch.cat((id_vecs, image_vecs, title_vecs), dim=1)
        item_vecs = self.ffn(item_features)

        return (user_vecs * item_vecs).sum(dim=1)

In [1]:
class ClassicMFModelWithSmartBias(nn.Module):
    """Biased matrix factorisation with data-driven bias initialisation.

    Prediction = global bias + user bias + item bias + <user factors, item factors>.
    The global bias starts at the mean rating of ``df2``; each user/item bias
    starts at that user's/item's mean rating minus the global mean, and at 0
    for indices that do not appear in ``df2``.

    Args:
        df2: ratings frame with ``user_idx``, ``item_idx`` and ``rating`` columns.
        num_users: number of rows in the user tables.
        num_items: number of rows in the item tables.
        embedding_dim: width of the latent factors.
        device: stored on the instance; the constructor does not move anything.
    """

    def __init__(self,
                 df2: pd.DataFrame,
                 num_users: int,
                 num_items: int,
                 embedding_dim: int,
                 device: str):

        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.device = device

        overall_mean = df2['rating'].mean()

        # Mean rating per user and per item (Series indexed by user_idx / item_idx).
        per_user_mean = df2.groupby('user_idx')['rating'].mean()
        per_item_mean = df2.groupby('item_idx')['rating'].mean()

        def _centered(group_means, size):
            # Dense float32 vector of (group mean - overall mean); rows absent
            # from the grouped series stay at 0.
            offsets = torch.zeros(size, dtype=torch.float32)
            for idx, mean_val in group_means.items():
                offsets[idx] = mean_val - overall_mean
            return offsets

        user_bias_init = _centered(per_user_mean, num_users)
        item_bias_init = _centered(per_item_mean, num_items)

        # Latent factors keep nn.Embedding's default initialisation.
        self.user_factors = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
        self.item_factors = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_dim)

        # Biases are stored as width-1 embeddings so they can be looked up by ID.
        self.user_bias = nn.Embedding(num_embeddings=num_users, embedding_dim=1)
        self.item_bias = nn.Embedding(num_embeddings=num_items, embedding_dim=1)

        # Replace the random bias weights with the centred means, as column vectors.
        self.user_bias.weight.data = user_bias_init.unsqueeze(1)  # [num_users, 1]
        self.item_bias.weight.data = item_bias_init.unsqueeze(1)  # [num_items, 1]

        # Learnable global offset, started at the overall mean rating.
        self.global_bias = nn.Parameter(torch.tensor([overall_mean], dtype=torch.float32))

    def forward(self, user_ids: torch.LongTensor, item_ids: torch.LongTensor):
        """Return the predicted rating for each (user, item) pair, shape [batch_size]."""
        interaction = torch.sum(self.user_factors(user_ids) * self.item_factors(item_ids), dim=1)

        user_offset = self.user_bias(user_ids).squeeze(dim=1)  # [batch_size]
        item_offset = self.item_bias(item_ids).squeeze(dim=1)  # [batch_size]

        return self.global_bias + user_offset + item_offset + interaction

In [None]:
class ClassicMFModelWithBias(nn.Module):
    """Matrix-factorisation scorer with a single learnable global bias.

    The prediction for a (user, item) pair is the inner product of their
    latent factor vectors plus a global offset. The offset starts at the mean
    of ``df2['rating']``. There are no per-user or per-item bias terms here.
    """

    def __init__(self, 
                 df2: pd.DataFrame,
                 num_users: int,
                 num_items: int,
                 embedding_dim: int,
                 device: str):
        """Build the factor tables and the global bias.

        Args:
            df2: frame with a ``'rating'`` column; only its mean is used.
            num_users: number of rows in the user factor table.
            num_items: number of rows in the item factor table.
            embedding_dim: width of each latent factor vector.
            device: stored on the instance; this class does not move any
                tensor to it.
        """
        super().__init__()
        self.num_users, self.num_items = num_users, num_items
        self.embedding_dim = embedding_dim
        self.device = device

        # Starting point for the learnable global offset.
        mean_rating = df2['rating'].mean()

        # Latent factor tables for users and items.
        self.user_factors = nn.Embedding(num_users, embedding_dim)
        self.item_factors = nn.Embedding(num_items, embedding_dim)

        self.global_bias = nn.Parameter(torch.tensor([mean_rating], dtype=torch.float32))

        # Small-variance Gaussian init so early predictions stay near the global mean.
        for table in (self.user_factors, self.item_factors):
            nn.init.normal_(table.weight, mean=0.0, std=0.01)

    def forward(self, user_ids: torch.LongTensor, item_ids: torch.LongTensor):
        """Return ``global_bias + <user_factors, item_factors>`` for each pair.

        Args:
            user_ids: user indices, one per pair.
            item_ids: item indices, aligned with ``user_ids``.
        """
        latent_u = self.user_factors(user_ids)   # [batch_size, embedding_dim]
        latent_i = self.item_factors(item_ids)   # [batch_size, embedding_dim]

        interaction = torch.sum(latent_u * latent_i, dim=1)   # [batch_size]
        return self.global_bias + interaction

In [None]:
class ClassicMFModelWithSmartBiasClamp(nn.Module):
    """Biased matrix-factorisation rating model with clamped predictions.

    Prediction for a (user, item) pair::

        clamp(global_bias + user_bias + item_bias + <user_factors, item_factors>,
              rating_min, rating_max)

    The biases get a "smart" initialisation from ``df2``: the global bias
    starts at the overall mean rating, and each user/item bias starts at that
    user's/item's mean rating minus the overall mean. Indices that never occur
    in ``df2`` start at 0. All biases and factors remain learnable.
    """

    def __init__(self, 
                 df2: pd.DataFrame,
                 num_users: int,
                 num_items: int,
                 embedding_dim: int = 32,
                 device: str = 'cpu',
                 rating_min: float = 1.0,
                 rating_max: float = 5.0):
        """Build the embedding tables and seed the bias terms from ``df2``.

        Args:
            df2: frame with ``'user_idx'``, ``'item_idx'`` and ``'rating'`` columns.
            num_users: number of rows in the user tables; every ``user_idx``
                in ``df2`` must be a valid index into them.
            num_items: number of rows in the item tables; every ``item_idx``
                in ``df2`` must be a valid index into them.
            embedding_dim: width of each latent factor vector.
            device: stored on the instance; this class does not move any
                tensor to it.
            rating_min: lower clamp bound applied in ``forward``.
            rating_max: upper clamp bound applied in ``forward``.

        Raises:
            KeyError: if ``df2`` lacks one of the required columns.
            IndexError: if an index in ``df2`` is out of range for its table.
        """
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.device = device
        self.rating_min = rating_min
        self.rating_max = rating_max

        # 1) Overall mean rating (scalar)
        overall_mean = df2['rating'].mean()

        # 2) Per-user / per-item deviation from the overall mean
        user_bias_init = self._bias_init(df2, 'user_idx', overall_mean, num_users)
        item_bias_init = self._bias_init(df2, 'item_idx', overall_mean, num_items)

        # 3) Latent factors (left at nn.Embedding's default initialisation)
        self.user_factors = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
        self.item_factors = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_dim)

        # Each bias is an Embedding with dim=1
        self.user_bias = nn.Embedding(num_embeddings=num_users, embedding_dim=1)
        self.item_bias = nn.Embedding(num_embeddings=num_items, embedding_dim=1)

        # 4) Overwrite the random bias weights with the "smart" init.
        #    copy_ under no_grad keeps the registered Parameter objects intact
        #    instead of swapping their storage through `.data`.
        with torch.no_grad():
            self.user_bias.weight.copy_(user_bias_init.unsqueeze(1))  # [num_users, 1]
            self.item_bias.weight.copy_(item_bias_init.unsqueeze(1))  # [num_items, 1]

        # 5) Global bias as a learnable parameter
        self.global_bias = nn.Parameter(torch.tensor([overall_mean], dtype=torch.float32))

    @staticmethod
    def _bias_init(df2: pd.DataFrame, index_col: str, overall_mean: float, size: int) -> torch.Tensor:
        """Return a float32 vector of ``size`` entries: group mean rating minus ``overall_mean``.

        Entries whose index does not occur in ``df2[index_col]`` stay at 0.
        """
        init = torch.zeros(size, dtype=torch.float32)
        group_means = df2.groupby(index_col)['rating'].mean()
        if group_means.empty:
            return init

        # One vectorised scatter instead of a per-row Python loop.
        positions = torch.tensor(group_means.index.to_numpy(dtype='int64'), dtype=torch.long)
        offsets = torch.tensor((group_means - overall_mean).to_numpy(dtype='float64'),
                               dtype=torch.float32)
        init[positions] = offsets
        return init

    def forward(self, user_ids: torch.LongTensor, item_ids: torch.LongTensor):
        """Return clamped rating predictions, one per (user, item) pair.

        Args:
            user_ids: user indices, one per pair.
            item_ids: item indices, aligned with ``user_ids``.
        """
        user_vecs = self.user_factors(user_ids)  # [batch_size, embedding_dim]
        item_vecs = self.item_factors(item_ids)  # [batch_size, embedding_dim]

        user_b = self.user_bias(user_ids).squeeze(dim=1)  # [batch_size]
        item_b = self.item_bias(item_ids).squeeze(dim=1)  # [batch_size]

        dot = (user_vecs * item_vecs).sum(dim=1)          # [batch_size]

        # Combine all terms: global bias + user bias + item bias + dot
        preds = self.global_bias + user_b + item_b + dot

        # Clamp to the valid rating range. NOTE: torch.clamp passes zero
        # gradient for predictions that fall outside the range.
        preds = torch.clamp(preds, min=self.rating_min, max=self.rating_max)
        return preds

In [2]:
import torch
import torch.nn as nn

class AdvancedMFModelTextEmbed(nn.Module):
    """Hybrid recommender mixing collaborative item embeddings with image/text features.

    For each item, a learned collaborative embedding is concatenated with that
    item's image embedding and a linearly projected text embedding. A
    feed-forward network maps the concatenation to ``embedding_dim``. The score
    of a (user, item) pair is the dot product of the user embedding with that
    final item representation.
    """

    # Width used for the text projection when `text_embeddings` is empty and
    # the real width therefore cannot be inferred.
    _DEFAULT_TEXT_DIM = 1027

    def __init__(self, 
        image_embeddings, 
        text_embeddings, 
        embedding_dim, 
        num_users, 
        num_items,
        device,
        num_hidden_layers=2, 
        hidden_dim_list=None, 
        activation_fn=nn.ReLU,
        dropout_rate=0.2,
        batch_norm=False,
        reduced_text_dim=128):
        """Build the embedding tables, the text projection and the FFN.

        Args:
            image_embeddings: dict mapping item id -> 1-D image feature tensor.
                Must be non-empty; the first value fixes the image width.
            text_embeddings: dict mapping item id -> 1-D text feature tensor.
                The first value fixes the text projection's input width.
            embedding_dim: width of the user / collaborative item embeddings
                and of the FFN output.
            num_users: number of rows in the user embedding table.
            num_items: number of rows in the item embedding table.
            device: device on which all tensors and layers are placed.
            num_hidden_layers: number of hidden Linear layers in the FFN.
            hidden_dim_list: hidden widths; the first ``num_hidden_layers``
                entries are used. Defaults to ``embedding_dim * 2`` per layer.
            activation_fn: zero-argument callable returning an activation module.
            dropout_rate: dropout probability after each hidden layer; no
                Dropout layer is added when it is not > 0.
            batch_norm: if true, add ``BatchNorm1d`` after each hidden Linear.
            reduced_text_dim: output width of the text projection.
        """
        super().__init__()

        self.device = device

        # Feature lookups are plain dicts (not registered buffers), so they
        # are moved to the target device once, here.
        self.image_embeddings = {k: v.to(self.device) for k, v in image_embeddings.items()}
        self.text_embeddings = {k: v.to(self.device) for k, v in text_embeddings.items()}

        # Collaborative embeddings, initialised uniformly in [0, 0.5] for convergence.
        self.user_embeddings = nn.Embedding(num_users, embedding_dim).to(self.device)
        nn.init.uniform_(self.user_embeddings.weight, 0, 0.5)
        self.item_embeddings = nn.Embedding(num_items, embedding_dim).to(self.device)
        nn.init.uniform_(self.item_embeddings.weight, 0, 0.5)

        # Projection layer for text embeddings. The input width follows the
        # supplied embeddings instead of a hard-coded constant, mirroring how
        # the image width is inferred below.
        if text_embeddings:
            text_dim = next(iter(text_embeddings.values())).shape[-1]
        else:
            text_dim = self._DEFAULT_TEXT_DIM
        self.text_projection = nn.Linear(text_dim, reduced_text_dim).to(self.device)

        if hidden_dim_list is None:  # Default hidden dimensions if not provided
            hidden_dim_list = [embedding_dim * 2] * num_hidden_layers

        # FFN input = collaborative embedding + image features + projected text.
        image_dim = next(iter(image_embeddings.values())).shape[0]
        input_dim = embedding_dim + image_dim + reduced_text_dim
        print(f"Input dim: {input_dim}")

        # Dynamically build the FFN layers
        layers = []
        for i in range(num_hidden_layers):
            layers.append(nn.Linear(input_dim, hidden_dim_list[i]).to(self.device))
            if batch_norm:
                layers.append(nn.BatchNorm1d(hidden_dim_list[i]).to(self.device))
            layers.append(activation_fn())
            if dropout_rate > 0:
                layers.append(nn.Dropout(dropout_rate))
            input_dim = hidden_dim_list[i]

        layers.append(nn.Linear(input_dim, embedding_dim).to(self.device))  # Final layer

        self.ffn = nn.Sequential(*layers)

    def forward(self, user_ids, item_ids):
        """Score each (user, item) pair.

        Args:
            user_ids: 1-D tensor of user indices.
            item_ids: 1-D tensor of item ids; each must be a key of both
                feature dicts and a valid index into the item embedding table.

        Returns:
            1-D tensor of scores, one per pair.
        """
        user_ids = user_ids.to(self.device)  # Move inputs to the correct device
        user_embeds = self.user_embeddings(user_ids)

        # Convert the ids to Python ints once: a single host transfer rather
        # than one int() call per element for each of the two lookups.
        item_keys = [int(item_id) for item_id in item_ids.tolist()]

        # Metadata embeddings lookup
        image_embeds = torch.stack([self.image_embeddings[key] for key in item_keys]).to(self.device)
        text_embeds = torch.stack([self.text_embeddings[key] for key in item_keys]).to(self.device)

        # Apply the projection to reduce text embedding dimension
        text_embeds = self.text_projection(text_embeds)

        # Learned collaborative item embeddings
        item_collab_embeds = self.item_embeddings(item_ids.to(torch.long).to(self.device))

        # Concatenate item embeddings and metadata embeddings
        combined_embeds = torch.cat((item_collab_embeds, image_embeds, text_embeds), dim=1)

        # Feedforward network
        final_item_embeds = self.ffn(combined_embeds)

        # Compute dot product for recommendation scores
        scores = (user_embeds * final_item_embeds).sum(dim=1)

        return scores
