# Hybrid NCF Model


The Hybrid NCF Mode combines two parallel Neural Collaborative Filtering models: one using custom embeddings derived from engineered user and product features, and the other using randomly initialised embeddings. Their outputs are merged through a fully connected layer, allowing the model to learn a weighted combination of both representations. This architecture is designed to capture both structured patterns and latent interactions for improved recommendation performance.

In [2]:
from google.colab import drive
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from collections import defaultdict
from itertools import product
import copy
from tqdm import tqdm

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Define project paths
project_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon'
data_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/data'

Mounted at /content/drive


# Step 1: Load dataset
Previously in step1_data_preprocessing.ipynb, We have split the df_reviews dataset into training, testing and validation samples for each user, following chronological order and using the early 70% of each user's interactions for training, followed by the next 15% for validation and the last 15% for testing.

In [None]:
train_data = pd.read_csv(os.path.join(data_dir,"train_data.csv"))
test_data = pd.read_csv(os.path.join(data_dir,"test_data.csv"))
val_data = pd.read_csv(os.path.join(data_dir,"val_data.csv"))

df_reviews = pd.read_csv(os.path.join(data_dir,"filtered_reviews_with_features_and_clusters.csv"))

# Filter out Customers and Products in test and val set that do not appear in training set

Prepares cluster-specific training, validation, and test data by:
- Extracting the subset of each dataframe corresponding to the given cluster ID.
- Mapping customer_id and product_id to 0-based index values within the cluster.
- Filtering val/test sets to only include users/items present in the cluster's training set.

Prevent Cold Start problems during validation and testing. If a customer or product appears only in the validation or test set appears only in the validataion or test set, the model has never seen it before and cannot generate a valid prediction. Thus we will remove any rows in the validation or test sets which does not belong to any user in training set or the product is absent in the training set.

We do not remove any rows from the training set.

In [None]:
def prepare_cluster_data(cluster_id, train_df, val_df=None, test_df=None):
    train_cluster = train_df[train_df['cluster'] == cluster_id].reset_index(drop=True)
    user2idx = {uid: idx for idx, uid in enumerate(train_cluster['customer_id'].unique())}
    item2idx = {pid: idx for idx, pid in enumerate(train_cluster['product_id'].unique())}

    train_cluster['user_idx'] = train_cluster['customer_id'].map(user2idx)
    train_cluster['item_idx'] = train_cluster['product_id'].map(item2idx)

    val_cluster, test_cluster = None, None

    if val_df is not None:
        val_cluster = val_df[val_df['cluster'] == cluster_id].reset_index(drop=True)
        val_cluster = val_cluster[
            val_cluster['customer_id'].isin(user2idx) &
            val_cluster['product_id'].isin(item2idx)
        ].reset_index(drop=True)
        val_cluster['user_idx'] = val_cluster['customer_id'].map(user2idx)
        val_cluster['item_idx'] = val_cluster['product_id'].map(item2idx)

    if test_df is not None:
        test_cluster = test_df[test_df['cluster'] == cluster_id].reset_index(drop=True)
        test_cluster = test_cluster[
            test_cluster['customer_id'].isin(user2idx) &
            test_cluster['product_id'].isin(item2idx)
        ].reset_index(drop=True)
        test_cluster['user_idx'] = test_cluster['customer_id'].map(user2idx)
        test_cluster['item_idx'] = test_cluster['product_id'].map(item2idx)


    return train_cluster, val_cluster, test_cluster, user2idx, item2idx

# Create Data Loader

In [None]:
class ReviewsDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        return {
            'customer_id': torch.tensor(row['user_idx'], dtype=torch.long),
            'product_id': torch.tensor(row['item_idx'], dtype=torch.long),
            'rating': torch.tensor(row['star_rating'], dtype=torch.float)
        }

# Building Customer Embeddings

The original **df_reviews** will be used to build the custom customer and product embeddings. These custom embeddings are meant to reflect historical behaviour or characteristics of customers/products

The customer embeddings will constructed by aggregating features within df_reviews by customer_id to find a customer's:
 - Purchase Frequency (Indicate how active a customer is)
 - Time Since Last Purchase (Indicate how active a customer is)
 - Average Star Rating (Overall Customer satisfaction across all of his purchases)
 - Total Vine Reviews (Measure of Credibility of his Reviews)
 - Total Helpful Votes (Measure the Credibility of his Reviews)
 - Total Votes (Measure the Credibility of his Reviews)
 - Average Sentiment (Overall Customer satisfaction across all of his purchases

These embeddings are more informative than a randomly intialized embedding in typical recommnedation systems. The choice of specific features injects domain knowledge into the model.


In [None]:
def build_customer_embeddings(df_reviews, embedding_dim, cache_dir, mode="grid"):
    os.makedirs(cache_dir, exist_ok=True)
    path = os.path.join(cache_dir, f'cust_emb_{mode}_{embedding_dim}.csv')
    if os.path.exists(path):
        return pd.read_csv(path, index_col='customer_id')

    agg = df_reviews.groupby('customer_id').agg({
        'monthly_purchase_frequency': 'mean',
        'time_since_last_purchase': 'mean',
        'star_rating': 'mean',
        'vine': lambda x: (x == 'Y').sum(),
        'helpful_votes': 'sum',
        'total_votes': 'sum',
        'sentiments': lambda x: (x == 'positive').mean()
    }).fillna(0).reset_index()

    X = StandardScaler().fit_transform(agg.drop(columns='customer_id'))
    pca = PCA(n_components=embedding_dim)
    X_pca = pca.fit_transform(X)
    df = pd.DataFrame(X_pca, index=agg['customer_id'])
    df.index.name = 'customer_id'
    df.to_csv(path)
    return df

## Building Product Embeddings

The product embeddings will constructed by aggregating features within df_reviews by product_id to find a product's:
- Mean Star Rating that it received (Customer satisfaction)
- Total Helpful Votes given to all its reviews (Quality of customer feedback)
- Total Votes given to all its reviews (Review Engagement by customers)
- Average sentiment (1 is Positive and 0 is Negative)
- Total Number of Vine Reviews (Number of Credible Reviews)
- Total Product Sales (Demand for Product)



In [None]:
def build_product_embeddings(df_reviews, embedding_dim, cache_dir, mode="grid"):
    os.makedirs(cache_dir, exist_ok=True)
    path = os.path.join(cache_dir, f'prod_emb_{mode}_{embedding_dim}.csv')
    if os.path.exists(path):
        return pd.read_csv(path, index_col='product_id')

    agg = df_reviews.groupby('product_id').agg({
        'star_rating': 'mean',
        'helpful_votes': 'sum',
        'total_votes': 'sum',
        'sentiments': lambda x: (x == 'positive').mean(),
        'vine': lambda x: (x == 'Y').sum(),
        'product_id': 'count'
    }).rename(columns={'product_id': 'sales_volume'}).fillna(0).reset_index()

    X = StandardScaler().fit_transform(agg.drop(columns='product_id'))
    pca = PCA(n_components=embedding_dim)
    X_pca = pca.fit_transform(X)
    df = pd.DataFrame(X_pca, index=agg['product_id'])
    df.index.name = 'product_id'
    df.to_csv(path)
    return df

# Evaluation Functions

- **ndcg_at_k**: Computes the Normalized Discounted Cumulative Gain (NDCG) at rank k for a single list of relevance. If the list contains fewer than k items, it will use actual_k = min(k, len(relevances)) to ensure fair computation.

- **mean_ndcg_user_at_k**: Computes the mean NDCG@k across all users by grouping predicted scores and relevance labels per user, sorting by prediction, and applying ndcg_at_k. For each user, their items are sorted by predicted scores, and NDCG is computed using `ndcg_at_k` with actual_k = min(k, len(user_items)).

- **mean_precision_user_at_k**: Computes the mean Precision@k across all users.
Precision@k is the proportion of relevant items (e.g., rating ≥ threshold) among the top-k predicted items for each user. For each user, top-k items are selected based on predicted scores. If the user has fewer than k items, actual_k = min(k, len(user_items)) is used.  
  Precision is calculated as:  
  `precision = (# of relevant items among top-k) / actual_k`  
  where an item is considered relevant if `rating ≥ threshold`.

- **mean_recall_user_at_k**: Computes the mean Recall@k across all users.
Recall@k is the proportion of a user's relevant items (rating ≥ threshold) that are retrieved in the top-k predicted list. For each user, top-k items are selected based on predicted scores, and recall is calculated as:  
  `recall = (# of relevant items among top-k) / total number of relevant items for the user`  
  actual_k = min(k, len(user_items)) is used to handle users with fewer than k items.

- **mean_f1_user_at_k**:  
  Computes the mean F1@k across all users, where F1 combines precision and recall.  
  For each user, top-k items are selected (using actual_k = min(k, len(user_items))), and F1 is calculated based on binarized relevance labels (`rating ≥ threshold`).  
  The predicted labels are assumed to be all 1s (e.g top-k are predicted as relevant).

In [None]:
def ndcg_at_k(relevances, k):
    relevances = np.asarray(relevances, dtype=np.float32)
    actual_k = min(k, len(relevances))
    if actual_k == 0:
        return 0.0
    relevances = relevances[:actual_k]
    dcg = np.sum((2 ** relevances - 1) / np.log2(np.arange(2, actual_k + 2)))
    ideal_relevances = np.sort(relevances)[::-1]
    idcg = np.sum((2 ** ideal_relevances - 1) / np.log2(np.arange(2, actual_k + 2)))
    return dcg / idcg if idcg > 0 else 0.0

def mean_ndcg_user_at_k(users, preds, labels, k=10):
    user_data = defaultdict(list)
    for u, p, l in zip(users, preds, labels):
        user_data[u].append((p, l))
    ndcg_scores = []
    for user_items in user_data.values():
        sorted_items = sorted(user_items, key=lambda x: x[0], reverse=True)
        rels = [r for _, r in sorted_items]
        ndcg_scores.append(ndcg_at_k(rels, k))
    return np.mean(ndcg_scores) if ndcg_scores else 0.0

def mean_precision_user_at_k(users, preds, labels, k=10, threshold=4):
    user_data = defaultdict(list)
    for u, p, l in zip(users, preds, labels):
        user_data[u].append((p, l))
    scores = []
    for items in user_data.values():
        sorted_items = sorted(items, key=lambda x: x[0], reverse=True)
        actual_k = min(k, len(sorted_items))
        top_k = sorted_items[:actual_k]
        rels = [1 if l >= threshold else 0 for _, l in top_k]
        scores.append(np.sum(rels) / actual_k if actual_k > 0 else 0)
    return np.mean(scores) if scores else 0.0

def mean_recall_user_at_k(users, preds, labels, k=10, threshold=4):
    user_data = defaultdict(list)
    for u, p, l in zip(users, preds, labels):
        user_data[u].append((p, l))
    scores = []
    for items in user_data.values():
        sorted_items = sorted(items, key=lambda x: x[0], reverse=True)
        actual_k = min(k, len(sorted_items))
        top_k = sorted_items[:actual_k]
        total_rel = sum(1 for _, l in items if l >= threshold)
        top_k_rel = sum(1 for _, l in top_k if l >= threshold)
        scores.append(top_k_rel / total_rel if total_rel > 0 else 0.0)
    return np.mean(scores) if scores else 0.0

def mean_f1_user_at_k(users, preds, labels, k=10, threshold=4):
    user_data = defaultdict(list)
    for u, p, l in zip(users, preds, labels):
        user_data[u].append((p, l))
    scores = []
    for items in user_data.values():
        sorted_items = sorted(items, key=lambda x: x[0], reverse=True)
        actual_k = min(k, len(sorted_items))
        y_true = [1 if l >= threshold else 0 for _, l in sorted_items[:actual_k]]
        y_pred = [1] * actual_k
        scores.append(f1_score(y_true, y_pred, zero_division=0))
    return np.mean(scores) if scores else 0.0

def mse(preds, labels):
    return np.mean((np.array(preds) - np.array(labels)) ** 2)

def rmse(preds, labels):
    return np.sqrt(mse(preds, labels))

# Model Architecture

Defines the three neural collaborative filtering models:

1. **CustomNCF** uses fixed pre-computed embeddings (from PCA).

2. **RandomNCF** learns user/item embeddings from scratch.

3. **FullyConnectedNCF** takes the scalar outputs of both models and passes them through a fully connected layer to make a final prediction. This represents the hybrid architecture.

In [None]:
class CustomNCF(nn.Module):
    def __init__(self, customer_embedding_matrix_gmf, product_embedding_matrix_gmf,
                 customer_embedding_matrix_mlp, product_embedding_matrix_mlp, embedding_dim, dropout_rate = 0.3):
        super().__init__()
        # GMF Components
        self.customer_embeddings_gmf = nn.Embedding.from_pretrained(customer_embedding_matrix_gmf, freeze=False)
        self.product_embeddings_gmf = nn.Embedding.from_pretrained(product_embedding_matrix_gmf, freeze=False)

        # MLP Components
        self.customer_embeddings_mlp = nn.Embedding.from_pretrained(customer_embedding_matrix_mlp, freeze=False)
        self.product_embeddings_mlp = nn.Embedding.from_pretrained(product_embedding_matrix_mlp, freeze=False)

        self.fc1_mlp = nn.Linear(2 * embedding_dim, 128)
        self.bn1_mlp = nn.BatchNorm1d(128)
        self.dropout1_mlp = nn.Dropout(dropout_rate)

        self.fc2_mlp = nn.Linear(128, 64)
        self.bn2_mlp = nn.BatchNorm1d(64)
        self.dropout2_mlp = nn.Dropout(dropout_rate)

        # Final layers
        self.fc1_combined = nn.Linear(embedding_dim + 64, 128)
        self.bn1_combined = nn.BatchNorm1d(128)
        self.dropout1_combined = nn.Dropout(dropout_rate)

        self.fc2_combined = nn.Linear(128, 1)

    def forward(self, customer_id, product_id):
        # GMF
        customer_emb_gmf = self.customer_embeddings_gmf(customer_id)
        product_emb_gmf = self.product_embeddings_gmf(product_id)
        gmf_output = customer_emb_gmf * product_emb_gmf

        # MLP
        customer_emb_mlp = self.customer_embeddings_mlp(customer_id)
        product_emb_mlp = self.product_embeddings_mlp(product_id)
        mlp_input = torch.cat([customer_emb_mlp, product_emb_mlp], dim=-1)

        mlp_output = F.relu(self.bn1_mlp(self.fc1_mlp(mlp_input)))
        mlp_output = self.dropout1_mlp(mlp_output)

        mlp_output = F.relu(self.bn2_mlp(self.fc2_mlp(mlp_output)))
        mlp_output = self.dropout2_mlp(mlp_output)

        # Combine GMF and MLP
        combined_input = torch.cat([gmf_output, mlp_output], dim=-1)
        combined_output = F.relu(self.bn1_combined(self.fc1_combined(combined_input)))
        combined_output = self.dropout1_combined(combined_output)

        # Final layer & output scaling (1–5 range)
        output = self.fc2_combined(combined_output)
        return output.squeeze() * 4 + 1

In [None]:
class RandomNCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, dropout_rate=0.3):
        super().__init__()
        # Randomly initialized embedding layers
        self.customer_embeddings_gmf = nn.Embedding(num_users, embedding_dim)
        self.product_embeddings_gmf = nn.Embedding(num_items, embedding_dim)

        self.customer_embeddings_mlp = nn.Embedding(num_users, embedding_dim)
        self.product_embeddings_mlp = nn.Embedding(num_items, embedding_dim)

        self.fc1_mlp = nn.Linear(2 * embedding_dim, 128)
        self.bn1_mlp = nn.BatchNorm1d(128)
        self.dropout1_mlp = nn.Dropout(dropout_rate)

        self.fc2_mlp = nn.Linear(128, 64)
        self.bn2_mlp = nn.BatchNorm1d(64)
        self.dropout2_mlp = nn.Dropout(dropout_rate)

        self.fc1_combined = nn.Linear(embedding_dim + 64, 128)
        self.bn1_combined = nn.BatchNorm1d(128)
        self.dropout1_combined = nn.Dropout(dropout_rate)

        self.fc2_combined = nn.Linear(128, 1)

    def forward(self, customer_id, product_id):
        customer_emb_gmf = self.customer_embeddings_gmf(customer_id)
        product_emb_gmf = self.product_embeddings_gmf(product_id)
        gmf_output = customer_emb_gmf * product_emb_gmf

        customer_emb_mlp = self.customer_embeddings_mlp(customer_id)
        product_emb_mlp = self.product_embeddings_mlp(product_id)
        mlp_input = torch.cat([customer_emb_mlp, product_emb_mlp], dim=-1)

        mlp_output = F.relu(self.bn1_mlp(self.fc1_mlp(mlp_input)))
        mlp_output = self.dropout1_mlp(mlp_output)
        mlp_output = F.relu(self.bn2_mlp(self.fc2_mlp(mlp_output)))
        mlp_output = self.dropout2_mlp(mlp_output)

        combined_input = torch.cat([gmf_output, mlp_output], dim=-1)
        combined_output = F.relu(self.bn1_combined(self.fc1_combined(combined_input)))
        combined_output = self.dropout1_combined(combined_output)

        output = self.fc2_combined(combined_output)
        return output.squeeze() * 4 + 1

In [None]:
class FullyConnectedNCF(nn.Module):
    def __init__(self, custom_ncf_model, random_ncf_model):
        super(FullyConnectedNCF, self).__init__()
        self.custom_ncf = custom_ncf_model
        self.random_ncf = random_ncf_model

        # Combine the two scalar outputs => input dim = 2
        self.fc = nn.Sequential(
            nn.Linear(2, 8),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(8, 1)
        )
        self.device = next(self.parameters()).device

    def forward(self, customer_id, product_id):
        out_custom = self.custom_ncf(customer_id, product_id).unsqueeze(-1)  # shape: [batch_size, 1]
        out_random = self.random_ncf(customer_id, product_id).unsqueeze(-1)  # shape: [batch_size, 1]

        combined = torch.cat([out_custom, out_random], dim=-1)  # shape: [batch_size, 2]
        out = self.fc(combined)  # shape: [batch_size, 1]

        # Scale output from (0,1) to (1,5)
        return out.squeeze() * 4 + 1

# evaluate_model
Evaluates a model using MSE, RMSE, and ranking-based metrics (NDCG@10, Precision@10, Recall@10, F1@10). Assumes each user has a variable number of predictions, and top-k logic is used per user.

In [None]:
def evaluate_model(model, dataloader):
    model.eval()
    preds, labels, users = [], [], []
    with torch.no_grad():
        for batch in dataloader:
            user = batch['customer_id'].to(model.device)
            item = batch['product_id'].to(model.device)
            rating = batch['rating'].to(model.device)
            output = model(user, item)
            preds.extend(output.cpu().numpy())
            labels.extend(rating.cpu().numpy())
            users.extend(user.cpu().numpy())

    metrics = {
        'mse': mse(preds, labels),
        'rmse': rmse(preds, labels),
        'ndcg@10': mean_ndcg_user_at_k(users, preds, labels),
        'precision@10': mean_precision_user_at_k(users, preds, labels),
        'recall@10': mean_recall_user_at_k(users, preds, labels),
        'f1@10': mean_f1_user_at_k(users, preds, labels)
    }
    return metrics

# train_and_evaluate
Performs one round of training on the current grid search configuration for a single cluster. Tracks the best model state using early stopping (based on validation MSE). Returns the best model weights and metrics.

In [None]:
def train_and_evaluate(config, cluster_id, train_df, val_df, test_df, df_reviews):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    cluster_dir = os.path.join(project_dir, f"Model Results/NCF Hybrid Embedding/Cluster_{cluster_id}")
    os.makedirs(cluster_dir + "/cache", exist_ok=True)

    train_cluster, val_cluster, test_cluster, user2idx, item2idx = prepare_cluster_data(
        cluster_id, train_df, val_df, test_df
    )

    # Embeddings
    cust_emb = torch.tensor(
        build_customer_embeddings(train_cluster, int(config['custom_embedding_dim']), cluster_dir + "/cache", mode = "grid").values,
        dtype=torch.float32
    )
    prod_emb = torch.tensor(
        build_product_embeddings(train_cluster, int(config['custom_embedding_dim']), cluster_dir + "/cache", mode = "grid").values,
        dtype=torch.float32
    )

    custom_model = CustomNCF(
        cust_emb,          # GMF customer embedding
        prod_emb,          # GMF product embedding
        cust_emb.clone(),  # MLP customer embedding
        prod_emb.clone(),  # MLP product embedding
        int(config['custom_embedding_dim']),
        config['custom_dropout_rate']
    ).to(device)
    random_model = RandomNCF(
        num_users=len(user2idx),
        num_items=len(item2idx),
        embedding_dim=int(config['random_embedding_dim']),
        dropout_rate=config['random_dropout_rate']
    ).to(device)
    ensemble_model = FullyConnectedNCF(custom_model, random_model).to(device)

    criterion = nn.MSELoss()
    optim_custom = torch.optim.Adam(custom_model.parameters(), lr=config['custom_learning_rate'])
    optim_random = torch.optim.Adam(random_model.parameters(), lr=config['random_learning_rate'])
    optim_ensemble = torch.optim.Adam(ensemble_model.parameters(), lr=config['fc_learning_rate'])

    train_loader = DataLoader(ReviewsDataset(train_cluster), batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(ReviewsDataset(val_cluster), batch_size=config['batch_size'])

    best_val_loss = float('inf')
    best_state = None
    best_metrics = {}
    patience = config.get('patience', 5)
    no_improve = 0

    for epoch in tqdm(range(config['num_epochs']), desc=f"Cluster {cluster_id} - Grid Search Epochs"):
        custom_model.train()
        random_model.train()
        ensemble_model.train()
        for batch in train_loader:
            user = batch['customer_id'].to(device)
            item = batch['product_id'].to(device)
            rating = batch['rating'].to(device)

            # out_custom = custom_model(user, item)
            # out_random = random_model(user, item)
            # final_out = ensemble_model(user, item)
            final_out = ensemble_model(user, item)
            loss = criterion(final_out, rating)

            optim_custom.zero_grad()
            optim_random.zero_grad()
            optim_ensemble.zero_grad()

            loss.backward()
            optim_custom.step()
            optim_random.step()
            optim_ensemble.step()

        ensemble_model.eval()
        custom_model.eval()
        random_model.eval()

        preds, labels, users = [], [], []
        with torch.no_grad():
            for batch in val_loader:
                user = batch['customer_id'].to(device)
                item = batch['product_id'].to(device)
                rating = batch['rating'].to(device)
                final_out = ensemble_model(user, item)
                preds.extend(final_out.cpu().numpy())
                labels.extend(rating.cpu().numpy())
                users.extend(user.cpu().numpy())

        current_mse = mse(preds, labels)
        if current_mse < best_val_loss:
            best_val_loss = current_mse
            best_state = {
                'custom_model': copy.deepcopy(custom_model.state_dict()),
                'random_model': copy.deepcopy(random_model.state_dict()),
                'ensemble_model': copy.deepcopy(ensemble_model.state_dict())
            }
            best_metrics = evaluate_model(ensemble_model, val_loader)
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                break

    return best_state, best_metrics

# Perform Grid Search for All Clusters
In step4_customer_segmentation.ipynb file, we have segmented customers in 4 categories, namely steady and satified customers, power buyers, engaged shoppers, casual buyers. Customers in each cluster have distinct purchasing behaviours and preferences thus we will be building a NCF model for each cluster. This will allow us to provided tailored recommendations to different customer groups based on their past purchasing behaviour.

**run_grid_search_all_clusters(...)**
Performs grid search across all clusters. For each cluster and config, it:

1. Calls train_and_evaluate

2. Tracks the best configuration based on validation MSE

3. Saves model weights and grid search results into Cluster_{id} folders


In [None]:
def run_grid_search_all_clusters(param_grid, train_data, val_data, test_data, df_reviews):
    cluster_ids = sorted(train_data['cluster'].unique())

    for cluster_id in cluster_ids:
        print(f"\n===== Running Grid Search for Cluster {cluster_id} =====")
        results = []
        best_overall_loss = float('inf')
        best_overall_config = None
        best_overall_metrics = None
        best_state_dict = None

        for values in tqdm(list(product(*param_grid.values())), desc=f"Grid Search for Cluster {cluster_id}"):
            config = dict(zip(param_grid.keys(), values))
            try:
                state_dict, metrics = train_and_evaluate(
                    config, cluster_id, train_data, val_data, test_data, df_reviews
                )
                row = metrics.copy()
                row.update(config)
                results.append(row)

                if metrics['mse'] < best_overall_loss:
                    best_overall_loss = metrics['mse']
                    best_overall_config = config
                    best_overall_metrics = metrics
                    best_state_dict = state_dict

            except Exception as e:
                print(f"[ERROR] Skipping config {config} due to error: {e}")

        result_dir = os.path.join(project_dir, f"Model Results/NCF Hybrid Embedding/Cluster_{cluster_id}")
        os.makedirs(result_dir, exist_ok=True)
        pd.DataFrame(results).to_csv(os.path.join(result_dir, "grid_search_log.csv"), index=False)
        pd.DataFrame([best_overall_metrics | best_overall_config]).to_csv(os.path.join(result_dir, "val_results_by_cluster.csv"), index=False)

        # Save model weights
        torch.save(best_state_dict['custom_model'], os.path.join(result_dir, "best_custom_model.pt"))
        torch.save(best_state_dict['random_model'], os.path.join(result_dir, "best_random_model.pt"))
        torch.save(best_state_dict['ensemble_model'], os.path.join(result_dir, "best_ensemble_model.pt"))

        with open(os.path.join(result_dir, "results_by_cluster.txt"), "w") as f:
            f.write("===== Best Config and Validation Metrics =====\n")
            f.write(f"Cluster: {cluster_id}\n")
            for k, v in best_overall_config.items():
                f.write(f"{k}: {v}\n")
            f.write("\nValidation Metrics:\n")
            for k, v in best_overall_metrics.items():
                f.write(f"{k}: {v:.4f}\n")


#retrain_and_test_best_models
Retrains the best model configuration (from grid search) on the combined train+val dataset and then evaluates it on the test set. Test metrics are saved to test_results_by_cluster.csv.

In [None]:
def retrain_and_test_best_models(train_data, val_data, test_data, df_reviews):
    cluster_ids = sorted(train_data['cluster'].unique())
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for cluster_id in cluster_ids:
        print(f"\n===== Retraining on Train+Val and Testing for Cluster {cluster_id} =====")
        result_dir = os.path.join(project_dir, f"Model Results/NCF Hybrid Embedding/Cluster_{cluster_id}")
        config_df = pd.read_csv(os.path.join(result_dir, "val_results_by_cluster.csv"))
        config_df = config_df.sort_values(by="rmse")
        config = config_df.iloc[0].to_dict()

        full_train = pd.concat([
            train_data[train_data['cluster'] == cluster_id],
            val_data[val_data['cluster'] == cluster_id]
        ]).reset_index(drop=True)

        full_train, _, test_cluster, user2idx, item2idx = prepare_cluster_data(
            cluster_id, full_train, None, test_data
        )

        cust_emb = torch.tensor(
            build_customer_embeddings(full_train, int(config['custom_embedding_dim']), result_dir + "/cache", mode="retrain").values,
            dtype=torch.float32
        ).to(device)
        prod_emb = torch.tensor(
            build_product_embeddings(full_train, int(config['custom_embedding_dim']), result_dir + "/cache", mode="retrain").values,
            dtype=torch.float32
        ).to(device)

        custom_model = CustomNCF(
            cust_emb,
            prod_emb,
            cust_emb.clone(),
            prod_emb.clone(),
            embedding_dim=int(config['custom_embedding_dim']),
            dropout_rate=config['custom_dropout_rate']
        ).to(device)

        random_model = RandomNCF(
            num_users=len(user2idx),
            num_items=len(item2idx),
            embedding_dim=int(config['random_embedding_dim']),
            dropout_rate=config['random_dropout_rate']
        ).to(device)

        ensemble_model = FullyConnectedNCF(custom_model, random_model).to(device)

        criterion = nn.MSELoss()
        optim_custom = torch.optim.Adam(custom_model.parameters(), lr=config['custom_learning_rate'])
        optim_random = torch.optim.Adam(random_model.parameters(), lr=config['random_learning_rate'])
        optim_ensemble = torch.optim.Adam(ensemble_model.parameters(), lr=config['fc_learning_rate'])

        loader = DataLoader(ReviewsDataset(full_train), batch_size=int(config['batch_size']), shuffle=True)
        custom_model.train()
        random_model.train()
        ensemble_model.train()

        for epoch in range(int(config['num_epochs'])):
            for batch in loader:
                user = batch['customer_id'].to(device)
                item = batch['product_id'].to(device)
                rating = batch['rating'].to(device)

                final_out = ensemble_model(user, item)
                loss = criterion(final_out, rating)

                optim_custom.zero_grad()
                optim_random.zero_grad()
                optim_ensemble.zero_grad()
                loss.backward()
                optim_custom.step()
                optim_random.step()
                optim_ensemble.step()

        test_loader = DataLoader(ReviewsDataset(test_cluster), batch_size=int(config['batch_size']))
        test_metrics = evaluate_model(ensemble_model, test_loader)
        non_metric_config = {k: v for k, v in config.items() if k not in test_metrics}
        combined_row = {"cluster_id":cluster_id,**test_metrics,**non_metric_config}

        pd.DataFrame([combined_row]).to_csv(
            os.path.join(result_dir, "test_results_by_cluster.csv"),
            index=False
        )

        with open(os.path.join(result_dir, "results_by_cluster.txt"), "a") as f:
            f.write("\n===== Final Test Metrics =====\n")
            for k, v in test_metrics.items():
                f.write(f"{k}: {v:.4f}\n")


# run_full_pipeline

Main entry point for the script. It:

1. Defines a parameter grid

2. Runs run_grid_search_all_clusters(...)

3. Executes retrain_and_test_best_models(...)

In [None]:
def run_full_pipeline():
    param_grid = {
        'custom_embedding_dim': [5, 6],
        'random_embedding_dim': [5, 8],
        'custom_dropout_rate': [0.0, 0.3],
        'random_dropout_rate': [0.0],
        'custom_learning_rate': [0.001],
        'random_learning_rate': [0.001],
        'fc_dropout_rate': [0.0, 0.3],
        'fc_learning_rate': [0.001],
        'batch_size': [128, 512],
        'num_epochs': [20, 40]
    }

    print("\n========== STARTING GRID SEARCH ==========")
    run_grid_search_all_clusters(param_grid, train_data, val_data, test_data, df_reviews)
    print("\n========== STARTING FINAL RETRAINING AND TESTING ==========")
    retrain_and_test_best_models(train_data, val_data, test_data, df_reviews)
    print("\n========== PIPELINE COMPLETE ==========")

In [None]:
run_full_pipeline()
# Note: The error below occurred after the grid search phase completed successfully, but before the retraining and testing phase could begin.
# Since the grid search took approximately 6 hours to finish, re-running the entire pipeline would be highly time-consuming.
# Therefore, instead of executing `run_full_pipeline()` again, I will proceed directly with `retrain_and_test_best_models()` using the stored best configurations
# and embeddings generated during grid search. These were saved to disk and remain valid for retraining and evaluation.



===== Running Grid Search for Cluster 0 =====


Grid Search for Cluster 0:   0%|          | 0/64 [00:00<?, ?it/s]
Cluster 0 - Grid Search Epochs:   0%|          | 0/20 [00:00<?, ?it/s][A
Cluster 0 - Grid Search Epochs:   5%|▌         | 1/20 [00:23<07:22, 23.30s/it][A
Cluster 0 - Grid Search Epochs:  10%|█         | 2/20 [00:47<07:09, 23.84s/it][A
Cluster 0 - Grid Search Epochs:  15%|█▌        | 3/20 [00:56<04:49, 17.04s/it][A
Cluster 0 - Grid Search Epochs:  20%|██        | 4/20 [01:06<03:48, 14.26s/it][A
Cluster 0 - Grid Search Epochs:  25%|██▌       | 5/20 [01:16<03:10, 12.73s/it][A
Cluster 0 - Grid Search Epochs:  30%|███       | 6/20 [01:35<03:43, 15.93s/it]
Grid Search for Cluster 0:   2%|▏         | 1/64 [01:42<1:47:07, 102.02s/it]
Cluster 0 - Grid Search Epochs:   0%|          | 0/40 [00:00<?, ?it/s][A
Cluster 0 - Grid Search Epochs:   2%|▎         | 1/40 [00:22<14:40, 22.59s/it][A
Cluster 0 - Grid Search Epochs:   5%|▌         | 2/40 [00:32<09:38, 15.22s/it][A
Cluster 0 - Grid Search Epochs:   8%|▊         | 3/40 [0


===== Running Grid Search for Cluster 1 =====


Grid Search for Cluster 1:   0%|          | 0/64 [00:00<?, ?it/s]
Cluster 1 - Grid Search Epochs:   0%|          | 0/20 [00:00<?, ?it/s][A
Cluster 1 - Grid Search Epochs:   5%|▌         | 1/20 [00:05<01:50,  5.80s/it][A
Cluster 1 - Grid Search Epochs:  10%|█         | 2/20 [00:10<01:32,  5.15s/it][A
Cluster 1 - Grid Search Epochs:  15%|█▌        | 3/20 [00:15<01:28,  5.20s/it][A
Cluster 1 - Grid Search Epochs:  20%|██        | 4/20 [00:18<01:06,  4.19s/it][A
Cluster 1 - Grid Search Epochs:  25%|██▌       | 5/20 [00:20<00:51,  3.43s/it][A
Cluster 1 - Grid Search Epochs:  30%|███       | 6/20 [00:22<00:41,  2.97s/it][A
Cluster 1 - Grid Search Epochs:  35%|███▌      | 7/20 [00:26<00:49,  3.82s/it]
Grid Search for Cluster 1:   2%|▏         | 1/64 [00:28<30:23, 28.95s/it]
Cluster 1 - Grid Search Epochs:   0%|          | 0/40 [00:00<?, ?it/s][A
Cluster 1 - Grid Search Epochs:   2%|▎         | 1/40 [00:05<03:49,  5.88s/it][A
Cluster 1 - Grid Search Epochs:   5%|▌         | 2/40 [00:0


===== Running Grid Search for Cluster 2 =====


Grid Search for Cluster 2:   0%|          | 0/64 [00:00<?, ?it/s]
Cluster 2 - Grid Search Epochs:   0%|          | 0/20 [00:00<?, ?it/s][A
Cluster 2 - Grid Search Epochs:   5%|▌         | 1/20 [00:05<01:40,  5.31s/it][A
Cluster 2 - Grid Search Epochs:  10%|█         | 2/20 [00:09<01:23,  4.67s/it][A
Cluster 2 - Grid Search Epochs:  15%|█▌        | 3/20 [00:12<01:02,  3.67s/it][A
Cluster 2 - Grid Search Epochs:  20%|██        | 4/20 [00:17<01:08,  4.29s/it][A
Cluster 2 - Grid Search Epochs:  25%|██▌       | 5/20 [00:19<00:54,  3.66s/it][A
Cluster 2 - Grid Search Epochs:  30%|███       | 6/20 [00:22<00:45,  3.23s/it][A
Cluster 2 - Grid Search Epochs:  35%|███▌      | 7/20 [00:24<00:38,  2.97s/it][A
Cluster 2 - Grid Search Epochs:  40%|████      | 8/20 [00:30<00:45,  3.82s/it]
Grid Search for Cluster 2:   2%|▏         | 1/64 [00:32<34:07, 32.50s/it]
Cluster 2 - Grid Search Epochs:   0%|          | 0/40 [00:00<?, ?it/s][A
Cluster 2 - Grid Search Epochs:   2%|▎         | 1/40 [00:0


===== Running Grid Search for Cluster 3 =====


Grid Search for Cluster 3:   0%|          | 0/64 [00:00<?, ?it/s]
Cluster 3 - Grid Search Epochs:   0%|          | 0/20 [00:00<?, ?it/s][A
Cluster 3 - Grid Search Epochs:   5%|▌         | 1/20 [00:12<04:00, 12.66s/it][A
Cluster 3 - Grid Search Epochs:  10%|█         | 2/20 [00:25<03:46, 12.61s/it][A
Cluster 3 - Grid Search Epochs:  15%|█▌        | 3/20 [00:37<03:34, 12.63s/it][A
Cluster 3 - Grid Search Epochs:  20%|██        | 4/20 [00:42<02:34,  9.64s/it][A
Cluster 3 - Grid Search Epochs:  25%|██▌       | 5/20 [00:47<01:56,  7.75s/it][A
Cluster 3 - Grid Search Epochs:  30%|███       | 6/20 [01:00<02:12,  9.46s/it][A
Cluster 3 - Grid Search Epochs:  35%|███▌      | 7/20 [01:05<01:44,  8.01s/it][A
Cluster 3 - Grid Search Epochs:  40%|████      | 8/20 [01:18<01:54,  9.57s/it][A
Cluster 3 - Grid Search Epochs:  45%|████▌     | 9/20 [01:22<01:29,  8.09s/it][A
Cluster 3 - Grid Search Epochs:  50%|█████     | 10/20 [01:27<01:09,  6.93s/it][A
Cluster 3 - Grid Search Epochs:  55%|██



===== Retraining on Train+Val and Testing for Cluster 0 =====


TypeError: empty(): argument 'size' failed to unpack the object at pos 2 with error "type must be tuple of ints,but got float"

In [None]:
print("\n========== CONTINUING WITH FINAL RETRAINING AND TESTING ==========")
retrain_and_test_best_models(train_data, val_data, test_data, df_reviews)
print("\n========== PIPELINE COMPLETE ==========")



===== Retraining on Train+Val and Testing for Cluster 0 =====

===== Retraining on Train+Val and Testing for Cluster 1 =====

===== Retraining on Train+Val and Testing for Cluster 2 =====

===== Retraining on Train+Val and Testing for Cluster 3 =====



# Summary of Files

Each cluster’s output will be stored under:
/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/Model Results/NCF Hybrid Embedding/Cluster_{cluster_id}/

Created Files and Their Paths:

**grid_search_log.csv**
→ Cluster_{cluster_id}/grid_search_log.csv  
→ Full results from grid search (one row per config)

**val_results_by_cluster.csv**
→ Cluster_{cluster_id}/val_results_by_cluster.csv  
→ Best config (lowest validation MSE) and its metrics

**test_results_by_cluster.csv**
→ Cluster_{cluster_id}/test_results_by_cluster.csv  
→ Test results after final retraining using the best config

**results_by_cluster.txt**
→ Cluster_{cluster_id}/results_by_cluster.txt  
→ Human-readable summary of validation + test metrics

**best_custom_model.pt**
→ Cluster_{cluster_id}/best_custom_model.pt  
→ PyTorch weights of the best CustomNCF model

**best_random_model.pt**
→ Cluster_{cluster_id}/best_random_model.pt  
→ PyTorch weights of the best RandomNCF model

**best_ensemble_model.pt**
→ Cluster_{cluster_id}/best_ensemble_model.pt  
→ PyTorch weights of the best FullyConnectedNCF model

**cust_emb_{dim}.csv**
→ Cluster_{cluster_id}/cache/cust_emb_{dim}.csv  
→ Cached custom user embeddings (PCA-reduced, per embedding_dim)

**prod_emb_{dim}.csv**
→ Cluster_{cluster_id}/cache/prod_emb_{dim}.csv  
→ Cached custom product embeddings (PCA-reduced, per embedding_dim)

(Note: replace {cluster_id} with the cluster index, and {dim} with the embedding dimension used.)


# Check Stored Validation and Testing Results

In [3]:
base_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/Model Results/NCF Hybrid Embedding'
cluster_ids = sorted([int(d.split('_')[1]) for d in os.listdir(base_dir) if d.startswith('Cluster_')])

# Collect results
val_results_all = []
test_results_all = []

for cid in cluster_ids:
    cluster_dir = os.path.join(base_dir, f'Cluster_{cid}')

    # Load val results
    val_path = os.path.join(cluster_dir, 'val_results_by_cluster.csv')
    if os.path.exists(val_path):
        df_val = pd.read_csv(val_path)
        val_results_all.append(df_val)

    # Load test results
    test_path = os.path.join(cluster_dir, 'test_results_by_cluster.csv')
    if os.path.exists(test_path):
        df_test = pd.read_csv(test_path)
        test_results_all.append(df_test)

# Combine into two dataframes
val_df = pd.concat(val_results_all, ignore_index=True)
test_df = pd.concat(test_results_all, ignore_index=True)

print("Validation Results by Cluster:")
display(val_df)

print("Test Results by Cluster:")
display(test_df)

Validation Results by Cluster:


Unnamed: 0,cluster_id,mse,rmse,ndcg@10,precision@10,recall@10,f1@10,custom_embedding_dim,random_embedding_dim,custom_dropout_rate,random_dropout_rate,custom_learning_rate,random_learning_rate,fc_dropout_rate,fc_learning_rate,batch_size,num_epochs
0,0,0.77575,0.880766,0.997122,0.888622,0.896832,0.891358,5,8,0.3,0.0,0.001,0.001,0.3,0.001,512,40
1,1,2.091425,1.446176,1.0,0.642157,0.642157,0.642157,5,5,0.0,0.0,0.001,0.001,0.3,0.001,128,40
2,2,0.624575,0.790301,0.978815,0.906701,0.953757,0.922668,5,8,0.3,0.0,0.001,0.001,0.0,0.001,128,40
3,3,0.521583,0.722207,1.0,0.933949,0.933949,0.933949,6,5,0.0,0.0,0.001,0.001,0.0,0.001,128,20


Test Results by Cluster:


Unnamed: 0,cluster_id,mse,rmse,ndcg@10,precision@10,recall@10,f1@10,custom_embedding_dim,random_embedding_dim,custom_dropout_rate,random_dropout_rate,custom_learning_rate,random_learning_rate,fc_dropout_rate,fc_learning_rate,batch_size,num_epochs
0,0,0.99622,0.998108,0.972035,0.885086,0.954904,0.908728,5.0,8.0,0.3,0.0,0.001,0.001,0.3,0.001,512.0,40.0
1,1,2.715702,1.647939,0.972531,0.617072,0.700048,0.644731,5.0,5.0,0.0,0.0,0.001,0.001,0.3,0.001,128.0,40.0
2,2,0.884973,0.94073,0.972299,0.896678,0.962527,0.920061,5.0,8.0,0.3,0.0,0.001,0.001,0.0,0.001,128.0,40.0
3,3,0.767207,0.875903,0.986772,0.933419,0.964502,0.94378,6.0,5.0,0.0,0.0,0.001,0.001,0.0,0.001,128.0,20.0
