# Neural Collaborative Filtering with Custom Embeddings

In [None]:
# Mount Google Drive at /content/drive; the data and project paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import time
import copy
import torch
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F

# Set random seed
# The same value seeds the Python, NumPy and PyTorch generators.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Google Drive locations: data_dir holds the input CSVs, project_dir is the
# root under which model results, caches and weights are written.
data_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/data'
project_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon'

# Step 1: Load dataset

Previously, in step1_data_preprocessing.ipynb, we split the df_reviews dataset into training, validation and testing samples for each user in chronological order: the earliest 70% of each user's interactions are used for training, the next 15% for validation and the last 15% for testing.

Now we will load the train, val and test CSVs and the filtered_and_clustered review dataset with engineered features and cluster assignments.

In [None]:
# Per-user chronological splits produced by the preprocessing notebook.
train_data = pd.read_csv(os.path.join(data_dir,"train_data.csv"))
test_data = pd.read_csv(os.path.join(data_dir,"test_data.csv"))
val_data = pd.read_csv(os.path.join(data_dir,"val_data.csv"))

# Full review table with engineered features and cluster assignments.
df_reviews = pd.read_csv(os.path.join(data_dir,"filtered_reviews_with_features_and_clusters.csv"))
print(df_reviews.head())

# Sanity checks: split sizes, available columns and sentiment label distribution.
print(f"Training Data Shape: {train_data.shape}")
print(f"Testing Data Shape: {test_data.shape}")
print(f"Validation Data Shape: {val_data.shape}")

print(train_data.columns)
print(test_data.columns)

print(df_reviews['sentiments'].value_counts())


   customer_id  product_id  product_parent  \
0        11960  B00LCJAW06       219600481   
1        11960  B008OTSEXY       682436048   
2        11960  B00KJ15KGY        32170248   
3        11960  B008ZL49WQ       614364353   
4        11960  B002WRGE5O       928204157   

                                       product_title product_category  \
0  Persian-Rugs T1007 Abstract Modern Area Rug Ca...        Furniture   
1  Flash Furniture High Back Black Ribbed Upholst...        Furniture   
2  Jackson Pollock Inspired Coffee Glass Table w/...        Furniture   
3                                  Eaze Lounge Chair        Furniture   
4         Walker Edison L-Shaped Glass Computer Desk        Furniture   

   star_rating  helpful_votes  total_votes vine verified_purchase  ...  \
0            4              1            1    N                 Y  ...   
1            4              0            0    N                 Y  ...   
2            4              1            1    N               

# Filter out Customers and Products in test and val set that do not appear in training set

Prepares cluster-specific training, validation, and test data by:
- Extracting the subset of each dataframe corresponding to the given cluster ID.
- Mapping customer_id and product_id to 0-based index values within the cluster.
- Filtering val/test sets to only include users/items present in the cluster's training set.

This prevents cold-start problems during validation and testing. If a customer or product appears only in the validation or test set, the model has never seen it before and cannot generate a valid prediction. We therefore remove any row in the validation or test sets whose customer or product is absent from the training set.

We do not remove any rows from the training set.

In [None]:
def prepare_cluster_data(cluster_id, train_df, val_df=None, test_df=None):
    """Slice the splits down to one cluster and attach 0-based user/item indices.

    Args:
        cluster_id: Value of the ``cluster`` column to keep.
        train_df: Training frame with ``cluster``, ``customer_id`` and ``product_id`` columns.
        val_df: Optional validation frame with the same columns.
        test_df: Optional test frame with the same columns.

    Returns:
        ``(train_cluster, val_cluster, test_cluster, user2idx, item2idx)``.
        The index dictionaries are built from the cluster's training rows in
        order of first appearance. Validation/test rows whose customer or
        product is missing from those dictionaries are dropped; a split that
        was passed as ``None`` is returned as ``None``.
    """
    def _first_seen_index(column):
        # Enumerate distinct values in the order they first occur.
        return {value: position for position, value in enumerate(column.unique())}

    def _attach_indices(frame):
        frame['user_idx'] = frame['customer_id'].map(user2idx)
        frame['item_idx'] = frame['product_id'].map(item2idx)
        return frame

    def _known_rows_only(split_df):
        # Keep only rows of this cluster whose user AND item were seen in training.
        if split_df is None:
            return None
        subset = split_df[split_df['cluster'] == cluster_id].reset_index(drop=True)
        seen = subset['customer_id'].isin(user2idx) & subset['product_id'].isin(item2idx)
        return _attach_indices(subset[seen].reset_index(drop=True))

    train_cluster = train_df[train_df['cluster'] == cluster_id].reset_index(drop=True)
    user2idx = _first_seen_index(train_cluster['customer_id'])
    item2idx = _first_seen_index(train_cluster['product_id'])
    train_cluster = _attach_indices(train_cluster)

    val_cluster = _known_rows_only(val_df)
    test_cluster = _known_rows_only(test_df)

    return train_cluster, val_cluster, test_cluster, user2idx, item2idx

# Create Data Loader
Defines ReviewsDataset, a PyTorch Dataset class used for model training and evaluation. Each item is a dictionary with the keys `customer_id`, `product_id` and `rating`, holding the row's `user_idx`, `item_idx` and `star_rating` respectively.

In [None]:
class ReviewsDataset(Dataset):
    """Map-style dataset yielding (user index, item index, rating) samples.

    The three columns are converted to tensors once at construction time so
    that ``__getitem__`` is a plain tensor lookup instead of building a
    mixed-dtype pandas row for every sample.

    Args:
        data: DataFrame with ``user_idx``, ``item_idx`` and ``star_rating`` columns.
    """

    def __init__(self, data):
        self.data = data
        # Explicit numpy dtypes keep an empty frame (object-dtype columns) convertible.
        self._user_idx = torch.tensor(data['user_idx'].to_numpy(dtype='int64'), dtype=torch.long)
        self._item_idx = torch.tensor(data['item_idx'].to_numpy(dtype='int64'), dtype=torch.long)
        self._rating = torch.tensor(data['star_rating'].to_numpy(dtype='float32'), dtype=torch.float)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Each value is a 0-d tensor; the default collate function stacks them per batch.
        return {
            'customer_id': self._user_idx[idx],
            'product_id': self._item_idx[idx],
            'rating': self._rating[idx]
        }


# Building Customer Embeddings

The original **df_reviews** will be used to build the custom customer and product embeddings. These custom embeddings are meant to reflect historical behaviour or characteristics of customers/products

The customer embeddings will be constructed by aggregating features within df_reviews by customer_id to find a customer's:
 - Purchase Frequency (Indicate how active a customer is)
 - Time Since Last Purchase (Indicate how active a customer is)
 - Average Star Rating (Overall Customer satisfaction across all of his purchases)
 - Total Vine Reviews (Measure of Credibility of his Reviews)
 - Total Helpful Votes (Measure the Credibility of his Reviews)
 - Total Votes (Measure the Credibility of his Reviews)
 - Average Sentiment (Overall Customer satisfaction across all of his purchases)

These embeddings are more informative than the randomly initialized embeddings used in typical recommendation systems. The choice of specific features injects domain knowledge into the model.

Within the build_customer_embeddings function, features are scaled and reduced via PCA to match the specified embedding_dim. They are also cached (saved as .npy files) for reuse, e.g. cust_emb_cluster_0_6.npy



In [None]:
def build_customer_embeddings(df, embedding_dim, cache_path):
    """Build (or load from cache) one feature-based embedding row per customer.

    Features are aggregated per ``customer_id``, standardised, and reduced
    with PCA when ``embedding_dim`` is smaller than the number of features.
    Rows follow the pandas ``groupby('customer_id')`` key order; the returned
    array carries no id column, so callers must align rows themselves.

    Args:
        df: Review-level frame with the columns named in the aggregation below.
        embedding_dim: Desired embedding width (at most the number of features).
        cache_path: ``.npy`` file used to load/store the result.

    Returns:
        ``numpy.ndarray`` of shape ``(n_customers, embedding_dim)``.

    Raises:
        ValueError: If ``embedding_dim`` exceeds the number of available features.
    """
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:  # a bare file name has no directory to create
        os.makedirs(cache_dir, exist_ok=True)

    if os.path.exists(cache_path):
        cached = np.load(cache_path)
        # Only trust the cache when it matches this request; a file written for
        # another embedding_dim or another set of customers is rebuilt below.
        if cached.shape == (df['customer_id'].nunique(), embedding_dim):
            return cached

    feature_spec = {
        'monthly_purchase_frequency': 'mean',
        'time_since_last_purchase': 'mean',
        'star_rating': 'mean',
        'vine': lambda x: (x == "Y").sum(),
        'helpful_votes': 'sum',
        'total_votes': 'sum',
        'sentiments': lambda x: (x == 'positive').mean()
    }

    # Fail fast, before the (potentially expensive) aggregation.
    num_features = len(feature_spec)
    if embedding_dim > num_features:
        raise ValueError(f"Requested embedding_dim={embedding_dim}, but only {num_features} features are available.")

    cust_features = df.groupby('customer_id').agg(feature_spec).reset_index()

    X = cust_features.drop(columns=['customer_id'])
    X_scaled = StandardScaler().fit_transform(X)

    # Only apply PCA if embedding_dim < num_features
    if embedding_dim < num_features:
        pca = PCA(n_components=embedding_dim)
        embeddings = pca.fit_transform(X_scaled)
    else:
        embeddings = X_scaled

    np.save(cache_path, embeddings)
    return embeddings


## Building Product Embeddings

The product embeddings will be constructed by aggregating features within df_reviews by product_id to find a product's:
- Mean Star Rating that it received (Customer satisfaction)
- Total Helpful Votes given to all its reviews (Quality of customer feedback)
- Total Votes given to all its reviews (Review Engagement by customers)
- Average sentiment (1 is Positive and 0 is Negative)
- Total Number of Vine Reviews (Number of Credible Reviews)
- Total Product Sales (Demand for Product)

Within the build_product_embeddings function, features are scaled and reduced via PCA to match the specified embedding_dim. They are also cached (saved as .npy files) for reuse, e.g. prod_emb_cluster_0_6.npy


In [None]:
def build_product_embeddings(df, embedding_dim, cache_path):
    """Build (or load from cache) one feature-based embedding row per product.

    Features are aggregated per ``product_id``, standardised, and reduced with
    PCA when ``embedding_dim`` is smaller than the number of features. Rows
    follow the pandas ``groupby('product_id')`` key order; the returned array
    carries no id column, so callers must align rows themselves.

    Args:
        df: Review-level frame with the columns named in the aggregation below.
        embedding_dim: Desired embedding width (at most the number of features).
        cache_path: ``.npy`` file used to load/store the result.

    Returns:
        ``numpy.ndarray`` of shape ``(n_products, embedding_dim)``.

    Raises:
        ValueError: If ``embedding_dim`` exceeds the number of available features.
    """
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:  # a bare file name has no directory to create
        os.makedirs(cache_dir, exist_ok=True)

    if os.path.exists(cache_path):
        cached = np.load(cache_path)
        # Only trust the cache when it matches this request; a file written for
        # another embedding_dim or another set of products is rebuilt below.
        if cached.shape == (df['product_id'].nunique(), embedding_dim):
            return cached

    feature_spec = {
        'star_rating': 'mean',
        'helpful_votes': 'sum',
        'total_votes': 'sum',
        'sentiments': lambda x: (x == 'positive').mean(),
        'vine': lambda x: (x == 'Y').sum(),
        'product_id': 'count'  # number of review rows per product, renamed to sales_volume
    }

    # Fail fast, before the (potentially expensive) aggregation.
    num_features = len(feature_spec)
    if embedding_dim > num_features:
        raise ValueError(f"Requested embedding_dim={embedding_dim}, but only {num_features} features are available.")

    prod_features = (
        df.groupby('product_id')
        .agg(feature_spec)
        .rename(columns={'product_id': 'sales_volume'})
        .reset_index()
    )

    X = prod_features.drop(columns=['product_id'])
    X_scaled = StandardScaler().fit_transform(X)

    # Only apply PCA if embedding_dim < num_features
    if embedding_dim < num_features:
        pca = PCA(n_components=embedding_dim)
        embeddings = pca.fit_transform(X_scaled)
    else:
        embeddings = X_scaled

    np.save(cache_path, embeddings)
    return embeddings


# Define the NCF model with GMF and MLP

NCF class implements a Neural Collaborative Filtering model combining:

- **GMF (Generalized Matrix Factorization)**: Element-wise product of user and item embeddings

- **MLP (Multi-Layer Perceptron)**: Concatenated embeddings passed through FC layers

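- **Final prediction**: Merges GMF and MLP outputs and maps the final linear output `x` to `4x + 1`, targeting the 1 to 5 rating scale (no sigmoid or clamp is applied, so predictions can fall outside that range)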

## Aggregate the outputs of GMF and MLP by **concatenation**

In [None]:
class NCF(nn.Module):
    """Neural Collaborative Filtering model with a GMF branch and an MLP branch.

    Both branches start from pretrained, trainable (``freeze=False``) embedding
    tables. The GMF branch multiplies user and item vectors element-wise; the
    MLP branch concatenates them and applies two Linear-BatchNorm-ReLU-Dropout
    stages. The two branch outputs are concatenated and passed through a final
    Linear-BatchNorm-ReLU-Dropout-Linear head.

    Args:
        cust_emb_gmf: Float tensor ``(n_users, embedding_dim)`` for the GMF user table.
        prod_emb_gmf: Float tensor ``(n_items, embedding_dim)`` for the GMF item table.
        cust_emb_mlp: Float tensor ``(n_users, embedding_dim)`` for the MLP user table.
        prod_emb_mlp: Float tensor ``(n_items, embedding_dim)`` for the MLP item table.
        embedding_dim: Width of every embedding table; sizes the Linear layers.
        dropout_rate: Dropout probability used after each hidden block.
    """

    def __init__(self, cust_emb_gmf, prod_emb_gmf, cust_emb_mlp, prod_emb_mlp, embedding_dim, dropout_rate=0.3):
        super().__init__()
        # GMF Components
        self.customer_embeddings_gmf = nn.Embedding.from_pretrained(cust_emb_gmf, freeze=False)
        self.product_embeddings_gmf = nn.Embedding.from_pretrained(prod_emb_gmf, freeze=False)
        # MLP Components
        self.customer_embeddings_mlp = nn.Embedding.from_pretrained(cust_emb_mlp, freeze=False)
        self.product_embeddings_mlp = nn.Embedding.from_pretrained(prod_emb_mlp, freeze=False)

        self.fc1_mlp = nn.Linear(2 * embedding_dim, 128)
        self.bn1_mlp = nn.BatchNorm1d(128)
        self.dropout1_mlp = nn.Dropout(dropout_rate)

        self.fc2_mlp = nn.Linear(128, 64)
        self.bn2_mlp = nn.BatchNorm1d(64)
        self.dropout2_mlp = nn.Dropout(dropout_rate)
        # Final layers: input is the GMF vector (embedding_dim) plus the MLP output (64)
        self.fc1_combined = nn.Linear(embedding_dim + 64, 128)
        self.bn1_combined = nn.BatchNorm1d(128)
        self.dropout1_combined = nn.Dropout(dropout_rate)

        self.fc2_combined = nn.Linear(128, 1)

    def forward(self, customer_id, product_id):
        """Predict ratings for index tensors of shape ``(batch,)``.

        Returns:
            Float tensor of shape ``(batch,)``; a batch of one stays 1-D.
        """
        # GMF
        cust_gmf = self.customer_embeddings_gmf(customer_id)
        prod_gmf = self.product_embeddings_gmf(product_id)
        gmf_output = cust_gmf * prod_gmf

        # MLP
        cust_mlp = self.customer_embeddings_mlp(customer_id)
        prod_mlp = self.product_embeddings_mlp(product_id)
        mlp_input = torch.cat([cust_mlp, prod_mlp], dim=-1)

        mlp_output = F.relu(self.bn1_mlp(self.fc1_mlp(mlp_input)))
        mlp_output = self.dropout1_mlp(mlp_output)

        mlp_output = F.relu(self.bn2_mlp(self.fc2_mlp(mlp_output)))
        mlp_output = self.dropout2_mlp(mlp_output)

        # Combine GMF and MLP
        combined_input = torch.cat([gmf_output, mlp_output], dim=-1)
        combined_output = F.relu(self.bn1_combined(self.fc1_combined(combined_input)))
        combined_output = self.dropout1_combined(combined_output)

        # Final layer & affine rescaling towards the 1-5 rating range.
        # NOTE(review): no sigmoid/clamp is applied, so the result is unbounded;
        # confirm whether a bounded output was intended.
        output = self.fc2_combined(combined_output)
        # Drop only the trailing feature axis: a bare squeeze() would also
        # remove the batch axis for a batch of one and return a 0-d tensor.
        return output.squeeze(-1) * 4 + 1


# Evaluation Functions

- **ndcg_at_k**: Computes the Normalized Discounted Cumulative Gain (NDCG) at rank k for a single list of relevance. If the list contains fewer than k items, it will use actual_k = min(k, len(relevances)) to ensure fair computation.

- **mean_ndcg_user_at_k**: Computes the mean NDCG@k across all users by grouping predicted scores and relevance labels per user, sorting by prediction, and applying ndcg_at_k. For each user, their items are sorted by predicted scores, and NDCG is computed using `ndcg_at_k` with actual_k = min(k, len(user_items)).

- **mean_precision_user_at_k**: Computes the mean Precision@k across all users.
Precision@k is the proportion of relevant items (e.g., rating ≥ threshold) among the top-k predicted items for each user. For each user, top-k items are selected based on predicted scores. If the user has fewer than k items, actual_k = min(k, len(user_items)) is used.  
  Precision is calculated as:  
  `precision = (# of relevant items among top-k) / actual_k`  
  where an item is considered relevant if `rating ≥ threshold`.

- **mean_recall_user_at_k**: Computes the mean Recall@k across all users.
Recall@k is the proportion of a user's relevant items (rating ≥ threshold) that are retrieved in the top-k predicted list. For each user, top-k items are selected based on predicted scores, and recall is calculated as:  
  `recall = (# of relevant items among top-k) / total number of relevant items for the user`  
  actual_k = min(k, len(user_items)) is used to handle users with fewer than k items.

- **mean_f1_user_at_k**:  
  Computes the mean F1@k across all users, where F1 combines precision and recall.  
  For each user, top-k items are selected (using actual_k = min(k, len(user_items))), and F1 is calculated based on binarized relevance labels (`rating ≥ threshold`).  
  The predicted labels are assumed to be all 1s (i.e. every top-k item is predicted as relevant).

In [None]:
def ndcg_at_k(relevances, k):
    """Normalised Discounted Cumulative Gain at rank ``k`` for one ranked list.

    Args:
        relevances: Relevance grades in predicted rank order (best prediction first).
        k: Rank cutoff; the effective cutoff is ``min(k, len(relevances))``.

    Returns:
        ``DCG@k / IDCG@k`` using the ``2**rel - 1`` gain and ``log2(rank + 1)``
        discount, or ``0.0`` when the list is empty, ``k <= 0`` or the ideal
        DCG is zero.
    """
    relevances = np.asarray(relevances, dtype=np.float64)
    actual_k = min(k, len(relevances))
    if actual_k <= 0:
        return 0.0
    discounts = np.log2(np.arange(2, actual_k + 2))
    dcg = np.sum((2 ** relevances[:actual_k] - 1) / discounts)
    # The ideal ranking is taken over the FULL list and then cut at k. Sorting
    # only the already-truncated top-k would ignore relevant items ranked
    # below the cutoff and overstate the score.
    ideal_relevances = np.sort(relevances)[::-1][:actual_k]
    idcg = np.sum((2 ** ideal_relevances - 1) / discounts)
    return dcg / idcg if idcg > 0 else 0.0

def mean_ndcg_user_at_k(all_users, all_preds, all_labels, k=10):
    """Average per-user NDCG@k.

    Args:
        all_users: User id for every scored interaction.
        all_preds: Predicted score for every interaction (same order).
        all_labels: True relevance for every interaction (same order).
        k: Rank cutoff forwarded to ``ndcg_at_k``.

    Returns:
        Mean of the per-user NDCG values, or ``0.0`` when there are no interactions.
    """
    scored_by_user = defaultdict(list)
    for uid, score, relevance in zip(all_users, all_preds, all_labels):
        scored_by_user[uid].append((score, relevance))

    if not scored_by_user:
        return 0.0

    per_user_ndcg = []
    for scored in scored_by_user.values():
        # Rank each user's items by predicted score, highest first.
        ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
        per_user_ndcg.append(ndcg_at_k([pair[1] for pair in ranked], k))
    return np.mean(per_user_ndcg)

def mean_precision_user_at_k(all_users, all_preds, all_labels, k=10, threshold=4):
    """Average per-user Precision@k.

    An item counts as relevant when its label is ``>= threshold``. For each
    user the cutoff is ``min(k, number of that user's items)``.

    Returns:
        Mean of the per-user precision values, or ``0.0`` when there are no interactions.
    """
    scored_by_user = defaultdict(list)
    for uid, score, rating in zip(all_users, all_preds, all_labels):
        scored_by_user[uid].append((score, rating))

    if not scored_by_user:
        return 0.0

    precisions = []
    for scored in scored_by_user.values():
        ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
        cutoff = min(k, len(ranked))
        if cutoff > 0:
            hits = [1 if rating >= threshold else 0 for _, rating in ranked[:cutoff]]
            precisions.append(np.sum(hits) / cutoff)
        else:
            precisions.append(0)
    return np.mean(precisions)

def mean_recall_user_at_k(all_users, all_preds, all_labels, k=10, threshold=4):
    """Average per-user Recall@k.

    An item counts as relevant when its label is ``>= threshold``. Users with
    no relevant item contribute a recall of ``0.0`` to the average.

    Returns:
        Mean of the per-user recall values, or ``0.0`` when there are no interactions.
    """
    scored_by_user = defaultdict(list)
    for uid, score, rating in zip(all_users, all_preds, all_labels):
        scored_by_user[uid].append((score, rating))

    if not scored_by_user:
        return 0.0

    recalls = []
    for scored in scored_by_user.values():
        ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
        cutoff = min(k, len(ranked))

        relevant_total = np.sum([1 if rating >= threshold else 0 for _, rating in scored])
        if relevant_total == 0:
            recalls.append(0.0)
            continue

        relevant_in_top = np.sum([1 if rating >= threshold else 0 for _, rating in ranked[:cutoff]])
        recalls.append(relevant_in_top / relevant_total)
    return np.mean(recalls)

def mean_f1_user_at_k(all_users, all_preds, all_labels, k=10, threshold=4):
    """Average per-user F1 over each user's top-k items.

    Every top-k item is treated as predicted relevant and compared with the
    binarised labels (``label >= threshold``). With all-positive predictions
    there are no false negatives inside the slice, so binary F1 reduces to
    ``2 * tp / (tp + k_eff)`` where ``tp`` is the number of relevant items in
    the top ``k_eff = min(k, n_items)``. This equals
    ``f1_score(y_true, [1] * k_eff, zero_division=0)`` without the per-user
    library call.

    NOTE(review): recall is measured inside the top-k slice only, so relevant
    items ranked below the cutoff do not lower this score.

    Returns:
        Mean of the per-user F1 values, or ``0.0`` when there are no interactions.
    """
    user_data = defaultdict(list)
    for u, p, l in zip(all_users, all_preds, all_labels):
        user_data[u].append((p, l))

    f1_list = []
    for entries in user_data.values():
        entries_sorted = sorted(entries, key=lambda x: x[0], reverse=True)
        actual_k = min(k, len(entries_sorted))
        true_positives = sum(int(l >= threshold) for _, l in entries_sorted[:actual_k])
        # tp == 0 (including an empty slice) gives F1 = 0, matching zero_division=0.
        f1 = 2.0 * true_positives / (true_positives + actual_k) if true_positives > 0 else 0.0
        f1_list.append(f1)
    return np.mean(f1_list) if f1_list else 0.0

# Grid Search

The grid search algorithm here performs an exhaustive search to identify the best combination of hyperparameters (embedding_dim, learning_rate, batch_size, dropout_rate, num_epochs) for training the NCF model.

For each configuration:

1. Custom embeddings for users and products are generated using PCA on the training data based on the current embedding_dim.

2. A new NCF model is instantiated with the configuration parameters.

3. The model is trained on the training set and evaluated on the validation set.

4. The best model state (with lowest validation loss) is stored using early stopping.

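5. The configuration is remembered if its average validation MSE across clusters is lower than that of all previous configurations. Note that the per-cluster weight files are written for every configuration, so each run overwrites the files of the previous one.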

It will then report the best-performing configuration, which we will use to train the final model on the combined training and validation data before evaluating it on our test data.


**Embeddings**

During grid search, both the customer embeddings and the product embeddings are built from the training data. Because embedding_dim changes between grid-search iterations, PCA must be redone for each new embedding dimension.

Note: The maximum embedding_dim for Grid Search is limited by the number of features that I used to form the customer_embedding and product_embedding respectively.


**run_grid_search()**

- Loops through all hyperparameter combinations (embedding_dim, learning_rate, batch_size, dropout_rate, num_epochs)

- Trains cluster model using train_data and evaluates using only val_data

- Logs each config + cluster result to grid_search_log.csv

- Returns best config (lowest avg MSE)


In [None]:
def train_and_evaluate_cluster(
    cluster_id, train_data, val_data, df_reviews, config, cache_dir, model_dir
):
    """Train one NCF model on a single cluster and score it on that cluster's validation rows.

    Args:
        cluster_id: Cluster to train on.
        train_data: Training frame (all clusters).
        val_data: Validation frame (all clusters).
        df_reviews: Unused; kept so existing callers keep working.
        config: Dict with ``embedding_dim``, ``learning_rate``, ``batch_size``,
            ``dropout_rate``, ``num_epochs`` and optionally ``patience`` (default 5).
        cache_dir: Directory for cached embedding ``.npy`` files.
        model_dir: Directory that receives ``model_cluster_{cluster_id}.pt``.

    Returns:
        Dict with the validation metrics of the best epoch (lowest validation
        MSE) plus the config values. Metrics are NaN if no epoch improved on
        the initial ``inf`` (e.g. an empty validation set).

    Raises:
        ValueError: If an embedding table cannot be built with one row per
            user/item of the cluster.
    """
    import torch.optim as optim
    from torch.utils.data import DataLoader

    embedding_dim = config['embedding_dim']
    train_cluster, val_cluster, _, user2idx, item2idx = prepare_cluster_data(
        cluster_id, train_data, val_data, test_df=None
    )

    def _aligned_embeddings(builder, key_col, idx_map, cache_path):
        """Build per-cluster embeddings and reorder rows so row i belongs to index i."""
        expected_shape = (len(idx_map), embedding_dim)
        table = builder(train_cluster, embedding_dim, cache_path)
        if table.shape != expected_shape and os.path.exists(cache_path):
            # Stale cache from another run or dataset: discard and rebuild once.
            os.remove(cache_path)
            table = builder(train_cluster, embedding_dim, cache_path)
        if table.shape != expected_shape:
            raise ValueError(
                f"Embedding table for '{key_col}' has shape {table.shape}, expected {expected_shape}."
            )
        # The builders emit rows in groupby key order, whereas user2idx/item2idx
        # number ids by first appearance. Scatter every row to its index so
        # nn.Embedding looks up the features of the right user/item.
        group_keys = train_cluster.groupby(key_col).size().index
        target_rows = group_keys.map(idx_map).to_numpy()
        aligned = np.empty_like(table)
        aligned[target_rows] = table
        return torch.tensor(aligned, dtype=torch.float32)

    cust_cache_path = os.path.join(cache_dir, f"cust_emb_cluster_{cluster_id}_{config['embedding_dim']}_train.npy")
    prod_cache_path = os.path.join(cache_dir, f"prod_emb_cluster_{cluster_id}_{config['embedding_dim']}_train.npy")

    cust_emb = _aligned_embeddings(build_customer_embeddings, 'customer_id', user2idx, cust_cache_path)
    prod_emb = _aligned_embeddings(build_product_embeddings, 'product_id', item2idx, prod_cache_path)

    model = NCF(cust_emb, prod_emb, cust_emb.clone(), prod_emb.clone(),
                embedding_dim=embedding_dim,
                dropout_rate=config['dropout_rate']).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])

    batch_size = config['batch_size']
    # BatchNorm1d cannot run in training mode on a single sample, so drop the
    # trailing batch only when it would contain exactly one row.
    drop_singleton = len(train_cluster) > 1 and len(train_cluster) % batch_size == 1
    train_loader = DataLoader(ReviewsDataset(train_cluster), batch_size=batch_size, shuffle=True,
                              num_workers=2, drop_last=drop_singleton)
    val_loader = DataLoader(ReviewsDataset(val_cluster), batch_size=batch_size, num_workers=2)

    best_val_loss = float('inf')
    best_model_state = None
    patience_counter = 0
    nan = float('nan')
    best_val_metrics = {
        'cluster_id': cluster_id,
        'mse': nan,
        'rmse': nan,
        'ndcg@10': nan,
        'precision@10': nan,
        'recall@10': nan,
        'f1@10': nan,
        **config
    }

    for epoch in range(config['num_epochs']):
        model.train()
        for batch in train_loader:
            user = batch['customer_id'].to(device)
            item = batch['product_id'].to(device)
            label = batch['rating'].to(device)

            optimizer.zero_grad()
            output = model(user, item)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_preds, val_labels, val_users = [], [], []
        with torch.no_grad():
            for batch in val_loader:
                user = batch['customer_id'].to(device)
                item = batch['product_id'].to(device)
                label = batch['rating'].to(device)
                # reshape(-1) keeps a 1-D result even for a batch of one.
                output = model(user, item).reshape(-1)

                val_preds.extend(output.cpu().numpy())
                val_labels.extend(label.cpu().numpy())
                val_users.extend(user.cpu().numpy())

        mse_val = np.mean((np.array(val_preds) - np.array(val_labels))**2)

        if mse_val < best_val_loss:
            best_val_loss = mse_val
            best_model_state = copy.deepcopy(model.state_dict())
            patience_counter = 0
            # Ranking metrics are only reported for the best epoch, so they are
            # computed only when the epoch improves.
            best_val_metrics = {
                'cluster_id': cluster_id,
                'mse': mse_val,
                'rmse': np.sqrt(mse_val),
                'ndcg@10': mean_ndcg_user_at_k(val_users, val_preds, val_labels, k=10),
                'precision@10': mean_precision_user_at_k(val_users, val_preds, val_labels, k=10),
                'recall@10': mean_recall_user_at_k(val_users, val_preds, val_labels, k=10),
                'f1@10': mean_f1_user_at_k(val_users, val_preds, val_labels, k=10),
                **config
            }
        else:
            patience_counter += 1
            if patience_counter >= config.get('patience', 5):
                break

    # NOTE: the file name does not encode the config, so each configuration
    # overwrites the weights written by the previous one for this cluster.
    os.makedirs(model_dir, exist_ok=True)
    if best_model_state is not None:
        weight_path = os.path.join(model_dir, f"model_cluster_{cluster_id}.pt")
        torch.save(best_model_state, weight_path)

    return best_val_metrics

# Run grid search over all clusters
def run_grid_search(train_data, val_data, df_reviews, param_grid, cache_dir, model_dir, log_path):
    """Exhaustively evaluate every hyperparameter combination on every cluster.

    Each combination is trained per cluster via ``train_and_evaluate_cluster``
    and one row per (config, cluster) is appended to the CSV at ``log_path``.

    Args:
        train_data: Training frame with a ``cluster`` column.
        val_data: Validation frame.
        df_reviews: Forwarded unchanged to ``train_and_evaluate_cluster``.
        param_grid: Dict mapping hyperparameter name to a list of candidate values.
        cache_dir: Directory for cached embeddings.
        model_dir: Directory for model weights.
        log_path: CSV file that is (re)created and filled with per-cluster results.

    Returns:
        ``(best_config, best_cluster_val_results)``: the config with the lowest
        mean validation MSE across clusters and its per-cluster metric dicts.
        Both are ``None`` when no config produced a finite average MSE.
    """
    import itertools
    import csv
    import time

    log_dir = os.path.dirname(log_path)
    if log_dir:  # a bare file name has no directory to create
        os.makedirs(log_dir, exist_ok=True)

    param_names = list(param_grid.keys())
    param_combos = list(itertools.product(*param_grid.values()))
    cluster_ids = sorted(train_data['cluster'].unique())
    fieldnames = ["cluster_id"] + param_names + ["mse", "rmse", "ndcg@10", "precision@10", "recall@10", "f1@10", "train_time_sec"]

    best_config = None
    best_cluster_val_results = None
    best_avg_mse = float('inf')

    # The context manager closes the log even if a training run raises.
    with open(log_path, mode='w', newline='') as log_file:
        logger = csv.DictWriter(log_file, fieldnames=fieldnames)
        logger.writeheader()

        for values in param_combos:
            config = dict(zip(param_names, values))
            all_metrics = []
            print(f"Running config: {config}")

            for cluster_id in cluster_ids:
                start_time = time.time()
                metrics = train_and_evaluate_cluster(cluster_id, train_data, val_data, df_reviews, config, cache_dir, model_dir)
                metrics['train_time_sec'] = round(time.time() - start_time, 2)
                logger.writerow(metrics)
                # Flush per row so finished results survive an interrupted session.
                log_file.flush()
                all_metrics.append(metrics)

            # A cluster without an MSE counts as NaN, so that config is never selected.
            avg_mse = np.mean([m.get('mse', np.nan) for m in all_metrics])
            if avg_mse < best_avg_mse:
                best_avg_mse = avg_mse
                best_config = config
                best_cluster_val_results = all_metrics

    return best_config, best_cluster_val_results

### Final Model Training

We will retrain the NCF model using the optimal hyperparameters identified through Grid Search, this time on the combined training and validation data. Lastly, the model is then evaluated on the test data.

**Embeddings**

The final model will be initialized using customer and product embeddings that are formed from (train_data + val_data), as we want our final model to have the most informed embeddings for evaluation on the test set.


**final_training_all_clusters()**
- Trains NCF on final_train = train_data + val_data for each cluster
- Use best config from grid search
- Saves final test results to "test_results_by_cluster.csv"
- Caches final embeddings and save final model weights


In [None]:
def final_training_all_clusters(final_train, test_data, df_reviews, config, cache_dir, model_dir, result_dir):
    """Retrain one NCF model per cluster on the final training data and score it on the test split.

    Args:
        final_train: Combined training frame with a ``cluster`` column.
        test_data: Test frame; rows with users/items unseen in training are dropped.
        df_reviews: Unused; kept so existing callers keep working.
        config: Dict with ``embedding_dim``, ``learning_rate``, ``batch_size``,
            ``dropout_rate`` and ``num_epochs``.
        cache_dir: Directory for cached embedding ``.npy`` files.
        model_dir: Directory that receives ``model_cluster_{cluster_id}.pt``.
        result_dir: Directory that receives ``test_results_by_cluster.csv``.

    Returns:
        None. Per-cluster test metrics are written to CSV.

    Raises:
        ValueError: If an embedding table cannot be built with one row per
            user/item of the cluster.
    """
    os.makedirs(result_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)
    embedding_dim = config['embedding_dim']
    batch_size = config['batch_size']
    test_records = []

    def _aligned_embeddings(builder, frame, key_col, idx_map, cache_path):
        """Build per-cluster embeddings and reorder rows so row i belongs to index i."""
        expected_shape = (len(idx_map), embedding_dim)
        table = builder(frame, embedding_dim, cache_path)
        if table.shape != expected_shape and os.path.exists(cache_path):
            # The cache file name does not encode embedding_dim, so a file from
            # an earlier run may not fit this config: discard and rebuild once.
            os.remove(cache_path)
            table = builder(frame, embedding_dim, cache_path)
        if table.shape != expected_shape:
            raise ValueError(
                f"Embedding table for '{key_col}' has shape {table.shape}, expected {expected_shape}."
            )
        # The builders emit rows in groupby key order, whereas user2idx/item2idx
        # number ids by first appearance. Scatter every row to its index so
        # nn.Embedding looks up the features of the right user/item.
        group_keys = frame.groupby(key_col).size().index
        target_rows = group_keys.map(idx_map).to_numpy()
        aligned = np.empty_like(table)
        aligned[target_rows] = table
        return torch.tensor(aligned, dtype=torch.float32)

    for cluster_id in sorted(final_train['cluster'].unique()):
        print(f"Retraining final model for cluster {cluster_id}...")

        train_cluster, _, test_cluster, user2idx, item2idx = prepare_cluster_data(cluster_id, final_train, val_df=None, test_df=test_data)

        cust_cache_path = os.path.join(cache_dir, f"cust_emb_cluster_{cluster_id}_final.npy")
        prod_cache_path = os.path.join(cache_dir, f"prod_emb_cluster_{cluster_id}_final.npy")

        cust_emb = _aligned_embeddings(build_customer_embeddings, train_cluster, 'customer_id', user2idx, cust_cache_path)
        prod_emb = _aligned_embeddings(build_product_embeddings, train_cluster, 'product_id', item2idx, prod_cache_path)

        model = NCF(cust_emb, prod_emb, cust_emb.clone(), prod_emb.clone(),
                    embedding_dim=embedding_dim, dropout_rate=config['dropout_rate']).to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
        criterion = nn.MSELoss()

        # BatchNorm1d cannot run in training mode on a single sample, so drop
        # the trailing batch only when it would contain exactly one row.
        drop_singleton = len(train_cluster) > 1 and len(train_cluster) % batch_size == 1
        train_loader = DataLoader(ReviewsDataset(train_cluster), batch_size=batch_size, shuffle=True,
                                  drop_last=drop_singleton)
        for epoch in range(config['num_epochs']):
            model.train()
            for batch in train_loader:
                user = batch['customer_id'].to(device)
                item = batch['product_id'].to(device)
                rating = batch['rating'].to(device)
                optimizer.zero_grad()
                preds = model(user, item)
                loss = criterion(preds, rating)
                loss.backward()
                optimizer.step()

        # NOTE: same file name as the grid-search weights, which are overwritten here.
        torch.save(model.state_dict(), os.path.join(model_dir, f"model_cluster_{cluster_id}.pt"))

        test_loader = DataLoader(ReviewsDataset(test_cluster), batch_size=256)
        model.eval()
        preds, labels, users = [], [], []
        with torch.no_grad():
            for batch in test_loader:
                user = batch['customer_id'].to(device)
                item = batch['product_id'].to(device)
                rating = batch['rating'].to(device)
                # reshape(-1) keeps a 1-D result even for a batch of one.
                output = model(user, item).reshape(-1)
                preds.extend(output.cpu().numpy())
                labels.extend(rating.cpu().numpy())
                users.extend(user.cpu().numpy())

        mse = np.mean((np.array(preds) - np.array(labels)) ** 2)
        rmse = np.sqrt(mse)
        ndcg = mean_ndcg_user_at_k(users, preds, labels, k=10)
        precision = mean_precision_user_at_k(users, preds, labels, k=10)
        recall = mean_recall_user_at_k(users, preds, labels, k=10)
        f1 = mean_f1_user_at_k(users, preds, labels, k=10)

        record = {
            'cluster_id': cluster_id, 'mse': mse, 'rmse': rmse,
            'ndcg@10': ndcg, 'precision@10': precision, 'recall@10': recall, 'f1@10': f1,
            **config
        }
        test_records.append(record)

    pd.DataFrame(test_records).to_csv(os.path.join(result_dir, 'test_results_by_cluster.csv'), index=False)
    print("Finished retraining and evaluation on final dataset.")


# Evaluation

The model is evaluated on the test data.

In [None]:
def run_full_clustered_ncf_pipeline(train_data, val_data, test_data, df_reviews, param_grid,
                                    cache_dir, model_dir, result_dir, log_path):
    """Run grid search, persist the best validation metrics, then retrain and test.

    Steps:
        1. ``run_grid_search`` selects the best config on the validation split.
        2. The per-cluster validation metrics of that config are written to
           ``val_results_by_cluster.csv`` inside ``result_dir``.
        3. ``final_training_all_clusters`` retrains on train + validation rows
           and evaluates on ``test_data``.
    """
    print("\n======================= GRID SEARCH =======================")
    best_config, val_results = run_grid_search(
        train_data, val_data, df_reviews, param_grid, cache_dir, model_dir, log_path
    )

    # Persist the validation metrics of the winning configuration.
    val_path = os.path.join(result_dir, 'val_results_by_cluster.csv')
    validation_table = pd.DataFrame(val_results)
    os.makedirs(result_dir, exist_ok=True)
    validation_table.to_csv(val_path, index=False)
    print(f"Saved best validation metrics to {val_path}")

    print("\n================= FINAL MODEL TRAINING =================")
    # Training and validation rows are stacked with a fresh 0..n-1 index.
    final_train = pd.concat([train_data, val_data]).reset_index(drop=True)
    final_training_all_clusters(
        final_train, test_data, df_reviews, best_config, cache_dir, model_dir, result_dir
    )

    print("\nPipeline complete.")

In [None]:
param_grid = {                           # Total combinations: 2 * 1 * 2 * 2 * 2 = 16
        'embedding_dim': [5,6],          # keep both for low vs high capacity (6 = number of product features, the maximum build_product_embeddings accepts)
        'learning_rate': [0.001],           # pick one reliable value
        'batch_size': [128, 512],           # small vs large batch
        'dropout_rate': [0.0, 0.3],         # low vs regular dropout
        'num_epochs': [20, 40]              # moderate vs longer training
    }

# Single-combination grid for a quick smoke test of the pipeline; uncomment to use.
# param_grid = {
#     'embedding_dim': [6],
#     'learning_rate': [0.001],
#     'batch_size': [128],
#     'dropout_rate': [0.0],
#     'num_epochs': [20]
# }

# Run grid search, final retraining and test evaluation; all artefacts are
# written below "Model Results/NCF Custom Embedding/Clustered Model".
run_full_clustered_ncf_pipeline(
    train_data=train_data,
    val_data=val_data,
    test_data=test_data,
    df_reviews=df_reviews,
    param_grid=param_grid,
    cache_dir= project_dir + "/" + "Model Results/NCF Custom Embedding/Clustered Model/cache",
    model_dir= project_dir  + "/" + "Model Results/NCF Custom Embedding/Clustered Model/weights",
    result_dir= project_dir + "/" + "Model Results/NCF Custom Embedding/Clustered Model",
    log_path= project_dir  + "/" + "Model Results/NCF Custom Embedding/Clustered Model/grid_search_log.csv"
)



Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 128, 'dropout_rate': 0.0, 'num_epochs': 20}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 128, 'dropout_rate': 0.0, 'num_epochs': 40}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 128, 'dropout_rate': 0.3, 'num_epochs': 20}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 128, 'dropout_rate': 0.3, 'num_epochs': 40}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 512, 'dropout_rate': 0.0, 'num_epochs': 20}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 512, 'dropout_rate': 0.0, 'num_epochs': 40}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 512, 'dropout_rate': 0.3, 'num_epochs': 20}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 512, 'dropout_rate': 0.3, 'num_epochs': 40}
Running config: {'embedding_dim': 6, 'learning_

# Check Stored Validation and Testing Results

In [None]:
# step5_1_2 - NCF Model: Custom Embedding By Cluster
# Reload the CSVs written by the pipeline so results can be inspected without re-running training.
results_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/Model Results'

# Per-cluster validation metrics of the best grid-search configuration.
cust_emb_by_cluster_val_results = pd.read_csv(os.path.join(results_dir,"NCF Custom Embedding/Clustered Model/val_results_by_cluster.csv"))
print(cust_emb_by_cluster_val_results.shape)
print("==========step5_1_2 - NCF Model: Custom Embedding By Cluster Validation Results==========")
display(cust_emb_by_cluster_val_results.head())


# Per-cluster test metrics of the final retrained models.
cust_emb_by_cluster_test_results = pd.read_csv(os.path.join(results_dir,"NCF Custom Embedding/Clustered Model/test_results_by_cluster.csv"))
print(cust_emb_by_cluster_test_results.shape)
print("==========step5_1_2 - NCF Model: Custom Embedding By Cluster Test Results==========")
display(cust_emb_by_cluster_test_results.head())

# Unweighted mean over clusters (each cluster counts equally, regardless of size).
print("Average Validation Metrics:")
print(cust_emb_by_cluster_val_results.mean(numeric_only=True))

# Compute average metrics across clusters for test results
print("\nAverage Test Metrics:")
print(cust_emb_by_cluster_test_results.mean(numeric_only=True))

(4, 13)


Unnamed: 0,cluster_id,mse,rmse,ndcg@10,precision@10,recall@10,f1@10,embedding_dim,learning_rate,batch_size,dropout_rate,num_epochs,train_time_sec
0,0,0.817287,0.904039,0.996735,0.888622,0.896832,0.891358,5,0.001,512,0.0,40,195.19
1,1,2.007905,1.417006,1.0,0.642157,0.642157,0.642157,5,0.001,512,0.0,40,60.92
2,2,0.695792,0.834141,0.974441,0.906701,0.953757,0.922668,5,0.001,512,0.0,40,57.76
3,3,0.550171,0.741735,1.0,0.933949,0.933949,0.933949,5,0.001,512,0.0,40,138.14


(4, 12)


Unnamed: 0,cluster_id,mse,rmse,ndcg@10,precision@10,recall@10,f1@10,embedding_dim,learning_rate,batch_size,dropout_rate,num_epochs
0,0,1.004079,1.002037,0.972411,0.885086,0.954904,0.908728,5,0.001,512,0.0,40
1,1,3.231911,1.797752,0.973492,0.617072,0.700048,0.644731,5,0.001,512,0.0,40
2,2,1.263324,1.123977,0.970354,0.896678,0.962527,0.920061,5,0.001,512,0.0,40
3,3,1.100766,1.049174,0.986349,0.933419,0.964502,0.94378,5,0.001,512,0.0,40


Average Validation Metrics:
cluster_id          1.500000
mse                 1.017789
rmse                0.974230
ndcg@10             0.992794
precision@10        0.842857
recall@10           0.856674
f1@10               0.847533
embedding_dim       5.000000
learning_rate       0.001000
batch_size        512.000000
dropout_rate        0.000000
num_epochs         40.000000
train_time_sec    113.002500
dtype: float64

Average Test Metrics:
cluster_id         1.500000
mse                1.650020
rmse               1.243235
ndcg@10            0.975652
precision@10       0.833064
recall@10          0.895495
f1@10              0.854325
embedding_dim      5.000000
learning_rate      0.001000
batch_size       512.000000
dropout_rate       0.000000
num_epochs        40.000000
dtype: float64
