# Neural Collaborative Filtering with Randomly Intialized Embeddings

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import itertools
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import copy
import os
from collections import defaultdict
import torch.nn.functional as F
from sklearn.metrics import f1_score

data_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/data'
project_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon'

# Step 1: Load dataset

Previously in step1_data_preprocessing.ipynb, We have split the df_reviews dataset into training, testing and validation samples for each user, following chronological order and using the early 70% of each user's interactions for training, followed by the next 15% for validation and the last 15% for testing.

In [3]:
train_data = pd.read_csv(os.path.join(data_dir,"train_data.csv"))
test_data = pd.read_csv(os.path.join(data_dir,"test_data.csv"))
val_data = pd.read_csv(os.path.join(data_dir,"val_data.csv"))

df_reviews = pd.read_csv(os.path.join(data_dir,"filtered_reviews_with_features_and_clusters.csv"))
print(df_reviews.head())

   customer_id  product_id  product_parent  \
0        11960  B00LCJAW06       219600481   
1        11960  B008OTSEXY       682436048   
2        11960  B00KJ15KGY        32170248   
3        11960  B008ZL49WQ       614364353   
4        11960  B002WRGE5O       928204157   

                                       product_title product_category  \
0  Persian-Rugs T1007 Abstract Modern Area Rug Ca...        Furniture   
1  Flash Furniture High Back Black Ribbed Upholst...        Furniture   
2  Jackson Pollock Inspired Coffee Glass Table w/...        Furniture   
3                                  Eaze Lounge Chair        Furniture   
4         Walker Edison L-Shaped Glass Computer Desk        Furniture   

   star_rating  helpful_votes  total_votes vine verified_purchase  ...  \
0            4              1            1    N                 Y  ...   
1            4              0            0    N                 Y  ...   
2            4              1            1    N               

# Filter out Customers and Products in test and val set that do not appear in training set

Prevent Cold Start problems during validation and testing. If a customer or product appears only in the validation or test set appears only in the validataion or test set, the model has never seen it before and cannot generate a valid prediction. Thus we will remove any rows in the validation or test sets which does not belong to any user in training set or the product is absent in the training set.

We do not remove the rows from the training set.

In [4]:
unique_customers_train = set(train_data['customer_id'].unique())
unique_products_train = set(train_data['product_id'].unique())

val_data = val_data[val_data['customer_id'].isin(unique_customers_train) &
                    val_data['product_id'].isin(unique_products_train)].reset_index(drop=True)

test_data = test_data[test_data['customer_id'].isin(unique_customers_train) &
                      test_data['product_id'].isin(unique_products_train)].reset_index(drop=True)

print(val_data.shape)
print(test_data.shape)

(17174, 26)
(32880, 26)


# Create Data Loader

In [5]:
# Create ID to index mappings
user2idx = {user_id: idx for idx, user_id in enumerate(train_data['customer_id'].unique())}
item2idx = {item_id: idx for idx, item_id in enumerate(train_data['product_id'].unique())}

# Map to new columns
train_data['user_idx'] = train_data['customer_id'].map(user2idx)
train_data['item_idx'] = train_data['product_id'].map(item2idx)
val_data['user_idx'] = val_data['customer_id'].map(user2idx)
val_data['item_idx'] = val_data['product_id'].map(item2idx)
test_data['user_idx'] = test_data['customer_id'].map(user2idx)
test_data['item_idx'] = test_data['product_id'].map(item2idx)


class ReviewsDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        return {
            'customer_id': torch.tensor(row['user_idx'], dtype=torch.long),
            'product_id': torch.tensor(row['item_idx'], dtype=torch.long),
            'rating': torch.tensor(row['star_rating'], dtype=torch.float)
        }

# Evaluation Functions

- **ndcg_at_k**: Computes the Normalized Discounted Cumulative Gain (NDCG) at rank k for a single list of relevance. If the list contains fewer than k items, it will use actual_k = min(k, len(relevances)) to ensure fair computation.

- **mean_ndcg_user_at_k**: Computes the mean NDCG@k across all users by grouping predicted scores and relevance labels per user, sorting by prediction, and applying ndcg_at_k. For each user, their items are sorted by predicted scores, and NDCG is computed using `ndcg_at_k` with actual_k = min(k, len(user_items)).

- **mean_precision_user_at_k**: Computes the mean Precision@k across all users.
Precision@k is the proportion of relevant items (e.g., rating ≥ threshold) among the top-k predicted items for each user. For each user, top-k items are selected based on predicted scores. If the user has fewer than k items, actual_k = min(k, len(user_items)) is used.  
  Precision is calculated as:  
  `precision = (# of relevant items among top-k) / actual_k`  
  where an item is considered relevant if `rating ≥ threshold`.

- **mean_recall_user_at_k**: Computes the mean Recall@k across all users.
Recall@k is the proportion of a user's relevant items (rating ≥ threshold) that are retrieved in the top-k predicted list. For each user, top-k items are selected based on predicted scores, and recall is calculated as:  
  `recall = (# of relevant items among top-k) / total number of relevant items for the user`  
  actual_k = min(k, len(user_items)) is used to handle users with fewer than k items.

- **mean_f1_user_at_k**:  
  Computes the mean F1@k across all users, where F1 combines precision and recall.  
  For each user, top-k items are selected (using actual_k = min(k, len(user_items))), and F1 is calculated based on binarized relevance labels (`rating ≥ threshold`).  
  The predicted labels are assumed to be all 1s (e.g top-k are predicted as relevant).

In [6]:
def ndcg_at_k(relevances, k):
    relevances = np.asarray(relevances, dtype=np.float64)
    actual_k = min(k, len(relevances))
    if actual_k == 0:
        return 0.0
    relevances = relevances[:actual_k]
    dcg = np.sum((2 ** relevances - 1) / np.log2(np.arange(2, actual_k + 2)))
    ideal_relevances = np.sort(relevances)[::-1]
    idcg = np.sum((2 ** ideal_relevances - 1) / np.log2(np.arange(2, actual_k + 2)))
    return dcg / idcg if idcg > 0 else 0.0

def mean_ndcg_user_at_k(all_users, all_preds, all_labels, k=10):
    user_data = defaultdict(list)
    for u, pred, rel in zip(all_users, all_preds, all_labels):
        user_data[u].append((pred, rel))
    ndcg_list = []
    for entries in user_data.values():
        entries_sorted = sorted(entries, key=lambda x: x[0], reverse=True)
        relevances = [rel for _, rel in entries_sorted]
        ndcg_list.append(ndcg_at_k(relevances, k))
    return np.mean(ndcg_list) if ndcg_list else 0.0

def mean_precision_user_at_k(all_users, all_preds, all_labels, k=10, threshold=4):
    user_data = defaultdict(list)
    for u, pred, label in zip(all_users, all_preds, all_labels):
        user_data[u].append((pred, label))

    precision_list = []
    for entries in user_data.values():
        entries_sorted = sorted(entries, key=lambda x: x[0], reverse=True)
        actual_k = min(k, len(entries_sorted))
        top_k = entries_sorted[:actual_k]
        rels = [1 if r >= threshold else 0 for _, r in top_k]
        precision_list.append(np.sum(rels) / actual_k if actual_k > 0 else 0)
    return np.mean(precision_list) if precision_list else 0.0

def mean_recall_user_at_k(all_users, all_preds, all_labels, k=10, threshold=4):
    user_data = defaultdict(list)
    for u, pred, label in zip(all_users, all_preds, all_labels):
        user_data[u].append((pred, label))

    recall_list = []
    for entries in user_data.values():
        entries_sorted = sorted(entries, key=lambda x: x[0], reverse=True)
        actual_k = min(k, len(entries_sorted))
        top_k = entries_sorted[:actual_k]

        all_rels = [1 if r >= threshold else 0 for _, r in entries]
        top_k_rels = [1 if r >= threshold else 0 for _, r in top_k]
        total_relevant = np.sum(all_rels)

        if total_relevant == 0:
            recall = 0.0
        else:
            recall = np.sum(top_k_rels) / total_relevant
        recall_list.append(recall)
    return np.mean(recall_list) if recall_list else 0.0

def mean_f1_user_at_k(all_users, all_preds, all_labels, k=10, threshold=4):
    user_data = defaultdict(list)
    for u, p, l in zip(all_users, all_preds, all_labels):
        user_data[u].append((p, l))

    f1_list = []
    for entries in user_data.values():
        entries_sorted = sorted(entries, key=lambda x: x[0], reverse=True)
        actual_k = min(k, len(entries_sorted))
        y_true = [int(l >= threshold) for _, l in entries_sorted[:actual_k]]
        y_pred = [1] * actual_k
        f1_list.append(f1_score(y_true, y_pred, zero_division=0))
    return np.mean(f1_list) if f1_list else 0.0

# Define the NCF model with GMF and MLP

NCF class implements a Neural Collaborative Filtering model combining:

- **GMF (Generalized Matrix Factorization)**: Element-wise product of user and item embeddings

- **MLP (Multi-Layer Perceptron)**: Concatenated embeddings passed through FC layers

- **Final prediction**: Merges GMF and MLP outputs to produce a predicted rating (1 to 5 scale)

## Aggregate the outputs of GMF and MLP by **concatenation**

In [7]:
class NCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, dropout_rate=0.3):
        super(NCF, self).__init__()
        # Randomly initialized embedding layers
        self.customer_embeddings_gmf = nn.Embedding(num_users, embedding_dim)
        self.product_embeddings_gmf = nn.Embedding(num_items, embedding_dim)

        self.customer_embeddings_mlp = nn.Embedding(num_users, embedding_dim)
        self.product_embeddings_mlp = nn.Embedding(num_items, embedding_dim)

        self.fc1_mlp = nn.Linear(2 * embedding_dim, 128)
        self.bn1_mlp = nn.BatchNorm1d(128)
        self.dropout1_mlp = nn.Dropout(dropout_rate)

        self.fc2_mlp = nn.Linear(128, 64)
        self.bn2_mlp = nn.BatchNorm1d(64)
        self.dropout2_mlp = nn.Dropout(dropout_rate)

        self.fc1_combined = nn.Linear(embedding_dim + 64, 128)
        self.bn1_combined = nn.BatchNorm1d(128)
        self.dropout1_combined = nn.Dropout(dropout_rate)

        self.fc2_combined = nn.Linear(128, 1)

    def forward(self, customer_id, product_id):
        customer_emb_gmf = self.customer_embeddings_gmf(customer_id)
        product_emb_gmf = self.product_embeddings_gmf(product_id)
        gmf_output = customer_emb_gmf * product_emb_gmf

        customer_emb_mlp = self.customer_embeddings_mlp(customer_id)
        product_emb_mlp = self.product_embeddings_mlp(product_id)
        mlp_input = torch.cat([customer_emb_mlp, product_emb_mlp], dim=-1)

        mlp_output = F.relu(self.bn1_mlp(self.fc1_mlp(mlp_input)))
        mlp_output = self.dropout1_mlp(mlp_output)
        mlp_output = F.relu(self.bn2_mlp(self.fc2_mlp(mlp_output)))
        mlp_output = self.dropout2_mlp(mlp_output)

        combined_input = torch.cat([gmf_output, mlp_output], dim=-1)
        combined_output = F.relu(self.bn1_combined(self.fc1_combined(combined_input)))
        combined_output = self.dropout1_combined(combined_output)

        output = self.fc2_combined(combined_output)
        return output.squeeze() * 4 + 1

# Model Training




### Grid Search

The grid search algorithm here will perform an exhaustive search to identify the best combination of hyperparameters (embedding_dim, learning_rate, batch_size, dropout_rate, num_epoches) for training the NCF model.

For each configuration:

1. A new NCF model is instantiated with the configuration parameters.

2. The model is trained on the training set and evaluated on the validation set.

3. The best model state (with lowest validation loss) is stored using early stopping.

4. The configuration and model weights are saved if it performs better than all previous configurations.

It will then report the best-performing configuration which we will use to train the final model on the combined training and validation data before evaluating it on our test data.



# NCF Model Trained on Full Dataset
The functions that we will be using are:
1. **train_full_model** : Trains one NCF model on the full dataset. It returns the trained model, best validation loss and the best model state

2. **grid_search_full_data** : Performs grid search on the full dataset and returns the best hyperparameter configuration and the best model state.

3. **evaluate_model**: Evaluates a trained NCF model on the test set.




In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_full_model(train_df, val_df, config, model_state=None):
    loader = lambda df: DataLoader(ReviewsDataset(df), batch_size=config['batch_size'], shuffle=True)
    train_loader, val_loader = loader(train_df), loader(val_df)
    num_users = train_df['customer_id'].nunique()
    num_items = train_df['product_id'].nunique()
    model = NCF(num_users, num_items, config['embedding_dim'], config['dropout_rate']).to(device)
    if model_state: model.load_state_dict(model_state)
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
    criterion = nn.MSELoss()

    best_loss, best_state, patience_counter = float('inf'), None, 0
    metrics = {}
    for epoch in range(config['num_epochs']):
        model.train()
        for batch in train_loader:
            user = batch['customer_id'].to(device)
            item = batch['product_id'].to(device)
            label = batch['rating'].to(device)

            optimizer.zero_grad()
            preds = model(user, item)
            loss = criterion(preds, label)
            loss.backward()
            optimizer.step()
        model.eval()
        val_loss = 0
        with torch.no_grad():
            val_all_preds = []
            val_all_labels = []
            val_all_users = []
            for batch in val_loader:
                user = batch['customer_id'].to(device)
                item = batch['product_id'].to(device)
                label = batch['rating'].to(device)
                preds = model(user, item)
                output = preds.squeeze().cpu().numpy()
                val_loss += criterion(preds, label).item()
                val_all_preds.extend(output)
                val_all_labels.extend(label.cpu().numpy())
                val_all_users.extend(user.cpu().numpy())

        val_all_preds = np.array(val_all_preds)
        val_all_labels = np.array(val_all_labels)
        val_all_users = np.array(val_all_users)

        rmse_val = np.sqrt(np.mean((val_all_preds - val_all_labels)**2))
        ndcg_val = mean_ndcg_user_at_k(val_all_users, val_all_preds, val_all_labels, k=10)
        precision_val = mean_precision_user_at_k(val_all_users, val_all_preds, val_all_labels, k=10, threshold=4)
        recall_val = mean_recall_user_at_k(val_all_users, val_all_preds, val_all_labels, k=10, threshold=4)
        f1_val = mean_f1_user_at_k(val_all_users, val_all_preds, val_all_labels, k=10, threshold=4)

        val_loss /= len(val_loader)
        if val_loss < best_loss:
            best_loss, best_state, patience_counter = val_loss, copy.deepcopy(model.state_dict()), 0
            metrics = {
                'rmse': rmse_val,
                'mse': val_loss,
                'ndcg@10': ndcg_val,
                'precision@10': precision_val,
                'recall@10': recall_val,
                'f1@10': f1_val
            }
        else:
            patience_counter += 1
            if patience_counter >= config.get('patience', 5):
              break
    model.load_state_dict(best_state)
    return model, best_state, metrics

def grid_search_full_data(train_data, val_data, param_grid):
    combos = list(itertools.product(*param_grid.values()))
    print("======================= GRID SEARCH =======================")
    best_config, best_loss, best_state, best_metrics = None, float('inf'), None, {}
    for vals in combos:
        config = dict(zip(param_grid.keys(), vals))
        print("Running Config:",config)
        model, state, metrics = train_full_model(train_data, val_data, config)
        val_loss = metrics['mse']
        if val_loss < best_loss:
            best_config, best_loss, best_state, best_metrics = config, val_loss, state, metrics
    return best_config, best_state, best_metrics

def evaluate_model(model, test_data, batch_size=512):
    model.eval()
    loader = DataLoader(ReviewsDataset(test_data), batch_size=batch_size)
    criterion = nn.MSELoss()
    test_loss = 0

    test_all_preds = []
    test_all_labels = []
    test_all_users = []

    with torch.no_grad():
        for batch in loader:
            user = batch['customer_id'].to(device)
            item = batch['product_id'].to(device)
            label = batch['rating'].to(device)
            preds = model(user, item)
            loss = criterion(preds, label)
            test_loss += loss.item()

            test_all_preds.extend(preds.squeeze().cpu().numpy())
            test_all_labels.extend(label.cpu().numpy())
            test_all_users.extend(user.cpu().numpy())
    test_loss /= len(loader)
    # Convert lists to numpy arrays
    test_all_preds = np.array(test_all_preds)
    test_all_labels = np.array(test_all_labels)
    test_all_users = np.array(test_all_users)

    # Compute evaluation metrics
    rmse_test = np.sqrt(np.mean((test_all_preds - test_all_labels)**2))
    ndcg_test = mean_ndcg_user_at_k(test_all_users, test_all_preds, test_all_labels, k=10)
    precision_test = mean_precision_user_at_k(test_all_users, test_all_preds, test_all_labels, k=10, threshold=4)
    recall_test = mean_recall_user_at_k(test_all_users, test_all_preds, test_all_labels, k=10, threshold=4)
    f1_test = mean_f1_user_at_k(test_all_users, test_all_preds, test_all_labels, k=10, threshold=4)

    test_metrics = {
        'rmse': rmse_test,
        'mse': test_loss,
        'ndcg@10': ndcg_test,
        'precision@10': precision_test,
        'recall@10': recall_test,
        'f1@10': f1_test
    }

    return test_metrics

### Final Model Training

We will retrain the NCF model using the optimal hyperparameters identified through Grid Search, this time on the combined training and validation data. Lastly, the model is then evaluated on the test data.

In [9]:
# Define hyperparameter grid
param_grid = {
    'embedding_dim': [16, 64],          # keep both for low vs high capacity
    'learning_rate': [0.001],           # pick one reliable value
    'batch_size': [128, 512],           # small vs large batch
    'dropout_rate': [0.0, 0.3],         # low vs regular dropout
    'num_epochs': [20, 40]              # moderate vs longer training
}

def run_final_pipeline_full_data(train_data, val_data, test_data, param_grid):
    output_path = os.path.join(project_dir, "Model Results/NCF Random Embedding/Full Model/results_full_dataset.txt")
    with open(output_path, "w") as f:
        best_config, best_state, best_metrics = grid_search_full_data(train_data, val_data, param_grid)

        print("======Validation Metrics For Full Dataset======\n")
        print(f"MSE: {best_metrics['mse']:.4f}, RMSE: {best_metrics['rmse']:.4f}, NDCG@10: {best_metrics['ndcg@10']}, Precision@10: {best_metrics['precision@10']}, Recall@10: {best_metrics['recall@10']}, F1@10: {best_metrics['f1@10']}\n")
        eval_results = best_metrics
        eval_results.update(best_config)
        df_val_results = pd.DataFrame([eval_results])
        df_val_results.to_csv(os.path.join(project_dir,"Model Results/NCF Random Embedding/Full Model/val_results_full_dataset.csv"), index=False)

        train_val_data = pd.concat([train_data, val_data]).reset_index(drop=True)
        final_model, _, _ = train_full_model(train_val_data, val_data, best_config, best_state)
        test_metrics = evaluate_model(final_model, test_data)

        test_results = {}
        test_results.update(test_metrics)
        test_results.update(best_config)
        df_test_results = pd.DataFrame([test_results])
        df_test_results.to_csv(os.path.join(project_dir,"Model Results/NCF Random Embedding/Full Model/test_results_full_dataset.csv"), index=False)

        print("===== Test Results For Full Dataset======\n")
        print(f"MSE: {test_metrics['mse']:.4f}, RMSE: {test_metrics['rmse']:.4f}, NDCG@10: {test_metrics['ndcg@10']:.4f}, Precision@10: {test_metrics['precision@10']:.4f}, Recall@10: {test_metrics['recall@10']:.4f}, F1@10: {test_metrics['f1@10']}")


        f.write(f"======Validation Metrics For Full Dataset======\n")
        f.write(f"MSE: {best_metrics['mse']:.4f}, RMSE: {best_metrics['rmse']:.4f}, NDCG@10: {best_metrics['ndcg@10']}, Precision@10: {best_metrics['precision@10']}, Recall@10: {best_metrics['recall@10']}, F1@10: {best_metrics['f1@10']}\n")
        f.write(f"===== Test Results For Full Dataset======\n")
        f.write(f"Full Dataset Test:  MSE: {test_metrics['mse']:.4f}, RMSE: {test_metrics['rmse']:.4f}, NDCG@10: {test_metrics['ndcg@10']:.4f}, Precision@10: {test_metrics['precision@10']:.4f}, Recall@10: {test_metrics['recall@10']:.4f}, F1@10: {test_metrics['f1@10']}\n")

In [10]:
run_final_pipeline_full_data(train_data, val_data, test_data, param_grid)

Running Config: {'embedding_dim': 16, 'learning_rate': 0.001, 'batch_size': 128, 'dropout_rate': 0.0, 'num_epochs': 20}
Running Config: {'embedding_dim': 16, 'learning_rate': 0.001, 'batch_size': 128, 'dropout_rate': 0.0, 'num_epochs': 40}
Running Config: {'embedding_dim': 16, 'learning_rate': 0.001, 'batch_size': 128, 'dropout_rate': 0.3, 'num_epochs': 20}
Running Config: {'embedding_dim': 16, 'learning_rate': 0.001, 'batch_size': 128, 'dropout_rate': 0.3, 'num_epochs': 40}
Running Config: {'embedding_dim': 16, 'learning_rate': 0.001, 'batch_size': 512, 'dropout_rate': 0.0, 'num_epochs': 20}
Running Config: {'embedding_dim': 16, 'learning_rate': 0.001, 'batch_size': 512, 'dropout_rate': 0.0, 'num_epochs': 40}
Running Config: {'embedding_dim': 16, 'learning_rate': 0.001, 'batch_size': 512, 'dropout_rate': 0.3, 'num_epochs': 20}
Running Config: {'embedding_dim': 16, 'learning_rate': 0.001, 'batch_size': 512, 'dropout_rate': 0.3, 'num_epochs': 40}
Running Config: {'embedding_dim': 64, 'l

# Check Stored Validation and Testing Results

In [11]:
#step5_2_1- NCF Model: Random Embeddings Full Dataset
results_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/Model Results'

rand_emb_full_data_val_results = pd.read_csv(os.path.join(results_dir,"NCF Random Embedding/Full Model/val_results_full_dataset.csv"))
print(rand_emb_full_data_val_results.shape)
print(rand_emb_full_data_val_results.head())


rand_emb_by_full_data_test_results = pd.read_csv(os.path.join(results_dir,"NCF Random Embedding/Full Model/test_results_full_dataset.csv"))
print(rand_emb_by_full_data_test_results.shape)
print(rand_emb_by_full_data_test_results.head())

(1, 11)
       rmse       mse  ndcg@10  precision@10  recall@10     f1@10  \
0  0.989009  0.975591  0.99579      0.867375   0.876889  0.870588   

   embedding_dim  learning_rate  batch_size  dropout_rate  num_epochs  
0             64          0.001         128           0.3          40  
(1, 11)
       rmse       mse   ndcg@10  precision@10  recall@10     f1@10  \
0  1.078569  1.162062  0.971801      0.859677   0.934431  0.884957   

   embedding_dim  learning_rate  batch_size  dropout_rate  num_epochs  
0             64          0.001         128           0.3          40  
