# Neural Collaborative Filtering with Custom Embeddings

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import random
import copy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tqdm import tqdm
from collections import defaultdict
from sklearn.metrics import f1_score

# Build Custom Customer and Product Embedding
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

data_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/data'
project_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon'

# Step 1: Load dataset

Previously in step1_data_preprocessing.ipynb, We have split the df_reviews dataset into training, testing and validation samples for each user, following chronological order and using the early 70% of each user's interactions for training, followed by the next 15% for validation and the last 15% for testing.

In [None]:
train_data = pd.read_csv(os.path.join(data_dir,"train_data.csv"))
test_data = pd.read_csv(os.path.join(data_dir,"test_data.csv"))
val_data = pd.read_csv(os.path.join(data_dir,"val_data.csv"))

df_reviews = pd.read_csv(os.path.join(data_dir,"filtered_reviews_with_features_and_clusters.csv"))
print(df_reviews.head())

print(f"Training Data Shape: {train_data.shape}")
print(f"Testing Data Shape: {test_data.shape}")
print(f"Validation Data Shape: {val_data.shape}")

print(train_data.columns)
print(test_data.columns)

print(df_reviews['sentiments'].value_counts())


   customer_id  product_id  product_parent  \
0        11960  B00LCJAW06       219600481   
1        11960  B008OTSEXY       682436048   
2        11960  B00KJ15KGY        32170248   
3        11960  B008ZL49WQ       614364353   
4        11960  B002WRGE5O       928204157   

                                       product_title product_category  \
0  Persian-Rugs T1007 Abstract Modern Area Rug Ca...        Furniture   
1  Flash Furniture High Back Black Ribbed Upholst...        Furniture   
2  Jackson Pollock Inspired Coffee Glass Table w/...        Furniture   
3                                  Eaze Lounge Chair        Furniture   
4         Walker Edison L-Shaped Glass Computer Desk        Furniture   

   star_rating  helpful_votes  total_votes vine verified_purchase  ...  \
0            4              1            1    N                 Y  ...   
1            4              0            0    N                 Y  ...   
2            4              1            1    N               

# Filter out Customers and Products in test and val set that do not appear in training set

Prevent Cold Start problems during validation and testing. If a customer or product appears only in the validation or test set appears only in the validataion or test set, the model has never seen it before and cannot generate a valid prediction. Thus we will remove any rows in the validation or test sets which does not belong to any user in training set or the product is absent in the training set.

We do not remove the rows from the training set.

In [None]:
unique_customers_train = set(train_data['customer_id'].unique())
unique_products_train = set(train_data['product_id'].unique())

val_data = val_data[val_data['customer_id'].isin(unique_customers_train) &
                    val_data['product_id'].isin(unique_products_train)].reset_index(drop=True)

test_data = test_data[test_data['customer_id'].isin(unique_customers_train) &
                      test_data['product_id'].isin(unique_products_train)].reset_index(drop=True)

print(val_data.shape)
print(test_data.shape)

(17174, 26)
(32880, 26)


# Create Data Loader

In [None]:
# Create ID to index mappings
user2idx = {user_id: idx for idx, user_id in enumerate(train_data['customer_id'].unique())}
item2idx = {item_id: idx for idx, item_id in enumerate(train_data['product_id'].unique())}

# Map to new columns
train_data['user_idx'] = train_data['customer_id'].map(user2idx)
train_data['item_idx'] = train_data['product_id'].map(item2idx)
val_data['user_idx'] = val_data['customer_id'].map(user2idx)
val_data['item_idx'] = val_data['product_id'].map(item2idx)
test_data['user_idx'] = test_data['customer_id'].map(user2idx)
test_data['item_idx'] = test_data['product_id'].map(item2idx)

class ReviewsDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        return {
            'customer_id': torch.tensor(row['user_idx'], dtype=torch.long),
            'product_id': torch.tensor(row['item_idx'], dtype=torch.long),
            'rating': torch.tensor(row['star_rating'], dtype=torch.float)
        }

# Create DataLoader
train_dataset = ReviewsDataset(train_data)
test_dataset = ReviewsDataset(test_data)
val_dataset = ReviewsDataset(val_data)



# Building Customer Embeddings

The original **df_reviews** will be used to build the custom customer and product embeddings. These custom embeddings are meant to reflect historical behaviour or characteristics of customers/products

The customer embeddings will constructed by aggregating features within df_reviews by customer_id to find a customer's:
 - Purchase Frequency (Indicate how active a customer is)
 - Time Since Last Purchase (Indicate how active a customer is)
 - Average Star Rating (Overall Customer satisfaction across all of his purchases)
 - Total Vine Reviews (Measure of Credibility of his Reviews)
 - Total Helpful Votes (Measure the Credibility of his Reviews)
 - Total Votes (Measure the Credibility of his Reviews)
 - Average Sentiment (Overall Customer satisfaction across all of his purchases

These embeddings are more informative than a randomly intialized embedding in typical recommnedation systems. The choice of specific features injects domain knowledge into the model.



In [None]:
def build_customer_embeddings(df_reviews, embedding_dim):
    cache_path = os.path.join(project_dir, f"Model Results/NCF Custom Embedding/Full Model/cache/cust_emb_{embedding_dim}.csv")
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    if os.path.exists(cache_path):
        #print(f"Using cached customer embeddings from {cache_path}")
        return pd.read_csv(cache_path, index_col="customer_id")

    agg = df_reviews.groupby('customer_id').agg({
        'monthly_purchase_frequency': 'mean',
        'time_since_last_purchase': 'mean',
        'star_rating': 'mean',
        'vine': lambda x: (x == "Y").sum(),
        'helpful_votes': 'sum',
        'total_votes': 'sum',
        'sentiments': lambda x: (x == 'positive').mean()
    }).fillna(0).reset_index()

    cust_ids = agg['customer_id']
    X = StandardScaler().fit_transform(agg.drop(columns='customer_id'))
    num_features = X.shape[1]

    if embedding_dim > num_features:
        raise ValueError(f"Requested embedding_dim={embedding_dim}, but only {num_features} features available.")

    if embedding_dim < num_features:
        pca = PCA(n_components=embedding_dim)
        X = pca.fit_transform(X)
        if (pca.explained_variance_ratio_ > 1e-6).sum() < embedding_dim:
            raise ValueError(f"PCA found fewer than {embedding_dim} meaningful components.")

    df = pd.DataFrame(X, index=cust_ids)
    df.index.name = 'customer_id'
    df.to_csv(cache_path)
    return df

## Building Product Embeddings

The product embeddings will constructed by aggregating features within df_reviews by product_id to find a product's:
- Mean Star Rating that it received (Customer satisfaction)
- Total Helpful Votes given to all its reviews (Quality of customer feedback)
- Total Votes given to all its reviews (Review Engagement by customers)
- Average sentiment (1 is Positive and 0 is Negative)
- Total Number of Vine Reviews (Number of Credible Reviews)
- Total Product Sales (Demand for Product)



In [None]:
def build_product_embeddings(df_reviews, embedding_dim):
    cache_path = os.path.join(project_dir, f"Model Results/NCF Custom Embedding/Full Model/cache/prod_emb_{embedding_dim}.csv")
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    if os.path.exists(cache_path):
        #print(f"Using cached product embeddings from {cache_path}")
        return pd.read_csv(cache_path, index_col="product_id")

    agg = df_reviews.groupby('product_id').agg({
        'star_rating': 'mean',
        'helpful_votes': 'sum',
        'total_votes': 'sum',
        'sentiments': lambda x: (x == 'positive').mean(),
        'vine': lambda x: (x == 'Y').sum(),
        'product_id': 'count'  # will be renamed
    }).rename(columns={'product_id': 'sales_volume'}).fillna(0).reset_index()

    prod_ids = agg['product_id']
    X = StandardScaler().fit_transform(agg.drop(columns='product_id'))
    num_features = X.shape[1]

    if embedding_dim > num_features:
        raise ValueError(f"Requested embedding_dim={embedding_dim}, but only {num_features} features available.")

    if embedding_dim < num_features:
        pca = PCA(n_components=embedding_dim)
        X = pca.fit_transform(X)
        if (pca.explained_variance_ratio_ > 1e-6).sum() < embedding_dim:
            raise ValueError(f"PCA found fewer than {embedding_dim} meaningful components.")

    df = pd.DataFrame(X, index=prod_ids)
    df.index.name = 'product_id'
    df.to_csv(cache_path)
    return df


# Define the NCF model with GMF and MLP

NCF class implements a Neural Collaborative Filtering model combining:

- **GMF (Generalized Matrix Factorization)**: Element-wise product of user and item embeddings

- **MLP (Multi-Layer Perceptron)**: Concatenated embeddings passed through FC layers

- **Final prediction**: Merges GMF and MLP outputs to produce a predicted rating (1 to 5 scale)

In [None]:
class NCF(nn.Module):
    def __init__(self, customer_embedding_matrix_gmf, product_embedding_matrix_gmf,
                 customer_embedding_matrix_mlp, product_embedding_matrix_mlp, embedding_dim, dropout_rate = 0.3):
        super(NCF, self).__init__()
        # GMF Components
        self.customer_embeddings_gmf = nn.Embedding.from_pretrained(customer_embedding_matrix_gmf, freeze=False)
        self.product_embeddings_gmf = nn.Embedding.from_pretrained(product_embedding_matrix_gmf, freeze=False)

        # MLP Components
        self.customer_embeddings_mlp = nn.Embedding.from_pretrained(customer_embedding_matrix_mlp, freeze=False)
        self.product_embeddings_mlp = nn.Embedding.from_pretrained(product_embedding_matrix_mlp, freeze=False)

        self.fc1_mlp = nn.Linear(2 * embedding_dim, 128)
        self.bn1_mlp = nn.BatchNorm1d(128)
        self.dropout1_mlp = nn.Dropout(dropout_rate)

        self.fc2_mlp = nn.Linear(128, 64)
        self.bn2_mlp = nn.BatchNorm1d(64)
        self.dropout2_mlp = nn.Dropout(dropout_rate)

        # Final layers
        self.fc1_combined = nn.Linear(embedding_dim + 64, 128)
        self.bn1_combined = nn.BatchNorm1d(128)
        self.dropout1_combined = nn.Dropout(dropout_rate)

        self.fc2_combined = nn.Linear(128, 1)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, customer_id, product_id):
        # GMF
        customer_emb_gmf = self.customer_embeddings_gmf(customer_id)
        product_emb_gmf = self.product_embeddings_gmf(product_id)
        gmf_output = customer_emb_gmf * product_emb_gmf

        # MLP
        customer_emb_mlp = self.customer_embeddings_mlp(customer_id)
        product_emb_mlp = self.product_embeddings_mlp(product_id)
        mlp_input = torch.cat([customer_emb_mlp, product_emb_mlp], dim=-1)

        mlp_output = F.relu(self.bn1_mlp(self.fc1_mlp(mlp_input)))
        mlp_output = self.dropout1_mlp(mlp_output)

        mlp_output = F.relu(self.bn2_mlp(self.fc2_mlp(mlp_output)))
        mlp_output = self.dropout2_mlp(mlp_output)

        # Combine GMF and MLP
        combined_input = torch.cat([gmf_output, mlp_output], dim=-1)
        combined_output = F.relu(self.bn1_combined(self.fc1_combined(combined_input)))
        combined_output = self.dropout1_combined(combined_output)

        # Final layer & output scaling (1–5 range)
        output = self.fc2_combined(combined_output)
        return output.squeeze() * 4 + 1

# NCF Model Trained on Full Dataset




# Evaluation Functions

- **ndcg_at_k**: Computes the Normalized Discounted Cumulative Gain (NDCG) at rank k for a single list of relevance. If the list contains fewer than k items, it will use actual_k = min(k, len(relevances)) to ensure fair computation.

- **mean_ndcg_user_at_k**: Computes the mean NDCG@k across all users by grouping predicted scores and relevance labels per user, sorting by prediction, and applying ndcg_at_k. For each user, their items are sorted by predicted scores, and NDCG is computed using `ndcg_at_k` with actual_k = min(k, len(user_items)).

- **mean_precision_user_at_k**: Computes the mean Precision@k across all users.
Precision@k is the proportion of relevant items (e.g., rating ≥ threshold) among the top-k predicted items for each user. For each user, top-k items are selected based on predicted scores. If the user has fewer than k items, actual_k = min(k, len(user_items)) is used.  
  Precision is calculated as:  
  `precision = (# of relevant items among top-k) / actual_k`  
  where an item is considered relevant if `rating ≥ threshold`.

- **mean_recall_user_at_k**: Computes the mean Recall@k across all users.
Recall@k is the proportion of a user's relevant items (rating ≥ threshold) that are retrieved in the top-k predicted list. For each user, top-k items are selected based on predicted scores, and recall is calculated as:  
  `recall = (# of relevant items among top-k) / total number of relevant items for the user`  
  actual_k = min(k, len(user_items)) is used to handle users with fewer than k items.

- **mean_f1_user_at_k**:  
  Computes the mean F1@k across all users, where F1 combines precision and recall.  
  For each user, top-k items are selected (using actual_k = min(k, len(user_items))), and F1 is calculated based on binarized relevance labels (`rating ≥ threshold`).  
  The predicted labels are assumed to be all 1s (e.g top-k are predicted as relevant).

In [None]:
def ndcg_at_k(relevances, k):
    relevances = np.asarray(relevances, dtype=np.float64)
    actual_k = min(k, len(relevances))
    if actual_k == 0:
        return 0.0
    relevances = relevances[:actual_k]
    dcg = np.sum((2 ** relevances - 1) / np.log2(np.arange(2, actual_k + 2)))
    ideal_relevances = np.sort(relevances)[::-1]
    idcg = np.sum((2 ** ideal_relevances - 1) / np.log2(np.arange(2, actual_k + 2)))
    return dcg / idcg if idcg > 0 else 0.0

def mean_ndcg_user_at_k(all_users, all_preds, all_labels, k=10):
    user_data = defaultdict(list)
    for u, pred, rel in zip(all_users, all_preds, all_labels):
        user_data[u].append((pred, rel))
    ndcg_list = []
    for entries in user_data.values():
        entries_sorted = sorted(entries, key=lambda x: x[0], reverse=True)
        relevances = [rel for _, rel in entries_sorted]
        ndcg_list.append(ndcg_at_k(relevances, k))
    return np.mean(ndcg_list) if ndcg_list else 0.0

def mean_precision_user_at_k(all_users, all_preds, all_labels, k=10, threshold=4):
    user_data = defaultdict(list)
    for u, pred, label in zip(all_users, all_preds, all_labels):
        user_data[u].append((pred, label))

    precision_list = []
    for entries in user_data.values():
        entries_sorted = sorted(entries, key=lambda x: x[0], reverse=True)
        actual_k = min(k, len(entries_sorted))
        top_k = entries_sorted[:actual_k]
        rels = [1 if r >= threshold else 0 for _, r in top_k]
        precision_list.append(np.sum(rels) / actual_k if actual_k > 0 else 0)
    return np.mean(precision_list) if precision_list else 0.0

def mean_recall_user_at_k(all_users, all_preds, all_labels, k=10, threshold=4):
    user_data = defaultdict(list)
    for u, pred, label in zip(all_users, all_preds, all_labels):
        user_data[u].append((pred, label))

    recall_list = []
    for entries in user_data.values():
        entries_sorted = sorted(entries, key=lambda x: x[0], reverse=True)
        actual_k = min(k, len(entries_sorted))
        top_k = entries_sorted[:actual_k]

        all_rels = [1 if r >= threshold else 0 for _, r in entries]
        top_k_rels = [1 if r >= threshold else 0 for _, r in top_k]
        total_relevant = np.sum(all_rels)

        if total_relevant == 0:
            recall = 0.0
        else:
            recall = np.sum(top_k_rels) / total_relevant
        recall_list.append(recall)
    return np.mean(recall_list) if recall_list else 0.0

def mean_f1_user_at_k(all_users, all_preds, all_labels, k=10, threshold=4):
    user_data = defaultdict(list)
    for u, p, l in zip(all_users, all_preds, all_labels):
        user_data[u].append((p, l))

    f1_list = []
    for entries in user_data.values():
        entries_sorted = sorted(entries, key=lambda x: x[0], reverse=True)
        actual_k = min(k, len(entries_sorted))
        y_true = [int(l >= threshold) for _, l in entries_sorted[:actual_k]]
        y_pred = [1] * actual_k
        f1_list.append(f1_score(y_true, y_pred, zero_division=0))
    return np.mean(f1_list) if f1_list else 0.0

# Grid Search

The grid search algorithm here will perform an exhaustive search to identify the best combination of hyperparamters (embedding_dim, learning_rate, batch_size, dropout_rate, num_epoches) for training the NCF model.

For each configuration:

1. Custom embeddings for users and products are generated using PCA on the training data based on the current embedding_dim.

2. A new NCF model is instantiated with the configuration parameters.

3. The model is trained on the training set and evaluated on the validation set.

4. The best model state (with lowest validation loss) is stored using early stopping.

5. The configuration and model weights are saved if it performs better than all previous configurations.

It will then report the best-performing configuratuon which we will use to train the final model on the combined training and validation data before evaluating it on our test data.


**Embeddings**

During grid search, both customer_embedding and prod_embedding is build using the train_data. The embedding_dim is changing in each iteration of grid search and PCA must be redone with each new embedding dimension.

Note: The maximum embedding_dim for Grid Search is limited by the number of features that I used to form the customer_embedding and product_embedding respectively.


In [None]:
def train_full_model_with_custom_embeddings(train_data, val_data, df_reviews, config, cache):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    embedding_dim = config['embedding_dim']
    key = f"train-{embedding_dim}"
    if key not in cache:
        # cust_emb = torch.tensor(build_customer_embeddings(df_reviews[df_reviews['customer_id'].isin(train_data['customer_id'])], embedding_dim).values, dtype=torch.float32)
        # prod_emb = torch.tensor(build_product_embeddings(df_reviews[df_reviews['product_id'].isin(train_data['product_id'])], embedding_dim).values, dtype=torch.float32)
        cust_emb = torch.tensor(build_customer_embeddings(train_data, embedding_dim).values, dtype=torch.float32)
        prod_emb = torch.tensor(build_product_embeddings(train_data, embedding_dim).values, dtype=torch.float32)
        cache[key] = (cust_emb, prod_emb)
    else:
        cust_emb, prod_emb = cache[key]

    model = NCF(cust_emb, prod_emb, cust_emb.clone(), prod_emb.clone(), embedding_dim=embedding_dim, dropout_rate=config['dropout_rate']).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
    criterion = nn.MSELoss()
    train_loader = DataLoader(ReviewsDataset(train_data), batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(ReviewsDataset(val_data), batch_size=config['batch_size'])

    best_val_loss = float('inf')
    best_model_state = None
    best_metrics = {}
    patience_counter = 0

    for epoch in range(config['num_epochs']):
        model.train()
        for batch in train_loader:
            user = batch['customer_id'].to(device)
            item = batch['product_id'].to(device)
            label = batch['rating'].to(device)

            optimizer.zero_grad()
            output = model(user, item)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()

        model.eval()
        val_loss, preds, labels, users = 0, [], [], []
        with torch.no_grad():
            for batch in val_loader:
                user = batch['customer_id'].to(device)
                item = batch['product_id'].to(device)
                label = batch['rating'].to(device)

                output = model(user, item)
                val_loss += criterion(output, label).item()

                preds.extend(output.detach().cpu().numpy())
                labels.extend(label.detach().cpu().numpy())
                users.extend(user.detach().cpu().numpy())

        val_loss /= len(val_loader)
        preds, labels, users = np.array(preds), np.array(labels), np.array(users)
        rmse = np.sqrt(np.mean((preds - labels) ** 2))
        ndcg = mean_ndcg_user_at_k(users, preds, labels, k=10)
        precision = mean_precision_user_at_k(users, preds, labels, k=10, threshold=4)
        recall = mean_recall_user_at_k(users, preds, labels, k=10, threshold=4)
        f1_val = mean_f1_user_at_k(users, preds, labels, k=10, threshold=4)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = copy.deepcopy(model.state_dict())
            best_metrics = {
                'mse': val_loss,
                'rmse': rmse,
                'ndcg@10': ndcg,
                'precision@10': precision,
                'recall@10': recall,
                'f1@10': f1_val
            }
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= config.get('patience', 5):
                break

    model.load_state_dict(best_model_state)
    return model, best_model_state, best_metrics

def evaluate_model(model, test_data, batch_size=512):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    criterion = nn.MSELoss()

    test_loader = DataLoader(ReviewsDataset(test_data), batch_size=batch_size)
    preds, labels, users = [], [], []
    total_loss = 0

    with torch.no_grad():
        for batch in test_loader:
            user = batch['customer_id'].to(device)
            item = batch['product_id'].to(device)
            label = batch['rating'].to(device)
            output = model(user, item)
            total_loss += criterion(output, label).item()

            preds.extend(output.cpu().numpy())
            labels.extend(label.cpu().numpy())
            users.extend(user.cpu().numpy())

    preds, labels, users = np.array(preds), np.array(labels), np.array(users)
    rmse = np.sqrt(np.mean((preds - labels) ** 2))
    ndcg = mean_ndcg_user_at_k(users, preds, labels, k=10)
    precision = mean_precision_user_at_k(users, preds, labels, k=10, threshold=4)
    recall = mean_recall_user_at_k(users, preds, labels, k=10, threshold=4)
    f1_val = mean_f1_user_at_k(users, preds, labels, k=10, threshold=4)

    return {
        'mse': total_loss / len(test_loader),
        'rmse': rmse,
        'ndcg@10': ndcg,
        'precision@10': precision,
        'recall@10': recall,
        'f1@10': f1_val
    }



### Final Model Training + Evaluation

We will retrain the NCF model using the optimal hyperparameters identified through Grid Search, this time on the combined training and validation data. Lastly, the model is then evaluated on the test data.

**Embeddings**

The final model will be intialized using customer and product embeddings that are forming using the (train_data + val_data) as we want our final model to have the most informed embeddings for evaluation on the test set.

In [None]:
def run_final_pipeline_full_data_custom_embeddings(train_data, val_data, test_data, df_reviews, param_grid, project_dir):
    from itertools import product

    cache = {}
    param_keys = list(param_grid.keys())
    grid_log = []
    best_loss = float('inf')
    best_config = None
    best_state = None
    best_metrics = None

    print("\n======================= GRID SEARCH =======================")
    for values in product(*param_grid.values()):
        config = dict(zip(param_keys, values))
        print(f"Running config: {config}")
        model, state, metrics = train_full_model_with_custom_embeddings(train_data, val_data, df_reviews, config, cache)
        row = metrics.copy()
        row.update(config)
        grid_log.append(row)

        if metrics['mse'] < best_loss:
            best_loss = metrics['mse']
            best_config = config
            best_state = state
            best_metrics = metrics

    pd.DataFrame(grid_log).to_csv(os.path.join(project_dir, "Model Results/NCF Custom Embedding/Full Model/grid_search_log.csv"), index=False)
    pd.DataFrame([{**best_metrics, **best_config}]).to_csv(os.path.join(project_dir, "Model Results/NCF Custom Embedding/Full Model/full_model_validation_results.csv"), index=False)

    final_train = pd.concat([train_data, val_data]).reset_index(drop=True)
    embedding_dim = best_config['embedding_dim']
    cust_final = torch.tensor(build_customer_embeddings(final_train, embedding_dim).values, dtype=torch.float32)
    prod_final = torch.tensor(build_product_embeddings(final_train, embedding_dim).values, dtype=torch.float32)

    torch.save(cust_final, os.path.join(project_dir, "Model Results/NCF Custom Embedding/Full Model/best_customer_embedding.pt"))
    torch.save(prod_final, os.path.join(project_dir, "Model Results/NCF Custom Embedding/Full Model/best_product_embedding.pt"))

    model = NCF(cust_final, prod_final, cust_final.clone(), prod_final.clone(), embedding_dim, best_config['dropout_rate']).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    optimizer = torch.optim.Adam(model.parameters(), lr=best_config['learning_rate'])
    criterion = nn.MSELoss()
    loader = DataLoader(ReviewsDataset(final_train), batch_size=best_config['batch_size'], shuffle=True)

    model.train()
    for epoch in range(best_config['num_epochs']):
        epoch_loss = 0
        for batch in loader:
            user = batch['customer_id'].to(model.device)
            item = batch['product_id'].to(model.device)
            label = batch['rating'].to(model.device)
            optimizer.zero_grad()
            loss = criterion(model(user, item), label)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

    test_metrics = evaluate_model(model, test_data)
    pd.DataFrame([{**test_metrics, **best_config}]).to_csv(
        os.path.join(project_dir, "Model Results/NCF Custom Embedding/Full Model/full_model_testing_results.csv"), index=False
    )

    print("====== Validation Metrics For Full Dataset with Custom Embeddings ======")
    print(f"Best Config: {best_config}")
    print(f"MSE: {best_metrics['mse']:.4f}, RMSE: {best_metrics['rmse']:.4f}, NDCG@10: {best_metrics['ndcg@10']:.4f}, Precision@10: {best_metrics['precision@10']:.4f}, Recall@10: {best_metrics['recall@10']:.4f}, F1@10: {best_metrics['f1@10']}\n")

    print("====== Test Metrics After Final Retraining ======")
    print(f"MSE: {test_metrics['mse']:.4f}, RMSE: {test_metrics['rmse']:.4f}, NDCG@10: {test_metrics['ndcg@10']:.4f}, Precision@10: {test_metrics['precision@10']:.4f}, Recall@10: {test_metrics['recall@10']:.4f}, F1@10: {test_metrics['f1@10']}\n")

    with open(os.path.join(project_dir, "Model Results/NCF Custom Embedding/Full Model/results_full_dataset.txt"), "w") as f:
        f.write("====== Validation Metrics For Full Dataset with Custom Embeddings ======\n")
        f.write(f"Best Config: {best_config}\n")
        f.write(f"MSE: {best_metrics['mse']:.4f}, RMSE: {best_metrics['rmse']:.4f}, NDCG@10: {best_metrics['ndcg@10']:.4f}, Precision@10: {best_metrics['precision@10']:.4f}, Recall@10: {best_metrics['recall@10']:.4f}, F1@10: {best_metrics['f1@10']}\n")
        f.write("====== Test Metrics After Final Retraining ======\n")
        f.write(f"MSE: {test_metrics['mse']:.4f}, RMSE: {test_metrics['rmse']:.4f}, NDCG@10: {test_metrics['ndcg@10']:.4f}, Precision@10: {test_metrics['precision@10']:.4f}, Recall@10: {test_metrics['recall@10']:.4f}, F1@10: {test_metrics['f1@10']}\n")


In [None]:
run_final_pipeline_full_data_custom_embeddings(
    train_data=train_data,
    val_data=val_data,
    test_data=test_data,
    df_reviews=df_reviews,
    param_grid = {
        'embedding_dim': [5,6],             # keep both for low vs high capacity
        'learning_rate': [0.001],           # pick one reliable value
        'batch_size': [128, 512],           # small vs large batch
        'dropout_rate': [0.0, 0.3],         # low vs regular dropout
        'num_epochs': [20, 40]              # moderate vs longer training
    },
    project_dir = project_dir)


Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 128, 'dropout_rate': 0.0, 'num_epochs': 20}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 128, 'dropout_rate': 0.0, 'num_epochs': 40}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 128, 'dropout_rate': 0.3, 'num_epochs': 20}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 128, 'dropout_rate': 0.3, 'num_epochs': 40}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 512, 'dropout_rate': 0.0, 'num_epochs': 20}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 512, 'dropout_rate': 0.0, 'num_epochs': 40}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 512, 'dropout_rate': 0.3, 'num_epochs': 20}
Running config: {'embedding_dim': 5, 'learning_rate': 0.001, 'batch_size': 512, 'dropout_rate': 0.3, 'num_epochs': 40}
Running config: {'embedding_dim': 6, 'learning_

# Check Stored Validation and Testing Results

In [3]:
#step5_1_1- NCF Model: Custom Embedding Full Dataset
results_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/Model Results'
cust_emb_full_data_val_results = pd.read_csv(os.path.join(results_dir,"NCF Custom Embedding/Full Model/full_model_validation_results.csv"))
print(cust_emb_full_data_val_results.shape)
print("==============step5_1_1- NCF Model: Custom Embedding Full Dataset Validation Results===============")
display(cust_emb_full_data_val_results.head())


cust_emb_by_full_data_test_results = pd.read_csv(os.path.join(results_dir,"NCF Custom Embedding/Full Model/full_model_testing_results.csv"))
print(cust_emb_by_full_data_test_results.shape)
print("==============step5_1_1- NCF Model: Custom Embedding Full Dataset Testing Results===============")
display(cust_emb_by_full_data_test_results.head())

(1, 11)


Unnamed: 0,mse,rmse,ndcg@10,precision@10,recall@10,f1@10,embedding_dim,learning_rate,batch_size,dropout_rate,num_epochs
0,0.892829,0.94476,0.996364,0.867375,0.876889,0.870588,6,0.001,128,0.0,20


(1, 11)


Unnamed: 0,mse,rmse,ndcg@10,precision@10,recall@10,f1@10,embedding_dim,learning_rate,batch_size,dropout_rate,num_epochs
0,1.859998,1.192249,0.971615,0.859671,0.934425,0.884954,6,0.001,128,0.0,20
