# 5.3 NCF Negative Sampling

## 5.3.1 Load Dataset
---
The following code loads the full dataset from `filtered_reviews_with_features_and_clusters.csv`, as well as the training, testing and validation datasets from `train_data.csv`, `test_data.csv` and `val_data.csv`.

In [None]:
from google.colab import drive
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from collections import defaultdict
from sklearn.metrics import f1_score


# Attach Google Drive to the Colab runtime, re-mounting if it is already attached
drive.mount('/content/drive', force_remount=True)

# Project root on Drive; the data folder lives directly beneath it
project_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon'
data_dir = f"{project_dir}/data"

Mounted at /content/drive


In [None]:
# Pre-split interaction tables, read in the order train -> test -> val
train_data, test_data, val_data = (
    pd.read_csv(os.path.join(data_dir, f"{split}_data.csv"))
    for split in ("train", "test", "val")
)

# Full review table with engineered features and cluster assignments
reviews_path = os.path.join(data_dir, "filtered_reviews_with_features_and_clusters.csv")
df_reviews = pd.read_csv(reviews_path)

## 5.3.2 Data Preparation
---
The following code converts raw user and item IDs into numerical indices so that they can be used as input to a neural network and generates artificial negative samples (items the user hasn't interacted with) to help the model learn what not to recommend.

NCF models are often used for recommendation systems, and they need to know not just what a user liked (positive samples), but also what they likely weren’t interested in (negative samples). Since real-world data doesn’t always have explicit dislikes, this technique (negative sampling) simulates those negatives to help the model learn more effectively.

In [None]:
# Vocabularies are built from the training split only: every distinct
# customer / product seen in training gets a contiguous integer index.
user2idx = {
    customer: position
    for position, customer in enumerate(train_data['customer_id'].unique())
}
item2idx = {
    product: position
    for position, product in enumerate(train_data['product_id'].unique())
}

# Attach the index columns to each split. IDs that never occur in training
# have no entry in the vocabularies, so `.map` leaves NaN for them.
for split_frame in (train_data, val_data, test_data):
    split_frame['user_idx'] = split_frame['customer_id'].map(user2idx)
    split_frame['item_idx'] = split_frame['product_id'].map(item2idx)

In [None]:
def generate_negative_samples(df, num_neg=1.0, seed=None):
    """Sample unseen (user, item) pairs to act as negative examples.

    For every user in ``df`` the function draws, without replacement, items
    that the user has no row for in ``df`` and labels them with rating 0.0.

    Args:
        df: DataFrame with ``customer_id`` and ``product_id`` columns; each row
            is treated as one positive interaction.
        num_neg: Negatives to draw per positive row of a user. The per-user
            count is ``int(n_positive_rows * num_neg)``.
        seed: Optional integer. When given, a private ``RandomState(seed)`` is
            used so the draw is repeatable; when ``None`` the global
            ``np.random`` state is used (so ``np.random.seed`` still applies).

    Returns:
        DataFrame with columns ``customer_id``, ``product_id``, ``rating``
        where ``rating`` is always 0.0. Users with no unseen items contribute
        no rows.
    """
    all_items = df['product_id'].unique()
    rng = np.random if seed is None else np.random.RandomState(seed)

    # One pass over the frame instead of a full boolean scan per user.
    items_per_user = df.groupby('customer_id', sort=False)['product_id']

    negatives = []
    for user, user_items in tqdm(items_per_user, total=items_per_user.ngroups,
                                 desc='Generating negatives'):
        user_known_items = user_items.tolist()
        known = set(user_known_items)
        # Keep the first-appearance order of `all_items` so the candidate list
        # (and therefore a seeded draw) does not depend on set/hash ordering.
        unknown_items = [item for item in all_items if item not in known]

        # Sampling without replacement cannot return more items than exist,
        # so cap the request instead of letting `choice` raise ValueError.
        n_samples = min(int(len(user_known_items) * num_neg), len(unknown_items))
        if n_samples <= 0:
            continue

        # Draw positions rather than values so item objects keep their type.
        picked = rng.choice(len(unknown_items), size=n_samples, replace=False)
        negatives.extend((user, unknown_items[pos], 0.0) for pos in picked)

    return pd.DataFrame(negatives, columns=['customer_id', 'product_id', 'rating'])

In [None]:
# Seed for NumPy's global RNG so the negative draws start from a fixed state.
NEG_SAMPLING_SEED = 42


def _add_negatives(split_df, num_neg=1.0):
    """Build the positive, negative and combined frames for one split.

    Args:
        split_df: DataFrame with ``customer_id`` and ``product_id`` columns.
        num_neg: Negatives per positive, forwarded to
            ``generate_negative_samples``.

    Returns:
        Tuple ``(positives, negatives, combined)`` where ``positives`` carries
        rating 1.0, ``negatives`` carries rating 0.0 and ``combined`` is their
        concatenation with a fresh index.
    """
    positives = split_df[['customer_id', 'product_id']].copy()
    positives['rating'] = 1.0
    negatives = generate_negative_samples(positives, num_neg=num_neg)
    combined = pd.concat([positives, negatives], ignore_index=True)
    return positives, negatives, combined


np.random.seed(NEG_SAMPLING_SEED)

# Training data
positive_df, negative_df, train_data_with_neg = _add_negatives(train_data, num_neg=1.0)

# Testing data
positive_test_df, negative_test_df, test_data_with_neg = _add_negatives(test_data, num_neg=1.0)

# Validation data
positive_val_df, negative_val_df, val_data_with_neg = _add_negatives(val_data, num_neg=1.0)

Generating negatives: 100%|██████████| 18484/18484 [03:26<00:00, 89.36it/s]
Generating negatives: 100%|██████████| 18484/18484 [02:15<00:00, 136.76it/s]
Generating negatives: 100%|██████████| 18484/18484 [01:12<00:00, 254.64it/s]


In [None]:
# Report each split's size before and after the negatives were appended.
# The second line is padded to the width of the first so the counts line up.
for split_name, original_frame, augmented_frame in (
    ("training", train_data, train_data_with_neg),
    ("testing", test_data, test_data_with_neg),
    ("validation", val_data, val_data_with_neg),
):
    before_label = f"Original {split_name} set size:"
    after_label = "After negative sampling:".ljust(len(before_label))
    print(f"{before_label} {len(original_frame)}")
    print(f"{after_label} {len(augmented_frame)}")

Original training set size: 116086
After negative sampling:    232172
Original testing set size: 43223
After negative sampling:    86446
Original validation set size: 21664
After negative sampling:    43328


In [None]:
# For each augmented split: look up the integer indices, discard rows whose
# customer or product has no entry in the training vocabularies (the lookup
# yields NaN for those), then cast the surviving indices to int.
for augmented_frame in (train_data_with_neg, val_data_with_neg, test_data_with_neg):
    augmented_frame['user_idx'] = augmented_frame['customer_id'].map(user2idx)
    augmented_frame['item_idx'] = augmented_frame['product_id'].map(item2idx)

    augmented_frame.dropna(subset=['user_idx', 'item_idx'], inplace=True)

    for index_column in ('user_idx', 'item_idx'):
        augmented_frame[index_column] = augmented_frame[index_column].astype(int)



## 5.3.3 Data Loader with Negative Sampling
---
We wrap our training, validation, and test data into a custom AmazonDataset class to prepare them for model training. Each sample includes user index, item index, and rating. All three loaders are built from the negative-sampled data; the training loader shuffles it, whereas the validation/test loaders keep it unshuffled for evaluation.


In [None]:
class AmazonDataset(Dataset):
    """Torch dataset over a frame with ``user_idx``, ``item_idx`` and ``rating``.

    The three columns are converted to tensors once at construction time, so
    fetching a sample is a plain tensor index rather than a per-row
    ``DataFrame.iloc`` lookup. Because of this, changes made to ``data`` after
    the dataset is built are not reflected in the samples.

    Args:
        data: DataFrame providing integer-valued ``user_idx`` / ``item_idx``
            columns and a numeric ``rating`` column. Missing values in the
            index columns make the integer conversion raise.
    """

    def __init__(self, data):
        self.data = data
        # Positional (not label-based) order, matching what `iloc` would give.
        self._user_idx = torch.tensor(data['user_idx'].to_numpy(dtype='int64'), dtype=torch.long)
        self._item_idx = torch.tensor(data['item_idx'].to_numpy(dtype='int64'), dtype=torch.long)
        self._rating = torch.tensor(data['rating'].to_numpy(dtype='float32'), dtype=torch.float)

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return one sample as a dict of scalar tensors.

        Keys are ``customer_id`` (user index, long), ``product_id`` (item
        index, long) and ``rating`` (float).
        """
        return {
            'customer_id': self._user_idx[idx],
            'product_id': self._item_idx[idx],
            'rating': self._rating[idx]
        }

batch_size = 512

# Wrap each negative-sampled split in an AmazonDataset
train_dataset_with_neg, test_dataset_with_neg, val_dataset_with_neg = map(
    AmazonDataset,
    (train_data_with_neg, test_data_with_neg, val_data_with_neg),
)

# Only the training loader shuffles; test and validation keep row order
train_loader_with_neg = DataLoader(
    dataset=train_dataset_with_neg, batch_size=batch_size, shuffle=True
)
test_loader_with_neg = DataLoader(
    dataset=test_dataset_with_neg, batch_size=batch_size, shuffle=False
)
val_loader_with_neg = DataLoader(
    dataset=val_dataset_with_neg, batch_size=batch_size, shuffle=False
)

## 5.3.4 Create Model
---
We implement the NCF architecture that combines:

*   GMF (Generalized Matrix Factorization): Element-wise product of user and item embeddings
*   MLP (Multi-Layer Perceptron): Deep interaction modeling using concatenated embeddings

These are merged and passed through fully connected layers to predict interaction probabilities.

In [None]:
class NCF(nn.Module):
    """Neural Collaborative Filtering model with a GMF branch and an MLP branch.

    The GMF branch multiplies user and item embeddings element-wise; the MLP
    branch concatenates a second pair of embeddings and passes them through
    two fully connected layers. Both branch outputs are concatenated and fed
    through two more layers ending in a sigmoid.

    Args:
        num_users: Number of rows in the user embedding tables.
        num_items: Number of rows in the item embedding tables.
        embedding_dim: Width of every embedding vector.
    """

    def __init__(self, num_users, num_items, embedding_dim):
        super().__init__()
        # Separate embedding tables for each branch
        self.user_embeddings_gmf = nn.Embedding(num_users, embedding_dim)
        self.item_embeddings_gmf = nn.Embedding(num_items, embedding_dim)

        self.user_embeddings_mlp = nn.Embedding(num_users, embedding_dim)
        self.item_embeddings_mlp = nn.Embedding(num_items, embedding_dim)

        # MLP branch: (user ++ item) -> 128 -> 64
        self.fc1_mlp = nn.Linear(2 * embedding_dim, 128)
        self.fc2_mlp = nn.Linear(128, 64)

        # Fusion head: (GMF output ++ MLP output) -> 128 -> 1
        self.fc1_combined = nn.Linear(embedding_dim + 64, 128)
        self.fc2_combined = nn.Linear(128, 1)

        self.dropout = nn.Dropout(0.2)

    def forward(self, user_id, item_id):
        """Score user/item index pairs.

        Args:
            user_id: Long tensor of user indices.
            item_id: Long tensor of item indices, same shape as ``user_id``.

        Returns:
            Tensor of sigmoid outputs in [0, 1] with the same shape as
            ``user_id`` (the trailing size-1 feature dimension is removed).
        """
        user_emb_gmf = self.user_embeddings_gmf(user_id)
        item_emb_gmf = self.item_embeddings_gmf(item_id)
        gmf_output = user_emb_gmf * item_emb_gmf

        user_emb_mlp = self.user_embeddings_mlp(user_id)
        item_emb_mlp = self.item_embeddings_mlp(item_id)
        mlp_input = torch.cat([user_emb_mlp, item_emb_mlp], dim=-1)
        mlp_output = self.dropout(torch.relu(self.fc1_mlp(mlp_input)))
        mlp_output = self.dropout(torch.relu(self.fc2_mlp(mlp_output)))

        combined_input = torch.cat([gmf_output, mlp_output], dim=-1)
        combined_output = self.dropout(torch.relu(self.fc1_combined(combined_input)))
        prediction = torch.sigmoid(self.fc2_combined(combined_output))

        # Drop only the trailing feature dimension. A bare squeeze() would also
        # drop the batch dimension when the batch holds a single sample, leaving
        # a 0-dim tensor that no longer matches the label shape.
        return prediction.squeeze(-1)

In [None]:
# Prefer the GPU whenever PyTorch can see one
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Embedding-table sizes follow the training vocabularies
num_users, num_items = len(user2idx), len(item2idx)
embedding_dim = 32  # you can tune this

model = NCF(num_users=num_users, num_items=num_items, embedding_dim=embedding_dim).to(device)

# NOTE(review): training minimises squared error on the sigmoid outputs, while
# evaluate_model scores with binary cross-entropy - confirm this is intended.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

## 5.3.5 Training the NCF Model
---
We define a train_model function to train the NCF model over several epochs using the training DataLoader. For each batch, it computes predictions, calculates loss, and updates weights via backpropagation.

In [None]:
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=10):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Args:
        model: Module called as ``model(user, item)``.
        train_loader: Sized iterable of batches; each batch is a dict with the
            tensor entries ``customer_id``, ``product_id`` and ``rating``.
        criterion: Loss called as ``criterion(prediction, label)``.
        optimizer: Optimizer over the model's parameters.
        device: Device the model and every batch are moved to.
        num_epochs: Number of epochs to run.

    Returns:
        List with one entry per epoch: the mean of that epoch's per-batch
        losses (``nan`` for an epoch over an empty loader). The same value is
        printed after each epoch.
    """
    model = model.to(device)
    num_batches = len(train_loader)
    epoch_losses = []

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        for batch in train_loader:
            user = batch['customer_id'].to(device)
            item = batch['product_id'].to(device)
            label = batch['rating'].to(device)

            optimizer.zero_grad()
            # Align the output with the label shape so a model that squeezes a
            # single-sample batch to 0-dim (or returns (B, 1)) cannot trigger a
            # shape error or silent broadcasting inside the loss.
            prediction = model(user, item).reshape(label.shape)
            loss = criterion(prediction, label)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # An empty loader has no batches to average over.
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        epoch_losses.append(avg_loss)
        print(f"Epoch {epoch+1}/{num_epochs} | Loss: {avg_loss:.4f}")

    return epoch_losses

In [None]:
# Fit the NCF model on the negative-sampled training loader
train_model(
    model=model,
    train_loader=train_loader_with_neg,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    num_epochs=10,
)

Epoch 1/10 | Loss: 0.2480
Epoch 2/10 | Loss: 0.2303
Epoch 3/10 | Loss: 0.2130
Epoch 4/10 | Loss: 0.1974
Epoch 5/10 | Loss: 0.1808
Epoch 6/10 | Loss: 0.1624
Epoch 7/10 | Loss: 0.1427
Epoch 8/10 | Loss: 0.1227
Epoch 9/10 | Loss: 0.1033
Epoch 10/10 | Loss: 0.0856


## 5.3.6 Evaluation Functions
---
The evaluation functions and metrics are the same as what was explained in `step5_1_1 - NCF Model: Custom Embedding Full Dataset.ipynb`.

- **ndcg_at_k**: Computes the Normalized Discounted Cumulative Gain (NDCG) at rank k for a single ranked list of relevance scores. If the list contains fewer than k items, it uses actual_k = min(k, len(relevances)) to ensure fair computation.

- **mean_ndcg_user_at_k**: Computes the mean NDCG@k across all users by grouping predicted scores and relevance labels per user, sorting by prediction, and applying ndcg_at_k. For each user, their items are sorted by predicted scores, and NDCG is computed using `ndcg_at_k` with actual_k = min(k, len(user_items)).

- **mean_precision_user_at_k**: Computes the mean Precision@k across all users.
Precision@k is the proportion of relevant items (e.g., rating ≥ threshold) among the top-k predicted items for each user. For each user, top-k items are selected based on predicted scores. If the user has fewer than k items, actual_k = min(k, len(user_items)) is used.  
  Precision is calculated as:  
  `precision = (# of relevant items among top-k) / actual_k`  
  where an item is considered relevant if `rating ≥ threshold`.

- **mean_recall_user_at_k**: Computes the mean Recall@k across all users.
Recall@k is the proportion of a user's relevant items (rating ≥ threshold) that are retrieved in the top-k predicted list. For each user, top-k items are selected based on predicted scores, and recall is calculated as:  
  `recall = (# of relevant items among top-k) / total number of relevant items for the user`  
  actual_k = min(k, len(user_items)) is used to handle users with fewer than k items.

- **mean_f1_user_at_k**:  
  Computes the mean F1@k across all users, where F1 combines precision and recall.  
  For each user, top-k items are selected (using actual_k = min(k, len(user_items))), and F1 is calculated based on binarized relevance labels (`rating ≥ threshold`).  
  The predicted labels are assumed to be all 1s (i.e., every top-k item is predicted as relevant).

In [None]:
def ndcg_at_k(relevances, k):
    """Normalized Discounted Cumulative Gain at rank ``k`` for one ranked list.

    Args:
        relevances: Relevance values in predicted rank order (best first).
        k: Rank cutoff. When the list is shorter than ``k`` the whole list is
            used.

    Returns:
        DCG of the first ``k`` entries divided by the DCG of the best possible
        ordering of the *entire* list, cut at ``k``. Returns 0.0 for an empty
        list, a non-positive ``k``, or when the ideal DCG is zero.
    """
    relevances = np.asarray(relevances, dtype=np.float64)
    actual_k = min(k, len(relevances))
    if actual_k <= 0:
        return 0.0

    discounts = np.log2(np.arange(2, actual_k + 2))
    dcg = np.sum((2 ** relevances[:actual_k] - 1) / discounts)

    # The ideal ranking must be taken over the full list before truncating.
    # Sorting only the already-truncated top-k ignores relevant items ranked
    # below k and therefore overstates the score.
    ideal_relevances = np.sort(relevances)[::-1][:actual_k]
    idcg = np.sum((2 ** ideal_relevances - 1) / discounts)
    return dcg / idcg if idcg > 0 else 0.0

def mean_ndcg_user_at_k(all_users, all_preds, all_labels, k=10):
    """Average NDCG@k over users.

    The three inputs are parallel sequences. Entries are grouped by user,
    each user's entries are ordered by descending prediction, and the
    resulting relevance list is scored with ``ndcg_at_k``.

    Returns:
        Mean of the per-user scores, or 0.0 when there are no users.
    """
    pairs_by_user = defaultdict(list)
    for user, score, relevance in zip(all_users, all_preds, all_labels):
        pairs_by_user[user].append((score, relevance))

    per_user_scores = []
    for pairs in pairs_by_user.values():
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        per_user_scores.append(ndcg_at_k([relevance for _, relevance in ranked], k))

    if not per_user_scores:
        return 0.0
    return np.mean(per_user_scores)

def mean_precision_user_at_k(all_users, all_preds, all_labels, k=10, threshold=4):
    """Average Precision@k over users.

    The three inputs are parallel sequences. For each user the entries are
    ordered by descending prediction and the share of the first
    ``min(k, n_entries)`` whose label is at least ``threshold`` is taken.

    Returns:
        Mean of the per-user precisions, or 0.0 when there are no users.
    """
    scored_by_user = defaultdict(list)
    for user, score, label in zip(all_users, all_preds, all_labels):
        scored_by_user[user].append((score, label))

    per_user = []
    for scored in scored_by_user.values():
        ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
        cutoff = min(k, len(ranked))
        if cutoff > 0:
            hits = [1 if label >= threshold else 0 for _, label in ranked[:cutoff]]
            per_user.append(np.sum(hits) / cutoff)
        else:
            per_user.append(0)

    if not per_user:
        return 0.0
    return np.mean(per_user)

def mean_recall_user_at_k(all_users, all_preds, all_labels, k=10, threshold=4):
    """Average Recall@k over users.

    The three inputs are parallel sequences. For each user, the number of
    entries with label at least ``threshold`` among the first
    ``min(k, n_entries)`` (by descending prediction) is divided by the user's
    total number of such entries. A user with none contributes 0.0.

    Returns:
        Mean of the per-user recalls, or 0.0 when there are no users.
    """
    scored_by_user = defaultdict(list)
    for user, score, label in zip(all_users, all_preds, all_labels):
        scored_by_user[user].append((score, label))

    def _count_relevant(pairs):
        # Number of (score, label) pairs whose label reaches the threshold
        return np.sum([1 if label >= threshold else 0 for _, label in pairs])

    per_user = []
    for scored in scored_by_user.values():
        ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
        cutoff = min(k, len(ranked))

        relevant_total = _count_relevant(scored)
        relevant_in_top = _count_relevant(ranked[:cutoff])

        per_user.append(0.0 if relevant_total == 0 else relevant_in_top / relevant_total)

    if not per_user:
        return 0.0
    return np.mean(per_user)

def mean_f1_user_at_k(all_users, all_preds, all_labels, k=10, threshold=4):
    """Average F1@k over users.

    The three inputs are parallel sequences. For each user the first
    ``min(k, n_entries)`` entries by descending prediction are kept, their
    labels are binarised with ``label >= threshold``, and ``f1_score`` is
    computed against an all-ones prediction vector. Only the top-k slice is
    passed to ``f1_score``; entries ranked below the cutoff are not seen by it.

    Returns:
        Mean of the per-user F1 values, or 0.0 when there are no users.
    """
    scored_by_user = defaultdict(list)
    for user, score, label in zip(all_users, all_preds, all_labels):
        scored_by_user[user].append((score, label))

    def _user_f1(scored):
        # F1 of the top-k slice, treating every retrieved item as a positive
        ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
        cutoff = min(k, len(ranked))
        truth = [int(label >= threshold) for _, label in ranked[:cutoff]]
        return f1_score(truth, [1] * cutoff, zero_division=0)

    per_user = [_user_f1(scored) for scored in scored_by_user.values()]
    if not per_user:
        return 0.0
    return np.mean(per_user)

In [None]:
def evaluate_model(model, test_data, batch_size=512):
    """Score ``model`` on ``test_data`` and return error and ranking metrics.

    Args:
        model: Module called as ``model(user, item)`` that returns values in
            [0, 1] (the binary cross-entropy below requires that range).
        test_data: DataFrame accepted by ``AmazonDataset`` (``user_idx``,
            ``item_idx`` and ``rating`` columns).
        batch_size: Batch size of the evaluation loader.

    Returns:
        Dict with keys ``mse``, ``rmse``, ``ndcg@10``, ``precision@10``,
        ``recall@10``, ``f1@10`` and ``bce``. ``mse``/``rmse`` are the (root)
        mean squared error over all samples and ``bce`` is the mean binary
        cross-entropy over all samples. For an empty ``test_data`` the error
        metrics are ``nan`` and the ranking metrics are 0.0.
    """
    # Run on whatever device the model already lives on; guessing it from CUDA
    # availability breaks when the model was left on the CPU.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()

    test_loader = DataLoader(AmazonDataset(test_data), batch_size=batch_size)
    pred_chunks, label_chunks, user_chunks = [], [], []

    with torch.no_grad():
        for batch in test_loader:
            user = batch['customer_id'].to(device)
            item = batch['product_id'].to(device)
            label = batch['rating'].to(device)
            # Keep the batch dimension even if the model squeezes a
            # single-sample batch down to a 0-dim tensor.
            output = model(user, item).reshape(label.shape)  # Already has sigmoid inside

            pred_chunks.append(output.cpu().numpy())
            label_chunks.append(label.cpu().numpy())
            user_chunks.append(user.cpu().numpy())

    if not pred_chunks:
        nan = float('nan')
        return {
            'mse': nan,
            'rmse': nan,
            'ndcg@10': 0.0,
            'precision@10': 0.0,
            'recall@10': 0.0,
            'f1@10': 0.0,
            'bce': nan
        }

    preds = np.concatenate(pred_chunks)
    labels = np.concatenate(label_chunks)
    users = np.concatenate(user_chunks)

    # 'mse' is the actual mean squared error (so rmse == sqrt(mse)); the
    # cross-entropy is reported separately under 'bce' and averaged per
    # sample rather than per batch.
    mse = np.mean((preds - labels) ** 2)
    rmse = np.sqrt(mse)
    bce = nn.functional.binary_cross_entropy(
        torch.from_numpy(preds), torch.from_numpy(labels)
    ).item()

    ndcg = mean_ndcg_user_at_k(users, preds, labels, k=10)
    precision = mean_precision_user_at_k(users, preds, labels, k=10, threshold=0.5)
    recall = mean_recall_user_at_k(users, preds, labels, k=10, threshold=0.5)
    f1_val = mean_f1_user_at_k(users, preds, labels, k=10, threshold=0.5)

    return {
        'mse': mse,
        'rmse': rmse,
        'ndcg@10': ndcg,
        'precision@10': precision,
        'recall@10': recall,
        'f1@10': f1_val,
        'bce': bce
    }

In [None]:
def get_predictions_dataframe(model, data_df, batch_size=512):
    """Run ``model`` over ``data_df`` and return its predictions as a table.

    Args:
        model: Module called as ``model(user_idx, item_idx)``.
        data_df: DataFrame accepted by ``AmazonDataset`` (``user_idx``,
            ``item_idx`` and ``rating`` columns).
        batch_size: Batch size of the prediction loader.

    Returns:
        DataFrame with columns ``customer_id``, ``product_id``,
        ``predicted_score`` and ``actual_label``. The two ID columns are
        recovered from the indices through the inverted module-level
        ``user2idx`` / ``item2idx`` mappings. Rows keep the order of
        ``data_df``.
    """
    # Run on whatever device the model already lives on; guessing it from CUDA
    # availability breaks when the model was left on the CPU.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()

    loader = DataLoader(AmazonDataset(data_df), batch_size=batch_size)
    user_chunks, item_chunks, pred_chunks, label_chunks = [], [], [], []

    with torch.no_grad():
        for batch in loader:
            user_idx = batch['customer_id'].to(device)
            item_idx = batch['product_id'].to(device)
            rating = batch['rating'].to(device)

            # Keep the batch dimension even if the model squeezes a
            # single-sample batch down to a 0-dim tensor.
            scores = model(user_idx, item_idx).reshape(rating.shape)

            user_chunks.append(user_idx.cpu().numpy())
            item_chunks.append(item_idx.cpu().numpy())
            pred_chunks.append(scores.cpu().numpy())
            label_chunks.append(rating.cpu().numpy())

    def _stack(chunks, dtype):
        # Join per-batch arrays into one column; tolerate an empty loader
        if not chunks:
            return np.empty(0, dtype=dtype)
        return np.concatenate(chunks).astype(dtype)

    # Scores and labels are widened to float64 so the values written to CSV
    # look the same as when the frame was assembled row by row.
    df = pd.DataFrame({
        'user_idx': _stack(user_chunks, np.int64),
        'item_idx': _stack(item_chunks, np.int64),
        'predicted_score': _stack(pred_chunks, np.float64),
        'actual_label': _stack(label_chunks, np.float64),
    })

    # Map back to original customer_id and product_id
    inv_user_map = {v: k for k, v in user2idx.items()}
    inv_item_map = {v: k for k, v in item2idx.items()}

    df['customer_id'] = df['user_idx'].map(inv_user_map)
    df['product_id'] = df['item_idx'].map(inv_item_map)
    return df[['customer_id', 'product_id', 'predicted_score', 'actual_label']]

In [None]:
# Get prediction DataFrames
val_preds_df = get_predictions_dataframe(model, val_data_with_neg)
test_preds_df = get_predictions_dataframe(model, test_data_with_neg)

# Save to CSV
results_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/Model Results'

# Build each destination once so the printed location is exactly where the
# file was written (the files live in the "NCF Negative Sampling" subfolder).
val_preds_path = os.path.join(results_dir, "NCF Negative Sampling/full_model_val_predictions.csv")
test_preds_path = os.path.join(results_dir, "NCF Negative Sampling/full_model_test_predictions.csv")

# Make sure the subfolder exists before writing into it
os.makedirs(os.path.dirname(val_preds_path), exist_ok=True)

val_preds_df.to_csv(val_preds_path, index=False)
test_preds_df.to_csv(test_preds_path, index=False)

print("✅ Predictions saved to:")
print(f"- Validation: {val_preds_path}")
print(f"- Testing:    {test_preds_path}")


✅ Predictions saved to:
- Validation: /content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/Model Results/val_predictions.csv
- Testing:    /content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/Model Results/test_predictions.csv


In [None]:
# Get Evaluation DataFrames (one single-row table per split)
val_metrics = evaluate_model(model, val_data_with_neg)
test_metrics = evaluate_model(model, test_data_with_neg)
val_eval_df = pd.DataFrame([val_metrics])
test_eval_df = pd.DataFrame([test_metrics])

# Save to CSV
results_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/Model Results'

val_results_file = os.path.join(results_dir, "NCF Negative Sampling/full_model_validation_results.csv")
test_results_file = os.path.join(results_dir, "NCF Negative Sampling/full_model_testing_results.csv")

val_eval_df.to_csv(val_results_file, index=False)
test_eval_df.to_csv(test_results_file, index=False)

# Tell the reader where the two files ended up
print("✅ Evaluation results saved to:")
print(f"- Validation: {results_dir}/NCF Negative Sampling/full_model_validation_results.csv")
print(f"- Testing:    {results_dir}/NCF Negative Sampling/full_model_testing_results.csv")


✅ Evaluation results saved to:
- Validation: /content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/Model Results/NCF Negative Sampling/full_model_validation_results.csv
- Testing:    /content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/Model Results/NCF Negative Sampling/full_model_testing_results.csv
