In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import itertools
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import copy
import os
from collections import defaultdict
import random
import itertools
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import ndcg_score as sk_ndcg

# Build Custom Customer and Product Embedding
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

data_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon/data'
project_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon'

# Loading and splitting the data

Previously in step1_data_preprocessing.ipynb, We have split the df_reviews dataset into training, testing and validation samples for each user, by grouping customers together then following chronological order and using the early 70% of each user's interactions for training, followed by the next 15% for validation and the last 15% for testing.

In [3]:
df_reviews = pd.read_csv(os.path.join(data_dir,"filtered_reviews_with_features_and_clusters.csv"))

In [4]:
print(df_reviews.head())

   customer_id  product_id  product_parent  \
0        11960  B00LCJAW06       219600481   
1        11960  B008OTSEXY       682436048   
2        11960  B00KJ15KGY        32170248   
3        11960  B008ZL49WQ       614364353   
4        11960  B002WRGE5O       928204157   

                                       product_title product_category  \
0  Persian-Rugs T1007 Abstract Modern Area Rug Ca...        Furniture   
1  Flash Furniture High Back Black Ribbed Upholst...        Furniture   
2  Jackson Pollock Inspired Coffee Glass Table w/...        Furniture   
3                                  Eaze Lounge Chair        Furniture   
4         Walker Edison L-Shaped Glass Computer Desk        Furniture   

   star_rating  helpful_votes  total_votes vine verified_purchase  ...  \
0            4              1            1    N                 Y  ...   
1            4              0            0    N                 Y  ...   
2            4              1            1    N               

# Mapping IDs in df_reviews

ID mapping for users and products in the df_reviews DataFrame, creating new columns for numerical indices corresponding to each unique user and product.

In [5]:
user2idx = {user_id: idx + 1 for idx, user_id in enumerate(df_reviews['customer_id'].unique())}
item2idx = {item_id: idx + 1 for idx, item_id in enumerate(df_reviews['product_id'].unique())}

df_reviews['user_idx'] = df_reviews['customer_id'].map(user2idx)
df_reviews['item_idx'] = df_reviews['product_id'].map(item2idx)

# Splitting the data for LSTM

- **Padding**: Sequences are padded to ensure a fixed length.
- **Data Splitting**: Users are split into training, validation, and test sets (64%, 16%, 20%).

#### Formation of Training, Validation, and Testing Data
1. **Training Data**: From the data, we select the 80% of the interactions to generate sequences. These sequences, along with positive samples (items the user interacted with), and negative samples (randomly selected items the user did not interact with), are created and added to the training data. These sequences allow the model to learn patterns in user behavior.

2. **Validation Data**: 16% of the interactions are used for validation. Similar to training data, we generate sequences, positive and negative samples, to evaluate how well the model generalizes to unseen data during training.

3. **Testing Data**: 24% of the data is used for testing. For each user in the testing data, the first 80% of interactions are used as model input and last 20% as positive labels. 10 negative items are also sampled. The model does not see this data during training. Positive samples are taken from the user's reviews that were held out for testing, and multiple negative samples are randomly chosen from items the user hasn't interacted with.

This approach ensures that the model is trained on sequences of past interactions, with positive samples (items the user interacted with) and negative samples (items the user did not interact with), preventing the model from seeing any cold-start users or products in the validation and test sets. Also ensures that the model is trained, validated, and tested on distinct sets of data, avoiding overfitting and ensuring generalizability.

In [6]:
max_len = 10 # max seq length for user-product interactions
min_seq_length = 5 # min 5 sequences
max_interactions = 20 # limits to 20 interactions

df_sorted = df_reviews.sort_values(by=['user_idx', 'review_date'])
grouped = df_sorted.groupby('user_idx')

all_items = np.array(df_sorted['item_idx'].unique())
filtered_users = [uid for uid, grp in grouped if len(grp) >= min_seq_length]

# 64% train, 16% val, 20% test
train_val_users, test_users = train_test_split(filtered_users, test_size=0.2, random_state=42)
train_users, val_users = train_test_split(train_val_users, test_size=0.2, random_state=42)

def pad_left(seq, max_len, pad_value=0):
    return [pad_value] * (max_len - len(seq)) + seq

train_data, val_data, test_data = [], [], []

for uid in filtered_users: # for each user with at least 5 interactions
    group = grouped.get_group(uid).sort_values('review_date').iloc[:max_interactions]
    items = group['item_idx'].tolist()
    user_item_set = set(items)
    neg_pool = np.setdiff1d(all_items, list(user_item_set), assume_unique=True) # for negative sampling

    split_point = int(0.8 * len(items))

    for i in range(1, split_point):
        seq = items[max(0, i - max_len):i]
        if len(seq) < 4:
            continue

        seq_padded = pad_left(seq, max_len)
        pos_item = items[i]
        neg_item = np.random.choice(neg_pool)

        if uid in train_users: # training set
            train_data.extend([
                (uid, seq_padded, pos_item, 1),
                (uid, seq_padded, neg_item, 0)
            ])
        elif uid in val_users: # validation set
            val_data.extend([
                (uid, seq_padded, pos_item, 1),
                (uid, seq_padded, neg_item, 0)
            ])

    if uid in test_users: # test set
        seq = items[max(0, split_point - max_len):split_point]
        if len(seq) < 4:
            continue

        seq_padded = pad_left(seq, max_len)
        # test positives
        pos_items = items[split_point:]

        for pos_item in pos_items:
            test_data.append((uid, seq_padded, pos_item, 1))

        # 10 negative samples
        neg_items = np.random.choice(neg_pool, size=10, replace=False)
        for neg_item in neg_items:
            test_data.append((uid, seq_padded, neg_item, 0))

print(f"Train Samples: {len(train_data)}")
print(f"Val Samples: {len(val_data)}")
print(f"Test Samples: {len(test_data)}")

Train Samples: 77002
Val Samples: 19204
Test Samples: 111834


In [7]:
import random
test_user_ids = list(set([row[0] for row in test_data]))
# Randomly sample 20% of test sample
sampled_users = set(random.sample(test_user_ids, int(len(test_user_ids) * 0.2)))
reduced_test_data = [row for row in test_data if row[0] in sampled_users]
test_data = reduced_test_data

print(f"Reduced to {len(test_data)} test samples from {len(sampled_users)} users.")

Reduced to 22417 test samples from 1906 users.


# Creating DataLoader

In [8]:
class PurchaseSequenceDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        uid, seq, candidate, label = self.data[idx][:4]
        return (
            torch.LongTensor(seq),
            torch.LongTensor([candidate]),
            torch.FloatTensor([label]),
            torch.LongTensor([uid])
        )

batch_size = 64

train_dataset = PurchaseSequenceDataset(train_data)
val_dataset   = PurchaseSequenceDataset(val_data)
test_dataset  = PurchaseSequenceDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Training Data Size: {len(train_loader.dataset)}")
print(f"Validation Data Size: {len(val_loader.dataset)}")
print(f"Testing Data Size: {len(test_loader.dataset)}")

Training Data Size: 77002
Validation Data Size: 19204
Testing Data Size: 22417


# Defining the LSTM Model
  
- **Objective**: Predict whether a user will purchase a candidate product based on past interactions (purchase history)
- **Inputs**: User's past purchases; Candidate product
- **Architecture**:  
  - **Embedding Layers**: Convert user and candidate item indices into dense vectors  
  - **LSTM**: Processes the user's purchase sequence to capture sequential dependencies  
  - **Hidden State**: The output from LSTM is combined with the candidate's embedding  
  - **Fully Connected Layer**: Computes a single logit representing the purchase probability  
- **Activation**: **Sigmoid**: Converts the logit into a probability between 0 and 1  
- **Objective**: Binary classification (purchase or not purchase)


In [9]:
class PurchaseCandidateLSTM(nn.Module):
    def __init__(self, num_items, embed_dim, hidden_dim, dropout, num_layers):
        super(PurchaseCandidateLSTM, self).__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )
        self.dropout = nn.Dropout(dropout)
        self.candidate_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=embed_dim, padding_idx=0) # embedding for candidate items
        self.fc = nn.Linear(hidden_dim + embed_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, seq, candidate):
        seq_embedded = self.embedding(seq)                      # (batch_size, seq_len, embed_dim)
        lstm_out, _ = self.lstm(seq_embedded)                   # (batch_size, seq_len, hidden_dim)
        seq_repr = self.dropout(lstm_out[:, -1, :])             # (batch_size, hidden_dim)
        candidate_emb = self.candidate_embedding(candidate).squeeze(1)  # (batch_size, embed_dim)
        combined = torch.cat([seq_repr, candidate_emb], dim=1)  # (batch_size, hidden_dim + embed_dim)
        logits = self.fc(combined)                              # (batch_size, 1)
        return self.sigmoid(logits)                             # (batch_size, 1)

    def init_hidden(self, batch_size, device):
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device) # hidden state
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device) # context state
        return (h0, c0)

# Evaluation Metrics Function

- ndcg_at_k: Computes the NDCG for a single list of relevance labels at rank k.
- mean_ndcg_user_at_k: Computes the mean NDCG@k across all users.
- mean_precision_user_at_k: Computes the mean Precision@k across users, where Precision@k is the proportion of relevant items in the top k predictions.
- mean_recall_user_at_k: Computes the mean Recall@k across users, where Recall@k is the proportion of relevant items retrieved in the top k predictions.
- mean_f1_user_at_k: Computes the mean F1 score at k across users, comparing the predicted top k items with the true labels.

In [10]:
def acc(outputs, labels):
    predicted = (outputs.squeeze() >= 0.5).float()
    return (predicted == labels.squeeze()).sum().item()

def ndcg_at_k(relevances, k):
    relevances = np.asarray(relevances, dtype=np.float64)[:k]
    if relevances.size == 0:
        return 0.0
    dcg = np.sum((2 ** relevances - 1) / np.log2(np.arange(2, relevances.size + 2)))
    ideal_relevances = np.sort(relevances)[::-1]
    idcg = np.sum((2 ** ideal_relevances - 1) / np.log2(np.arange(2, ideal_relevances.size + 2)))
    return dcg / idcg if idcg > 0 else 0.0

def mean_ndcg_user_at_k(all_users, all_preds, all_labels, k=10):
    user_data = defaultdict(list)
    for user, pred, label in zip(all_users, all_preds, all_labels):
        user_data[user].append((pred, label))

    ndcg_scores = []
    for user, entries in user_data.items():
        ranked = sorted(entries, key=lambda x: x[0], reverse=True)
        relevances = [label for _, label in ranked]
        dcg = ndcg_at_k(relevances, k)
        ndcg_scores.append(dcg)

    return np.mean(ndcg_scores)

def mean_precision_user_at_k(all_users, all_preds, all_labels, k=10):
    user_data = defaultdict(list)
    for user, pred, label in zip(all_users, all_preds, all_labels):
        user_data[user].append((pred, label))

    precision_list = []
    for user, entries in user_data.items():
        ranked = sorted(entries, key=lambda x: x[0], reverse=True)
        top_k = ranked[:k]
        precision = sum(label for _, label in top_k) / k
        precision_list.append(precision)

    return np.mean(precision_list) if precision_list else 0.0

def mean_recall_user_at_k(all_users, all_preds, all_labels, k=10):
    user_data = defaultdict(list)
    for user, pred, label in zip(all_users, all_preds, all_labels):
        user_data[user].append((pred, label))

    recall_list = []
    for user, entries in user_data.items():
        total_positives = sum(label for _, label in entries)
        if total_positives == 0:
            recall_list.append(0.0)
            continue

        ranked = sorted(entries, key=lambda x: x[0], reverse=True)
        top_k = ranked[:k]
        retrieved_positives = sum(label for _, label in top_k)
        recall = retrieved_positives / total_positives
        recall_list.append(recall)

    return np.mean(recall_list)


from sklearn.metrics import f1_score

def mean_f1_user_at_k(all_users, all_preds, all_labels, k=10):
    user_data = defaultdict(list)
    for user, pred, label in zip(all_users, all_preds, all_labels):
        user_data[user].append((pred, label))

    f1_list = []
    for user, entries in user_data.items():
        ranked = sorted(entries, key=lambda x: x[0], reverse=True)
        top_k = ranked[:k]
        y_true = [label for _, label in top_k]
        y_pred = [1] * len(top_k)
        if sum(y_true) == 0:
            f1_list.append(0.0)
        else:
            f1_list.append(f1_score(y_true, y_pred, zero_division=0))

    return np.mean(f1_list)

# Training and Evaluation

In [21]:
def train_lstm_model(model, train_loader, val_loader, device, epochs, lr, clip, patience=3):
    best_val_loss = float('inf')
    patience_counter = 0

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)

    for epoch in range(epochs):
        model.train()
        train_loss, train_correct, total_train = 0, 0, 0

        for inputs, candidate, labels, _ in train_loader:
            inputs, candidate, labels = inputs.to(device), candidate.to(device), labels.to(device)

            optimizer.zero_grad()
            output = model(inputs, candidate)
            loss = criterion(output, labels.float())
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            train_loss += loss.item()
            train_correct += acc(output, labels)
            total_train += labels.size(0)

        train_acc = train_correct / total_train
        avg_train_loss = train_loss / len(train_loader)

        # Validation
        model.eval()
        val_loss, val_correct, total_val = 0, 0, 0
        with torch.no_grad():
            for inputs, candidate, labels, _ in val_loader:
                inputs, candidate, labels = inputs.to(device), candidate.to(device), labels.to(device)
                output = model(inputs, candidate)
                loss = criterion(output, labels.float())
                val_loss += loss.item()
                val_correct += acc(output, labels)
                total_val += labels.size(0)

        val_acc = val_correct / total_val
        avg_val_loss = val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"\tTrain Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
        print(f"\tTrain Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%")

        # Early stopping check, when patience = 3
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs.")
                break

In this method, we evaluate the LSTM-based binary classification model that is designed to predict the probability that a user wil purchase each candidate product, given their past purchase sequence. During evaluation, the model scores candidate products for users, including both positive (purchased) and negative (not purchased) items. It then ranks these candidates by predicted probability and computes ranking-based metrics: Precision@K, Recall@K, F1@K, and NDCG@K.

In [12]:
import inspect
def evaluate_model(model, test_loader, device, k=10, threshold=0.5, set_name="test"):
    model.eval()
    correct, total = 0, 0
    all_users, all_preds, all_labels = [], [], []
    user_results = defaultdict(lambda: {'preds': [], 'labels': []})
    criterion = nn.BCELoss()
    total_loss = 0

    expects_user_id = 'user_id' in inspect.signature(model.forward).parameters

    with torch.no_grad():
        for inputs, candidate, labels, user_ids in test_loader:
            inputs, candidate, labels = inputs.to(device), candidate.to(device), labels.to(device)

            if expects_user_id:
                output = model(inputs, candidate, user_ids)
            else:
                output = model(inputs, candidate)

            loss = criterion(output, labels.float())
            total_loss += loss.item()

            preds = output.squeeze().cpu().numpy()
            labs = labels.squeeze().cpu().numpy()
            user_ids = user_ids.squeeze().cpu().numpy()

            preds_binary = (output.squeeze() >= 0.5).float()
            correct += (preds_binary == labels.squeeze()).sum().item()
            total += labels.size(0)

            for u, p, l in zip(user_ids, preds, labs):
                user_results[int(u)]['preds'].append(p)
                user_results[int(u)]['labels'].append(l)

            all_users.extend(user_ids)
            all_preds.extend(preds)
            all_labels.extend(labs)

    # Flat metrics
    binarized_preds = (np.array(all_preds) >= 0.5).astype(int)
    flat_precision = precision_score(all_labels, binarized_preds, zero_division=0)
    flat_recall = recall_score(all_labels, binarized_preds, zero_division=0)
    flat_f1 = f1_score(all_labels, binarized_preds, zero_division=0)

    # Ranking metrics
    prec_at_k = mean_precision_user_at_k(all_users, all_preds, all_labels, k=k)
    rec_at_k = mean_recall_user_at_k(all_users, all_preds, all_labels, k=k)
    f1_at_k = mean_f1_user_at_k(all_users, all_preds, all_labels, k=k)
    ndcg_at_k_val = mean_ndcg_user_at_k(all_users, all_preds, all_labels, k=k)

    # Aggregate & Log
    avg_loss = total_loss / len(test_loader)
    accuracy = correct / total

    print(f"Test Loss: {avg_loss:.4f}")
    print(f"Flat Precision: {flat_precision:.4f} | Recall: {flat_recall:.4f} | F1: {flat_f1:.4f}")
    print(f"Precision@{k}: {prec_at_k:.4f} | Recall@{k}: {rec_at_k:.4f} | F1@{k}: {f1_at_k:.4f} | NDCG@{k}: {ndcg_at_k_val:.4f}")

    metrics = {
        'val_acc': accuracy,
        'loss': avg_loss,
        'flat_precision': flat_precision,
        'flat_recall': flat_recall,
        'flat_f1': flat_f1,
        f'precision@{k}': prec_at_k,
        f'recall@{k}': rec_at_k,
        f'f1@{k}': f1_at_k,
        f'ndcg@{k}': ndcg_at_k_val,
    }

    # Build path and save
    filename = f"{set_name}_evaluation_results.csv"
    output_path = os.path.join(project_dir, "Model Results", "LSTM", filename)
    pd.DataFrame([metrics]).to_csv(output_path, index=False)
    print(f"Evaluation metrics saved to {filename}")

    return metrics

# Hyperparameters

In [22]:
embedding_dim = 64
hidden_dim = 256
dropout = 0.5
num_layers = 2
clip = 5
epochs = 12

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_items = df_reviews['item_idx'].max() + 1  # assumes 1-based indexing after .map()

# Initialize the model
model = PurchaseCandidateLSTM(
    num_items=num_items,
    embed_dim=embedding_dim,
    hidden_dim=hidden_dim,
    dropout=dropout,
    num_layers=num_layers
).to(device)

# Confirm model structure
print(model)

PurchaseCandidateLSTM(
  (embedding): Embedding(73915, 64, padding_idx=0)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (candidate_embedding): Embedding(73915, 64, padding_idx=0)
  (fc): Linear(in_features=320, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [23]:
train_lstm_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    epochs=epochs,
    lr=0.0005,
    clip=clip
)

Epoch 1/12
	Train Loss: 0.6932 | Val Loss: 0.6903
	Train Acc: 51.76% | Val Acc: 52.42%
Epoch 2/12
	Train Loss: 0.6853 | Val Loss: 0.6818
	Train Acc: 54.49% | Val Acc: 54.82%
Epoch 3/12
	Train Loss: 0.6657 | Val Loss: 0.6552
	Train Acc: 58.99% | Val Acc: 60.82%
Epoch 4/12
	Train Loss: 0.6213 | Val Loss: 0.6111
	Train Acc: 66.92% | Val Acc: 67.76%
Epoch 5/12
	Train Loss: 0.5543 | Val Loss: 0.5683
	Train Acc: 74.50% | Val Acc: 71.54%
Epoch 6/12
	Train Loss: 0.4795 | Val Loss: 0.5472
	Train Acc: 78.90% | Val Acc: 72.47%
Epoch 7/12
	Train Loss: 0.4120 | Val Loss: 0.5519
	Train Acc: 83.19% | Val Acc: 72.51%
Epoch 8/12
	Train Loss: 0.3612 | Val Loss: 0.5749
	Train Acc: 86.74% | Val Acc: 72.44%
Epoch 9/12
	Train Loss: 0.3285 | Val Loss: 0.6036
	Train Acc: 87.04% | Val Acc: 72.45%
Early stopping triggered after 9 epochs.


In [24]:
evaluation_results = evaluate_model(
    model=model,
    test_loader=val_loader,
    device=device,
    k=10,  # top-k
    set_name="val"
)

evaluation_results

Test Loss: 0.6036
Flat Precision: 0.7809 | Recall: 0.6241 | F1: 0.6938
Precision@10: 0.2960 | Recall@10: 0.9659 | F1@10: 0.6833 | NDCG@10: 0.9215
Evaluation metrics saved to val_evaluation_results.csv


{'val_acc': 0.724536554884399,
 'loss': 0.6035978040623903,
 'flat_precision': 0.7809486578055773,
 'flat_recall': 0.6241408039991668,
 'flat_f1': 0.6937948599212781,
 'precision@10': np.float32(0.29599863),
 'recall@10': np.float32(0.9658957),
 'f1@10': np.float64(0.6833342476117336),
 'ndcg@10': np.float64(0.9214865971415989)}

In [25]:
evaluation_results = evaluate_model(
    model=model,
    test_loader=test_loader,
    device=device,
    k=10,  # top-k
    set_name="test"
)

evaluation_results

Test Loss: 0.5847
Flat Precision: 0.3751 | Recall: 0.5922 | F1: 0.4593
Precision@10: 0.1620 | Recall@10: 0.9309 | F1@10: 0.2726 | NDCG@10: 0.7422
Evaluation metrics saved to test_evaluation_results.csv


{'val_acc': 0.7911852611857073,
 'loss': 0.5846705032549693,
 'flat_precision': 0.3750943396226415,
 'flat_recall': 0.5921954125707477,
 'flat_f1': 0.4592815062954834,
 'precision@10': np.float32(0.16196223),
 'recall@10': np.float32(0.9308762),
 'f1@10': np.float64(0.272615282583803),
 'ndcg@10': np.float64(0.7421897737023869)}