In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from src.utils import TrainTestSplitter, read_pickles, dl_data_pipeline
from src.models import ItemItemModel, BaseModelAverage
from src.metrics import ml_metrics, predictive_metrics, rank_metrics
from tqdm import tqdm
import xgboost as xgb
from sklearn.metrics import label_ranking_average_precision_score, ndcg_score

tqdm.pandas()

In [2]:
# read and prepare data

df_movies, df_users, df_ratings = read_pickles("../../data/ml-1m-after_eda/")
df_all = dl_data_pipeline(df_movies, df_users, df_ratings)

train_data, test_data = train_test_split(df_all.reset_index(drop=True), test_size=0.2, random_state=42)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [3]:
# train test to DMatrix

alltrain = train_data.drop(columns=["UserID", "Rating"])
alltest = test_data.drop(columns=["UserID", "Rating"])

dtrain = xgb.DMatrix(data=alltrain, label=train_data['Rating'])
dtest = xgb.DMatrix(data=alltest, label=test_data['Rating']) # we will create new DMatrix for each group (users' lists)

# specify groups for training
train_groups = train_data.groupby('UserID').size().to_numpy()
dtrain.set_group(train_groups)

# Using XGBoost for Leaning to Rank

Here's why XGBoost is highly effective for this ranking task: dataset has sparse data, where many user-item interactions are missing –  XGBoost efficiently handles sparse data through its sparse-aware split finding algorithm, which can skip over missing values or assign them a default direction in tree splits, thereby optimizing computation and memory usage.

In the context of learning to rank using models like XGBoost, pairwise and NDCG (Normalized Discounted Cumulative Gain) represent two different types of ranking strategies. Here's a breakdown of the differences between the two:

### Pairwise Approach
The pairwise approach focuses on comparing pairs of items at a time during the training process. The fundamental idea is to minimize the number of inversions in ranking — that is, cases where a lower-ranked item (according to the model) should actually be ranked higher than a higher-ranked item (again, according to the model).
It's effective in scenarios where the goal is to maximize the accuracy of item comparisons rather than to achieve an accurate scoring of the items' ranks.

### NDCG Approach
NDCG is a listwise approach that evaluates the entire list of items at once. NDCG measures the gain of each item based on its position in the result list, giving higher importance to hits at higher ranks. This approach directly optimizes the model based on how well it ranks items in the order of their relevance, taking into account the position of items in the ranked list.
Thus, a model optimizing for NDCG tries to place the most relevant items at the top, where their contribution to the score is maximized.

In [9]:
# Training models (we will test both)

param_pairwise = {
    'objective': 'rank:pairwise',
    'learning_rate': 0.1,
    'gamma': 1.0,
    'min_child_weight': 0.1,
    'max_depth': 6,
}

bst_pairwise = xgb.train(param_pairwise, dtrain, num_boost_round=100)

In [10]:
param_ndcg = {
    'objective': 'rank:ndcg',
    'max_depth': 6,
    'eta': 0.1,
    'verbosity': 1
}

bst_ndcg = xgb.train(param_ndcg, dtrain, num_boost_round=100)

### Evaluate the ranking metrics

In [11]:
map_scores = []
ndcg_scores = []

user_ids = test_data['UserID'].unique()

for user_id in tqdm(user_ids):

    # filter
    group =  test_data[test_data['UserID'] == user_id]
    
    # real values
    actual = group['Rating'].values

    # prediction
    group_features = group.drop(['UserID', 'Rating'], axis=1)
    group_labels = group['Rating']
    dtest_group = xgb.DMatrix(data=group_features, label=group_labels)
    preds = bst_pairwise.predict(dtest_group)

    # calc metrics
    binary_actual = (actual >= 4).astype(int)
    map_score = label_ranking_average_precision_score([binary_actual], [preds.argsort()[::-1]])
    map_scores.append(map_score)

    if len(preds) > 1:
        ndcg_score_val = ndcg_score([binary_actual], [preds], k=len(actual))
        ndcg_scores.append(ndcg_score_val)

average_map = np.mean(map_scores)
average_ndcg = np.mean(ndcg_scores) if ndcg_scores else 0.0  # handle cases where ndcg_scores might be empty

print("Pair-Wise:")
print(f"Mean Average Precision (MAP): {average_map}")
print(f"Normalized Discounted Cumulative Gain (NDCG): {average_ndcg}")

100%|██████████| 6036/6036 [00:41<00:00, 143.91it/s]

Pair-Wise:
Mean Average Precision (MAP): 0.6784611491448017
Normalized Discounted Cumulative Gain (NDCG): 0.9068129363607733





In [12]:
map_scores = []
ndcg_scores = []

user_ids = test_data['UserID'].unique()
for user_id in tqdm(user_ids):
    # filter
    group =  test_data[test_data['UserID'] == user_id]
    # real values
    actual = group['Rating'].values

    # prediction
    group_features = group.drop(['UserID', 'Rating'], axis=1)
    group_labels = group['Rating']
    dtest_group = xgb.DMatrix(data=group_features, label=group_labels)
    preds = bst_ndcg.predict(dtest_group)

    # calc metrics
    binary_actual = (actual >= 4).astype(int)
    map_score = label_ranking_average_precision_score([binary_actual], [preds.argsort()[::-1]])
    map_scores.append(map_score)

    if len(preds) > 1:
        ndcg_score_val = ndcg_score([binary_actual], [preds], k=len(actual))
        ndcg_scores.append(ndcg_score_val)

average_map = np.mean(map_scores)
average_ndcg = np.mean(ndcg_scores) if ndcg_scores else 0.0  # handle cases where ndcg_scores might be empty

print("NDCG:")
print(f"Mean Average Precision (MAP): {average_map}")
print(f"Normalized Discounted Cumulative Gain (NDCG): {average_ndcg}")

100%|██████████| 6036/6036 [00:44<00:00, 134.70it/s]

NDCG:
Mean Average Precision (MAP): 0.6779910469828875
Normalized Discounted Cumulative Gain (NDCG): 0.9072433808627288





In [13]:
# save better model
import pickle
model_filename = '../../artifacts/bst_ndcg_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(bst_ndcg, file)
print(f"Model saved to {model_filename}")

Model saved to ../../artifacts/bst_ndcg_model.pkl


# Using Deep Learning Approach

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from  src.models import MovieLensDataset, RankingNetwork

df = df_all.copy()
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
df['user_id'] = user_encoder.fit_transform(df['UserID'])
df['movie_id'] = movie_encoder.fit_transform(df['MovieID'])
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


train_dataset = MovieLensDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)

model = RankingNetwork(len(user_encoder.classes_), len(movie_encoder.classes_))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

def train(model, data_loader, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for users, movies, ratings in data_loader:
            optimizer.zero_grad()
            outputs = model(users, movies).squeeze()
            loss = criterion(outputs, ratings)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}')


In [7]:
train(model, train_loader, epochs=15)

user_id = 0
top_movies_df = model.predict_all_movies(user_id, num_top_movies=5)

print(top_movies_df)

Epoch 1, Loss: 1.3714783945223954
Epoch 2, Loss: 0.878379925808995
Epoch 3, Loss: 0.8360544551433239
Epoch 4, Loss: 0.8187176803709678
Epoch 5, Loss: 0.8077725702878839
Epoch 6, Loss: 0.7999195014522843
Epoch 7, Loss: 0.7927544346163842
Epoch 8, Loss: 0.7861292482719007
Epoch 9, Loss: 0.7798358838449895
Epoch 10, Loss: 0.77430417125071
Epoch 11, Loss: 0.7681412651458003
Epoch 12, Loss: 0.7624278237860858
Epoch 13, Loss: 0.7563189638019447
Epoch 14, Loss: 0.7504911535989758
Epoch 15, Loss: 0.7443604139052212
   MovieID  PredictedRating
0     1620         5.297952
1     2873         5.191571
2     2698         5.134268
3      744         5.133360
4     2816         5.100731


### Evaluation using already defined metrics + MAP + NDCG#

In [8]:
test_dataset = MovieLensDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=True)

test_metrics = model.evaluate(test_loader)
print(test_metrics)

{'mae': 0.727, 'rmse': 0.92, 'precision': 0.842, 'recall': 0.421, 'f1': 0.562, 'roc_auc': 0.657, 'map': 0.575549093276752, 'ndcg': 0.9815455938172669}
