In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from src.utils import TrainTestSplitter, read_pickles, dl_data_pipeline
from src.models import ItemItemModel, BaseModelAverage
from src.metrics import ml_metrics, predictive_metrics, rank_metrics
from tqdm import tqdm
import xgboost as xgb

tqdm.pandas()

In [6]:
df_movies, df_users, df_ratings = read_pickles("../../data/ml-1m-after_eda/")

In [7]:
df_movies.head()

Unnamed: 0,MovieID,Title,Genres,Year,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,"[Animation, Children's, Comedy]",1995,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,"[Comedy, Drama]",1995,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II,[Comedy],1995,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
df_users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-Code,State,Latitude,Longitude
0,1,F,1,10,48067,MI,42.488735,-83.13752
1,2,M,56,16,70072,LA,29.869283,-90.10933
2,3,M,25,15,55117,MN,44.989065,-93.10666
3,4,M,45,7,2460,MA,42.352996,-71.20907
4,5,M,25,20,55455,MN,44.971965,-93.23588


In [9]:
df_ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Datetime,Date
0,1,1193,5,978300760,2000-12-31 22:12:40,2000-12-31
1,1,661,3,978302109,2000-12-31 22:35:09,2000-12-31
2,1,914,3,978301968,2000-12-31 22:32:48,2000-12-31
3,1,3408,4,978300275,2000-12-31 22:04:35,2000-12-31
4,1,2355,5,978824291,2001-01-06 23:38:11,2001-01-06


In [10]:
df_all = dl_data_pipeline(df_movies, df_users, df_ratings)

# Using XGBoost for Leaning to Rank

XGBoost is considered one of the best tools for implementing learning to rank models due to several strengths it has particularly suited for ranking tasks, such as those encountered with the MovieLens dataset. Here's why XGBoost is highly effective for these tasks:

MovieLens and similar datasets often feature sparse data, where many user-item interactions are missing (i.e., most users have not rated most movies). XGBoost efficiently handles sparse data through its sparse-aware split finding algorithm, which can skip over missing values or assign them a default direction in tree splits, thereby optimizing computation and memory usage.



In [11]:
# Resetting index when splitting data
train_data, test_data = train_test_split(df_all.reset_index(drop=True), test_size=0.2, random_state=42)


In [12]:
user_rating_avg = train_data.groupby('UserID')['Rating'].mean().to_dict()
movie_rating_avg = train_data.groupby('MovieID')['Rating'].mean().to_dict()

# train_data['user_avg'] = train_data['UserID'].apply(lambda x: user_rating_avg.get(x, 0))
# train_data['movie_avg'] = train_data['MovieID'].apply(lambda x: movie_rating_avg.get(x, 0))
# train_data.head()
alltrain = train_data.drop(columns=["UserID", "Rating"])

In [13]:
user_rating_avg = test_data.groupby('UserID')['Rating'].mean().to_dict()
movie_rating_avg = test_data.groupby('UserID')['Rating'].mean().to_dict()

# test_data['user_avg'] = test_data['UserID'].apply(lambda x: user_rating_avg.get(x, 0))
# test_data['movie_avg'] = test_data['MovieID'].apply(lambda x: movie_rating_avg.get(x, 0))
alltest = test_data.drop(columns=["UserID", "Rating"])

In [20]:
dtrain = xgb.DMatrix(data=alltrain, label=train_data['Rating'])
dtest = xgb.DMatrix(data=alltest, label=test_data['Rating'])

In the context of learning to rank using models like XGBoost, pairwise and NDCG (Normalized Discounted Cumulative Gain) represent two different types of ranking strategies. Both approaches aim to optimize the order of items but do so using different methodologies and objectives. Here's a breakdown of the differences between the two:

### Pairwise Approach
*Concept*: The pairwise approach focuses on comparing pairs of items at a time during the training process. The fundamental idea is to minimize the number of inversions in ranking — that is, cases where a lower-ranked item (according to the model) should actually be ranked higher than a higher-ranked item (again, according to the model).

*Objective*: The model learns by comparing every pair of items within the same query or user session and attempts to correctly order each pair. The loss function typically penalizes the model more when it incorrectly orders a pair that is relatively close in the true order and less when the pair is far apart.

*Suitability*: This approach is useful when the relative order between items is more important than the actual rank positions or the magnitude of scores. It's effective in scenarios where the goal is to maximize the accuracy of item comparisons rather than to achieve an accurate scoring of the items' ranks.

### NDCG Approach
*Concept*: NDCG is a listwise approach that evaluates the entire list of items at once. NDCG measures the gain of each item based on its position in the result list, giving higher importance to hits at higher ranks. This approach directly optimizes the model based on how well it ranks items in the order of their relevance, taking into account the position of items in the ranked list.

*Objective*: The NDCG loss function is designed to maximize the gain from highly relevant items appearing at the top of the list. The gain is discounted at lower ranks, which reflects the reduced utility of items found lower in the list. Thus, a model optimizing for NDCG tries to place the most relevant items at the top, where their contribution to the score is maximized.

*Suitability*: NDCG is particularly effective in situations where the quality of the top-ranked results is much more important than the overall order of all items. This makes it highly relevant for search engines and recommendation systems where the top few results are critical for user satisfaction.

In [198]:
train_data.head()

Unnamed: 0,UserID,MovieID,Rating,AvgUserRating,AvgMovieRating,Gender_F,Gender_M,Age_1,Age_18,Age_25,...,Decade_1910,Decade_1920,Decade_1930,Decade_1940,Decade_1950,Decade_1960,Decade_1970,Decade_1980,Decade_1990,Decade_2000
416292,3283,780,4,3.955128,3.539604,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
683230,1272,2018,4,3.522388,3.733831,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2434,6036,3398,3,3.499232,,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
688533,1680,1608,2,3.711226,3.63807,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
472584,2941,1081,2,3.22619,3.7,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [199]:
param = {
   'objective': 'rank:pairwise',
    'learning_rate': 0.1,
    'gamma': 1.0,
    'min_child_weight': 0.1,
    'max_depth': 6,
    'n_estimators': 100
}

bst_pairwise = xgb.train(param, dtrain, num_boost_round=100)


Parameters: { "n_estimators" } are not used.



In [200]:
param = {
    'objective': 'rank:ndcg',
    'max_depth': 6,
    'eta': 0.1,
    'verbosity': 1
}

bst_ndcg = xgb.train(param, dtrain, num_boost_round=100)

In [201]:
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score, ndcg_score

grouped_test_data = test_data.groupby('UserID')

map_scores = []
ndcg_scores = []

for user_id, group in grouped_test_data:
    actual = group['Rating'].values
    preds = predictions_pairwise[group.index]
    
    binary_actual = (actual >= 4).astype(int)
    
    map_score = label_ranking_average_precision_score([binary_actual], [preds.argsort()[::-1]])
    map_scores.append(map_score)
    
    if len(preds) != 1:
        ndcg_score_val = ndcg_score([actual], [preds], k=len(actual))
        ndcg_scores.append(ndcg_score_val)

average_map = np.mean(map_scores)
average_ndcg = np.mean(ndcg_scores)

print(f"Mean Average Precision (MAP): {average_map}")
print(f"Normalized Discounted Cumulative Gain (NDCG): {average_ndcg}")

RMSE Pairwise: 1.9334395149329682
RMSE NDCG: 2.3030017570154406




# Using Deep Learning Approach

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from  src.models import MovieLensDataset, RankingNetwork

df = df_all.copy()
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
df['user_id'] = user_encoder.fit_transform(df['UserID'])
df['movie_id'] = movie_encoder.fit_transform(df['MovieID'])
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


train_dataset = MovieLensDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)

model = RankingNetwork(len(user_encoder.classes_), len(movie_encoder.classes_))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

def train(model, data_loader, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for users, movies, ratings in data_loader:
            optimizer.zero_grad()
            outputs = model(users, movies).squeeze()
            loss = criterion(outputs, ratings)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}')


In [15]:
train(model, train_loader, epochs=15)

user_id = 0
top_movies_df = model.predict_all_movies(user_id, num_top_movies=5)

print(top_movies_df)

Epoch 1, Loss: 1.5028314396729472
Epoch 2, Loss: 0.8855442481779243
Epoch 3, Loss: 0.8385911188824239
Epoch 4, Loss: 0.8197804151707099
Epoch 5, Loss: 0.8080766064496813
Epoch 6, Loss: 0.7993000290444168
Epoch 7, Loss: 0.7917930690508505
Epoch 8, Loss: 0.7849283513165558
Epoch 9, Loss: 0.7786643470012448
Epoch 10, Loss: 0.7725215507331599
Epoch 11, Loss: 0.7661493194843055
Epoch 12, Loss: 0.7604338166352197
Epoch 13, Loss: 0.7543875400408369
Epoch 14, Loss: 0.7480800722702451
Epoch 15, Loss: 0.7418567707579791
   MovieID  PredictedRating
0     2309         5.337345
1       51         5.325202
2     2158         5.250466
3     2722         5.185671
4      647         5.045784


## Evaluation using already defined metrics + MAP + NDCG#

In [16]:
test_dataset = MovieLensDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=True)

test_metrics = model.evaluate(test_loader)
print(test_metrics)

{'mae': np.float32(0.726), 'rmse': np.float32(0.92), 'precision': np.float64(0.837), 'recall': np.float64(0.427), 'f1': np.float64(0.566), 'roc_auc': np.float64(0.657), 'map': np.float64(0.5742667028048124), 'ndcg': np.float64(0.9815921307624851)}
