In [1]:
import numpy as np
import pandas as pd

from src.utils import read_pickles, dl_data_pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import label_ranking_average_precision_score, ndcg_score
from sklearn.model_selection import train_test_split

import pickle
import tqdm
import xgboost as xgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Part 1: Train User-User filtering 

In [2]:
# read data
df_movies, df_users, df_ratings = read_pickles("../../data/ml-1m-after_eda/")
train_ratings, _ = train_test_split(df_ratings, test_size=0.2, shuffle=False)
print(f"Train shape: {train_ratings.shape}")

Train shape: (800167, 6)


In [3]:
# training of user-user filtering

def get_close_users(train_ratings, n_neighbors=30):
    # compute average ratings per user
    users_avrg = train_ratings.groupby('UserID')['Rating'].mean().to_dict()
    
    # create rating matrix
    rating_matrix = train_ratings.pivot_table(index='UserID', columns='MovieID', values='Rating')

    # calculate similarity 
    user_similarity = cosine_similarity(rating_matrix.fillna(0))
    user_similarity_df = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

    # find N n_neighbors for each user
    neighbors_dict = {}
    for i in range(user_similarity_df.shape[0]):
        row = user_similarity_df.iloc[i]
        user = row.index[i]
        row = row[row.index != user]
        neighbors = list(np.argsort(row)[::-1][:n_neighbors])
        neighbors_dict[user] = neighbors
    
    return rating_matrix, users_avrg, neighbors_dict

In [4]:
rating_matrix, users_avrg, neighbors_dict = get_close_users(train_ratings, n_neighbors=10)

# identify all movie IDs that need to be added to the rating matrix, and fill them with None
new_movie_ids = [movie_id for movie_id in df_ratings['MovieID'].unique() if movie_id not in rating_matrix.columns]
new_columns = pd.DataFrame(None, index=rating_matrix.index, columns=new_movie_ids)
rating_matrix = pd.concat([rating_matrix, new_columns], axis=1)

print("Users neighbors:")
print(list(neighbors_dict.items())[:5])
print("Users average:")
print(list(users_avrg.items())[:5])
print("Rating matrix:")
rating_matrix.head(3)

Users neighbors:
[(1, [1479, 1281, 1357, 1474, 539, 1856, 2175, 4716, 1848, 679]), (2, [3106, 93, 2812, 4599, 2301, 298, 3993, 3359, 4784, 556]), (3, [2998, 477, 3498, 1902, 4318, 2260, 2433, 309, 4058, 801]), (4, [4141, 1573, 560, 85, 3664, 2345, 3459, 1347, 1576, 4000]), (5, [1482, 4605, 223, 2916, 1405, 3536, 279, 3052, 750, 1253])]
Users average:
[(1, 4.188679245283019), (2, 3.7131782945736433), (3, 3.9019607843137254), (4, 4.190476190476191), (5, 3.1464646464646466)]
Rating matrix:


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,127,3382,1843,286,3530,2198,2703,2845,3607,2909
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


### Part 2: Use User-User filtering (for choosing top % movies) + XGBoost for ranking

In [5]:
# prepare complete dataset for XGBoost
df_all = dl_data_pipeline(df_movies, df_users, df_ratings)
train_data, test_data = train_test_split(df_all, test_size=0.2, shuffle=False)

with open('../../artifacts/bst_ndcg_model.pkl', 'rb') as file:
    bst_pairwise = pickle.load(file)

In [6]:
map_scores = []
ndcg_scores = []

test_user_ids = test_data['UserID'].unique()
test_movie_ids = test_data['UserID'].unique()

for user_id in tqdm.tqdm(test_user_ids):
    user_dataset =  test_data[test_data['UserID'] == user_id]

    if user_id in neighbors_dict:
        neighbors_ids = neighbors_dict[user_id]
        user_avrg = users_avrg[user_id]
    else:
        # if users is not in train: all users are neighbors
        neighbors_ids = list(neighbors_dict.keys())
        # if users is not in train: fillnan with dataset average
        user_avrg = np.mean(list(users_avrg.values()))
    
    # FIRST STAGE
    movie_ids = user_dataset['MovieID'].to_list()
    first_stage_rating = rating_matrix.loc[neighbors_ids, movie_ids].mean()
    first_stage_rating = first_stage_rating.fillna(user_avrg).values.flatten()

    # INTERMIDIATE STAGE
    # choose 50% of movies, filter inputs to second stage
    sorted_indices = np.argsort(first_stage_rating)
    top_indices = sorted_indices[int(len(sorted_indices) * 0.5):]
    features = user_dataset.drop(['UserID', 'Rating'], axis=1).iloc[top_indices]
    labels = user_dataset['Rating'].iloc[top_indices]

    # SECOND STAGE
    dtest_group = xgb.DMatrix(data=features, label=labels)
    preds = bst_pairwise.predict(dtest_group)

    # METRICS
    binary_actual = (labels >= 4).astype(int)
    map_score = label_ranking_average_precision_score([binary_actual], [preds.argsort()[::-1]])
    map_scores.append(map_score)

    if len(preds) > 1:
        ndcg_score_val = ndcg_score([binary_actual], [preds], k=len(labels))
        ndcg_scores.append(ndcg_score_val)

average_map = np.mean(map_scores)
average_ndcg = np.mean(ndcg_scores)

print(f"Mean Average Precision (MAP): {average_map}")
print(f"Normalized Discounted Cumulative Gain (NDCG): {average_ndcg}")

  0%|          | 0/1783 [00:00<?, ?it/s]

100%|██████████| 1783/1783 [00:34<00:00, 51.62it/s]

Mean Average Precision (MAP): 0.7124543358900729
Normalized Discounted Cumulative Gain (NDCG): 0.9147078190581672





### Creating a Two-Stage Class 

In [7]:
class TwoStagePipeline:
    def __init__(self):
        self.neighbors_dict = {}
        self.users_avrg = {}
        self.second_stage_model = None
        self.rating_matrix = None

    def init_first_stage(self, train_ratings, n_neighbors=30):
        self.users_avrg = train_ratings.groupby('UserID')['Rating'].mean().to_dict()
        self.rating_matrix = train_ratings.pivot_table(index='UserID', columns='MovieID', values='Rating')

        user_similarity = cosine_similarity(self.rating_matrix.fillna(0))
        user_similarity_df = pd.DataFrame(user_similarity, index=self.rating_matrix.index, columns=self.rating_matrix.index)

        # find N neighbors for each user
        for i in range(user_similarity_df.shape[0]):
            row = user_similarity_df.iloc[i]
            user = row.index[i]
            row = row[row.index != user]
            neighbors = np.array((np.argsort(row)[::-1][:n_neighbors]))+1
            self.neighbors_dict[user] = list(neighbors)

    def update_first_stage(self, movie_ids):
        new_movie_ids = [movie_id for movie_id in movie_ids if movie_id not in self.rating_matrix.columns]
        new_columns = pd.DataFrame(None, index=self.rating_matrix.index, columns=new_movie_ids)
        self.rating_matrix = pd.concat([self.rating_matrix, new_columns], axis=1)

    def init_second_stage(self, model_path):
        with open(model_path, 'rb') as file:
            self.second_stage_model = pickle.load(file)

    def one_user_rank(self, user_id, user_dataset, filter_proportion):
        movie_ids = user_dataset['MovieID'].to_list()

        # 1 use first stage
        local_user_ids = self.neighbors_dict.get(user_id, list(self.neighbors_dict.keys()))
        first_stage_rating = self.rating_matrix.loc[local_user_ids, movie_ids].mean()
        first_stage_rating = first_stage_rating.fillna(np.mean(list(self.users_avrg.values()))).values.flatten()

        # 2 selecting top 50%
        sorted_indices = np.argsort(first_stage_rating)
        top_indices = sorted_indices[int(len(sorted_indices) * filter_proportion):]
        features = user_dataset.drop(['UserID', 'Rating'], axis=1).iloc[top_indices]
        labels = user_dataset['Rating'].iloc[top_indices]
        
        # 3 use second stage
        dtest_group = xgb.DMatrix(data=features, label=labels)
        preds = self.second_stage_model.predict(dtest_group)

        return labels, preds

    def calc_metrics(self, labels, preds, binary_margin):
        binary_actual = (labels >= binary_margin).astype(int)
        map_score = label_ranking_average_precision_score([binary_actual], [preds.argsort()[::-1]])
        ndcg_score_ = ndcg_score([binary_actual], [preds], k=len(labels)) if len(preds) > 1 else None
        return map_score, ndcg_score_
    
    def run_evaluation(self, test_data, filter_proportion=0.5, binary_margin=4):
        map_scores, ndcg_scores = [], []
        for user_id in tqdm.tqdm(test_data['UserID'].unique()):

            # choose one user ratings
            user_dataset = test_data[test_data['UserID'] == user_id]
            
            # go through pipelune
            labels, preds = self.one_user_rank(user_id, user_dataset, filter_proportion)
            map_score, ndcg_score_ = self.calc_metrics(labels, preds, binary_margin)

            # update scores
            map_scores.append(map_score)
            if ndcg_score_ != None:
                ndcg_scores.append(ndcg_score_)
        
        self.average_map = np.mean(map_scores)
        self.average_ndcg = np.mean(ndcg_scores)
        print(f"Mean Average Precision (MAP): {self.average_map}")
        print(f"Normalized Discounted Cumulative Gain (NDCG): {self.average_ndcg}")

In [8]:
# prepare data
train_ratings, _ = train_test_split(df_ratings, test_size=0.2, shuffle=False)
train_data, test_data = train_test_split(df_all, test_size=0.2, shuffle=False)
movie_ids = df_ratings['MovieID'].unique()

# init pipeline
pipeline = TwoStagePipeline()

# configure first and second stage and update for all possible movies IDs
pipeline.init_first_stage(train_ratings, n_neighbors=30)
pipeline.update_first_stage(movie_ids)
print(f"Shape of rating_matrix (Users*Movies): {pipeline.rating_matrix.shape}")

# configure second stage
pipeline.init_second_stage("../../artifacts/bst_ndcg_model.pkl")
print(f"Class of second stage model: {type(pipeline.second_stage_model)}")

Shape of rating_matrix (Users*Movies): (4795, 3706)
Class of second stage model: <class 'xgboost.core.Booster'>


In [9]:
# run evaluation (with n_neighbors=30)
pipeline.run_evaluation(test_data, filter_proportion=0.5, binary_margin=4)

100%|██████████| 1783/1783 [00:37<00:00, 47.90it/s]

Mean Average Precision (MAP): 0.7397744369401572
Normalized Discounted Cumulative Gain (NDCG): 0.919433240979977





In [10]:
# run evaluation (with n_neighbors=50)
pipeline.init_first_stage(train_ratings, n_neighbors=50)
pipeline.update_first_stage(movie_ids)
pipeline.run_evaluation(test_data, filter_proportion=0.5, binary_margin=4)

100%|██████████| 1783/1783 [00:35<00:00, 50.20it/s]

Mean Average Precision (MAP): 0.7424674780520268
Normalized Discounted Cumulative Gain (NDCG): 0.9197015400708247





More n_neighbors gives better ranking results (with such manner we avoid None data), however increase models complexity.