In [87]:
import numpy as np
import pandas as pd

from src.utils import read_pickles, dl_data_pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import label_ranking_average_precision_score, ndcg_score
from sklearn.model_selection import train_test_split

import pickle
import tqdm
import xgboost as xgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Part 1: Train User-User filtering 

In [2]:
# read data
df_movies, df_users, df_ratings = read_pickles("../../data/ml-1m-after_eda/")
train_ratings, _ = train_test_split(df_ratings, test_size=0.2, shuffle=False)
print(f"Train shape: {train_ratings.shape}")

Train shape: (800167, 6)


In [81]:
# training of user-user filtering

def get_close_users(train_ratings, n_neighbors=10):
    # compute average ratings per user
    users_avrg = train_ratings.groupby('UserID')['Rating'].mean().to_dict()
    
    # create rating matrix
    rating_matrix = train_ratings.pivot_table(index='UserID', columns='MovieID', values='Rating')

    # calculate similarity 
    user_similarity = cosine_similarity(rating_matrix.fillna(0))
    user_similarity_df = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

    # find N n_neighbors for each user
    neighbors_dict = {}
    for i in range(user_similarity_df.shape[0]):
        row = user_similarity_df.iloc[i]
        user = row.index[i]
        row = row[row.index != user]
        neighbors = list(np.argsort(row)[::-1][:n_neighbors])
        neighbors_dict[user] = neighbors
    
    return rating_matrix, users_avrg, neighbors_dict

In [83]:
rating_matrix, users_avrg, neighbors_dict = get_close_users(train_ratings, n_neighbors=10)

# identify all movie IDs that need to be added to the rating matrix, and fill them with None
new_movie_ids = [movie_id for movie_id in df_ratings['MovieID'].unique() if movie_id not in rating_matrix.columns]
new_columns = pd.DataFrame(None, index=rating_matrix.index, columns=new_movie_ids)
rating_matrix = pd.concat([rating_matrix, new_columns], axis=1)

print("Users neighbors:")
print(neighbors_dict)
print("Users average:")
print(users_avrg)
print("Rating matrix:")
rating_matrix.head(3)

Users neighbors:
{1: [1479, 1281, 1357, 1474, 539, 1856, 2175, 4716, 1848, 679], 2: [3106, 93, 2812, 4599, 2301, 298, 3993, 3359, 4784, 556], 3: [2998, 477, 3498, 1902, 4318, 2260, 2433, 309, 4058, 801], 4: [4141, 1573, 560, 85, 3664, 2345, 3459, 1347, 1576, 4000], 5: [1482, 4605, 223, 2916, 1405, 3536, 279, 3052, 750, 1253], 6: [1090, 35, 3023, 1758, 897, 2871, 834, 3361, 4783, 795], 7: [1847, 681, 3966, 817, 4567, 3551, 1760, 1466, 1638, 2260], 8: [366, 4053, 2690, 2259, 2935, 3448, 4450, 1201, 1661, 185], 9: [1771, 74, 772, 1831, 3448, 895, 720, 2935, 1839, 2968], 10: [1118, 2527, 1420, 3399, 4046, 4385, 262, 46, 3270, 3027], 11: [3278, 3555, 3063, 2206, 3460, 3969, 3055, 384, 674, 3511], 12: [3447, 2343, 118, 4755, 295, 343, 825, 3179, 3286, 3367], 13: [3553, 3710, 1919, 4318, 2917, 3412, 1687, 3949, 851, 3994], 14: [596, 3635, 1767, 2312, 942, 1807, 2041, 898, 4230, 37], 15: [4215, 744, 3623, 2557, 772, 1084, 3439, 3123, 347, 816], 16: [4242, 4264, 2631, 777, 2506, 1677, 2947, 869

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,127,3382,1843,286,3530,2198,2703,2845,3607,2909
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


### Part 2: Use User-User filtering (for choosing top % movies) + XGBoost for ranking

In [86]:
# prepare complete dataset for XGBoost
df_all = dl_data_pipeline(df_movies, df_users, df_ratings)
train_data, test_data = train_test_split(df_all, test_size=0.2, shuffle=False)

with open('../../artifacts/bst_ndcg_model.pkl', 'rb') as file:
    bst_pairwise = pickle.load(file)

In [89]:
map_scores = []
ndcg_scores = []

test_user_ids = test_data['UserID'].unique()
test_movie_ids = test_data['UserID'].unique()

for user_id in tqdm.tqdm(test_user_ids):
    user_dataset =  test_data[test_data['UserID'] == user_id]

    if user_id in neighbors_dict:
        neighbors_ids = neighbors_dict[user_id]
        user_avrg = users_avrg[user_id]
    else:
        # if users is not in train: all users are neighbors
        neighbors_ids = list(neighbors_dict.keys())
        # if users is not in train: fillnan with dataset average
        user_avrg = np.mean(list(users_avrg.values()))
    
    # FIRST STAGE
    movie_ids = user_dataset['MovieID'].to_list()
    first_stage_rating = rating_matrix.loc[neighbors_ids, movie_ids].mean()
    first_stage_rating = first_stage_rating.fillna(user_avrg).values.flatten()

    # INTERMIDIATE STAGE
    # choose 50% of movies, filter inputs to second stage
    sorted_indices = np.argsort(first_stage_rating)
    top_indices = sorted_indices[int(len(sorted_indices) * 0.5):]
    features = user_dataset.drop(['UserID', 'Rating'], axis=1).iloc[top_indices]
    labels = user_dataset['Rating'].iloc[top_indices]

    # SECOND STAGE
    dtest_group = xgb.DMatrix(data=features, label=labels)
    preds = bst_pairwise.predict(dtest_group)

    # METRICS
    binary_actual = (labels >= 4).astype(int)
    map_score = label_ranking_average_precision_score([binary_actual], [preds.argsort()[::-1]])
    map_scores.append(map_score)

    if len(preds) > 1:
        ndcg_score_val = ndcg_score([binary_actual], [preds], k=len(labels))
        ndcg_scores.append(ndcg_score_val)

average_map = np.mean(map_scores)
average_ndcg = np.mean(ndcg_scores)

print(f"Mean Average Precision (MAP): {average_map}")
print(f"Normalized Discounted Cumulative Gain (NDCG): {average_ndcg}")

100%|██████████| 1783/1783 [00:13<00:00, 136.52it/s]

Mean Average Precision (MAP): 0.6704120059662365
Normalized Discounted Cumulative Gain (NDCG): 0.9142448076944919



