In [2]:
import numpy as np
import pandas as pd

from src.utils import read_pickles
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import precision_score, recall_score


In [3]:
df_movies, df_users, df_ratings = read_pickles("../../data/ml-1m-after_eda/")

In [4]:
def train_test_split(df, split_date):
    train = df[df["Date"] < split_date][["UserID", "MovieID", "Rating"]]
    test = df[df["Date"] >= split_date][["UserID", "MovieID", "Rating"]]
    return train, test

split_date = pd.to_datetime("2000-12-02").date()
train, test = train_test_split(df_ratings, split_date)
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (797116, 3)
Test shape: (203093, 3)


# User-User

In [5]:
test = test[test["UserID"].isin(train["UserID"])]
test = test[test["MovieID"].isin(train["MovieID"])]
print(f"Test shape after deleting 'cold-start' users: {test.shape}")

Test shape after deleting 'cold-start' users: (105999, 3)


In [6]:
rating_matrix = train.pivot_table(index='UserID', columns='MovieID', \
                                  values='Rating', fill_value=0)
rating_matrix.head(3)

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
647,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
user_similarity = cosine_similarity(rating_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=rating_matrix.index,
                                  columns=rating_matrix.index)

In [8]:
top_n = 30
neighbors_dict = {}

for i in range(user_similarity_df.shape[0]):
    row = user_similarity_df.iloc[i]
    user = row.index[i]
    row = row[row.index != user]
    neighbors = list(np.argsort(row)[::-1][:top_n])
    neighbors_dict[user] = neighbors

In [9]:
X_test = test[["UserID", "MovieID"]]
y_test = test["Rating"]

predict = []
for _, row in X_test.iterrows():
    user = row["UserID"]
    movie = row["MovieID"]
    user_neighbors = neighbors_dict[user]
    filtered_df = rating_matrix[rating_matrix.index.isin(neighbors)]
    filtered_df = filtered_df.loc[:, movie]
    pred = filtered_df[filtered_df != 0].mean()
    predict.append(pred)

In [10]:
predict_scores = np.array(predict)
true_scores = np.array(y_test)

In [11]:

def apply_ml_metrics(true_scores, predict_scores, threshold=4):
    indexes =~np.isnan(predict_scores)
    true_scores = true_scores[indexes]
    predict_scores = predict_scores[indexes]

    true_scores_cat = (true_scores >= threshold).astype(int)
    predict_scores_cat = (predict_scores >= threshold).astype(int)

    mae = mean_absolute_error(true_scores, predict_scores)
    rmse = np.sqrt(mean_squared_error(true_scores, predict_scores))
    precision = precision_score(true_scores_cat, predict_scores_cat)
    recall = recall_score(true_scores_cat, predict_scores_cat)

    return {"mae": round(mae, 3),
            "rmse": round(rmse, 3),
            "precision": round(precision, 3),
            "recall": round(recall, 3)}

In [12]:
true_scores[~np.isnan(predict_scores)].shape

(85295,)

In [13]:
apply_ml_metrics(true_scores, predict_scores)

{'mae': 0.88, 'rmse': 1.151, 'precision': 0.648, 'recall': 0.663}