In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import ndcg_score
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
ratings = pd.read_csv('../data/interim/u.data', index_col=0)

In [3]:
train, test = train_test_split(ratings.values, test_size=0.2, random_state=0, stratify=ratings.user)
train_df = pd.DataFrame(train, columns=ratings.columns)
test_df = pd.DataFrame(test, columns=ratings.columns)

In [4]:
train_df

Unnamed: 0,user,item,rating,timestamp
0,308,186,4,887738152
1,773,790,3,888539825
2,279,1500,5,875306613
3,805,147,5,881694286
4,543,529,4,874866208
...,...,...,...,...
79987,863,346,5,889288911
79988,184,56,3,889908657
79989,6,274,4,883602501
79990,711,496,5,879993073


In [5]:
train_matrix = train_df.pivot_table(index='user', columns='item', values='rating').fillna(0)

similarity = cosine_similarity(train_matrix)

In [6]:
def get_similar_users(user_id):
    similar_users =  similarity[user_id - 1]
    similar_users = list(enumerate(similar_users))
    sorted_users = sorted(similar_users, key=lambda x:x[1], reverse=True)[1:6]
    return np.array([i[0] for i in sorted_users])

def recommend(user_id):

    similar_users = get_similar_users(user_id) + 1 

    user_profile = train_matrix.loc[user_id]
    watched_movies = user_profile[user_profile > 0].index

    totals = train_matrix.loc[similar_users].sum(axis=0)
    amounts = train_matrix.loc[similar_users].count(axis=0).astype(float)
    averages = totals / amounts

    averages = averages.drop(watched_movies)

    averages = averages.sort_values(ascending=False)

    return averages.head(10).index.values
    

In [7]:
recommendations = {}

for row in test_df.itertuples():
    recommendations[row.user] = recommend(row.user)

In [8]:
def precision_at_k(actual, predicted, k=10):

    act_set = set(actual)
    pred_set = set(predicted[:k])
    result = len(act_set & pred_set) / min(k, len(pred_set))
    return result

def recall_at_k(actual, predicted, k=10):

    act_set = set(actual)
    pred_set = set(predicted[:k])
    result = len(act_set & pred_set) / len(act_set)
    return result


def ndcg_at_k(actual, predicted, k=10):
    dcg = ndcg_score([actual[:k]], [predicted[:k]])
    idcg = ndcg_score([actual], [actual])
    result = dcg / idcg
    return result


def evaluate(test, recommendations):

    metrics = []

    for row in test.itertuples():

        user_id = row.user
        actual = test.query(f'user == {user_id}').item.values
        predicted = recommendations[user_id]
        if actual.shape[0] >= 5: 
            p5 = precision_at_k(actual, predicted, k=5)
            r5 = recall_at_k(actual, predicted, k=5)
            ndcg5 = ndcg_at_k(actual, predicted, k=5)
    
            metrics.append([p5, r5, ndcg5])

    metrics = pd.DataFrame(metrics, columns=['P@5', 'R@5', 'NDCG@5'])

    return metrics.mean()

result_metrics = evaluate(test_df, recommendations)

In [9]:
result_metrics

P@5       0.456083
R@5       0.072924
NDCG@5    0.819691
dtype: float64