In [24]:
from surprise.prediction_algorithms.matrix_factorization import SVDpp
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import GridSearch
import pandas as pd
from surprise import Reader
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from multiprocessing import Pool
from sklearn.externals import joblib
import pickle
import matplotlib.pyplot as plt
import numpy as np

In [183]:
train_file = "../datasets/movielens/ml-100k/ua.base"
test_file = "../datasets/movielens/ml-100k/ua.test"

In [184]:
col_names = [ "user", "movie", "rating", "timestamp" ]

In [185]:
train = pd.read_csv(train_file, sep="\t", header=None, names=col_names)
train['rating'] = 1

In [186]:
test = pd.read_csv(test_file, sep="\t", header=None, names=col_names)

# Add negatives to train

In [187]:
items = set(train['movie'].unique())

In [188]:
users =set(train['user'].unique())

In [189]:
users_items = {}

In [190]:
for user in users:
    users_items[user] = train[(train['user'] == user)]['movie'].unique()

In [191]:
neg_users_items = {}

In [192]:
for user in users:
    neg_num = len(users_items[user]) * 0.4 # 40% Of seen movies by the user
    user_items = set(users_items[user])
    neg_user_items = items.difference(user_items)
    neg_users_items[user] = [ item for item, index  in zip(neg_user_items, range(len(neg_user_items))) if index < neg_num ]
        

In [193]:
df_neg = pd.DataFrame(np.array([ [user, movie, 0] for user in users for movie in neg_users_items[user]]), columns=["user", 'movie', 'rating'])

In [194]:
df_full = pd.concat([train, df_neg])

In [195]:
df_full.head(2)

Unnamed: 0,movie,rating,timestamp,user
0,1,1,874965758.0,1
1,2,1,876893171.0,1


In [196]:
df_full[df_full['rating'] == 0].head(2)

Unnamed: 0,movie,rating,timestamp,user
0,20,0,,1
1,33,0,,1


# Build model


In [197]:
reader = Reader(rating_scale=(0, 1))

In [198]:
train_dataset = Dataset.load_from_df(df_full[["user", "movie", "rating"]], reader).build_full_trainset()

In [199]:
svdpp = SVDpp()

In [200]:
svdpp.fit(train_dataset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f54c2ef7128>

# Test model

### Prepare test dataframe with unwatched movies for each user

In [259]:
users_non_watched_list = {}

In [260]:
full_interactions_df = pd.concat([train, test])

In [267]:
all_users = set(full_interactions_df['user'].unique())

In [268]:
all_items = set(full_interactions_df['movie'].unique())

In [269]:
all_user_items = {}

In [270]:
for user in all_users:
    df = full_interactions_df
    all_user_items[user] = df[(df['user'] == user)]['movie'].unique()

In [271]:
all_neg_users_items = {}

In [272]:
for user in all_users:
    user_items = set(all_user_items[user])
    neg_user_items = all_items.difference(user_items)
    all_neg_users_items[user] = [ item for item, index  in zip(neg_user_items, range(len(neg_user_items))) ]


In [274]:
all_neg_users_items_df = [ [user, item, 0] for user, items in all_neg_users_items.items() for item in items ]

In [275]:
all_neg_users_items_df = pd.DataFrame(np.array(all_neg_users_items_df), columns=["user", "movie", "rating"])

### Concat test and all negative dataframe

In [276]:
full_test_set = pd.concat([ test[["user", "movie", "rating"]], all_neg_users_items_df])

### Predict for the negative dataset plus the test set

In [277]:
test_dataset = Dataset.load_from_df(full_test_set, Reader(rating_scale=(0, 5))).build_full_trainset()

In [278]:
predictions = svdpp.test(test_dataset.build_testset())

In [279]:
predictions[:5]

[Prediction(uid=1, iid=20, r_ui=4.0, est=0, details={'was_impossible': False}),
 Prediction(uid=1, iid=33, r_ui=4.0, est=0.48956068152687826, details={'was_impossible': False}),
 Prediction(uid=1, iid=61, r_ui=4.0, est=0.12695603061208427, details={'was_impossible': False}),
 Prediction(uid=1, iid=117, r_ui=3.0, est=0.7691083991951604, details={'was_impossible': False}),
 Prediction(uid=1, iid=155, r_ui=2.0, est=0.42592206992766424, details={'was_impossible': False})]

In [280]:
def get_user_top_k(user, predictions, n):
    user_preds = [pred for pred in predictions if pred.uid == user]
    return sorted(user_preds, key=lambda p: p.est, reverse=True)[:n]

In [297]:
def recall_prec_user_at_k(user, preds, n, est_thr=0.8, rating_thr=1):
    top_preds = get_user_top_k(user, preds, n)
    relevant_recommended = sum([ 1 for pred in top_preds if pred.est >= est_thr and pred.r_ui > 3 ])
    relevant_items = sum([ 1 for pred in preds if pred.r_ui >= rating_thr ])
    return {
        'prec': relevant_recommended / n,
        'recall': relevant_recommended / relevant_items,
        'relevant': relevant_items
    }

In [298]:
test_users = test['user'].unique()

In [299]:
users_predictions = dict([ (user, []) for user in test_users ])

In [300]:
for pred in predictions:
    users_predictions[pred.uid].append(pred)

In [301]:
metrics_k_5 = [ recall_prec_user_at_k(user, preds, 5) for user, preds in users_predictions.items() ]

In [302]:
metrics_k_10 = [ recall_prec_user_at_k(user, predictions, 10) for user, preds in users_predictions.items() ]

In [303]:
sum(list(map( lambda m: m['prec'], metrics_k_5 ))) / len(metrics_k_5)

0.11092258748674469

In [304]:
sum(list(map( lambda m: m['recall'], metrics_k_5 ))) / len(metrics_k_5)

0.055461293743372345

In [305]:
sum(list(map( lambda m: m['prec'], metrics_k_5 ))) / len(metrics_k_10)

0.11092258748674469

In [306]:
sum(list(map( lambda m: m['recall'], metrics_k_5 ))) / len(metrics_k_10)

0.055461293743372345