In [1]:
from surprise.prediction_algorithms.matrix_factorization import SVDpp
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import GridSearch
import pandas as pd
from surprise import Reader
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from multiprocessing import Pool
from sklearn.externals import joblib
import pickle
import matplotlib.pyplot as plt
import numpy as np

In [2]:
train_file = "../datasets/movielens/ml-100k/ua.base"
test_file = "../datasets/movielens/ml-100k/ua.test"

In [3]:
col_names = [ "user", "movie", "rating", "timestamp" ]

In [4]:
train = pd.read_csv(train_file, sep="\t", header=None, names=col_names)
train['rating'] = 1

In [5]:
test = pd.read_csv(test_file, sep="\t", header=None, names=col_names)


# Add negatives to train

In [6]:
items = set(train['movie'].unique())

In [7]:
users =set(train['user'].unique())

In [8]:
users_items = {}

In [9]:
for user in users:
    users_items[user] = train[(train['user'] == user)]['movie'].unique()

In [10]:
neg_users_items = {}

In [11]:
for user in users:
    neg_num = len(users_items[user])# 1 neg for every movie seen
    user_items = set(users_items[user])
    neg_user_items = items.difference(user_items)
    neg_users_items[user] = [ item for item, index  in zip(neg_user_items, range(len(neg_user_items))) if index < neg_num ]
        

In [12]:
df_neg = pd.DataFrame(np.array([ [user, movie, 0] for user in users for movie in neg_users_items[user]]), columns=["user", 'movie', 'rating'])

In [13]:
df_full = pd.concat([train, df_neg])

In [14]:
df_full.head(2)

Unnamed: 0,movie,rating,timestamp,user
0,1,1,874965758.0,1
1,2,1,876893171.0,1


In [15]:
df_full[df_full['rating'] == 0].head(2)

Unnamed: 0,movie,rating,timestamp,user
0,20,0,,1
1,33,0,,1


# Test Preparation

### Prepare test dataframe with unwatched movies for each user

In [16]:
users_non_watched_list = {}

In [17]:
full_interactions_df = pd.concat([train, test])

In [18]:
all_users = set(full_interactions_df['user'].unique())

In [19]:
all_items = set(full_interactions_df['movie'].unique())

In [20]:
all_user_items = {}

In [21]:
for user in all_users:
    df = full_interactions_df
    all_user_items[user] = df[(df['user'] == user)]['movie'].unique()

In [22]:
all_neg_users_items = {}

In [23]:
for user in all_users:
    user_items = set(all_user_items[user])
    neg_user_items = all_items.difference(user_items)
    all_neg_users_items[user] = [ item for item, index  in zip(neg_user_items, range(len(neg_user_items))) ]


In [24]:
all_neg_users_items_df = [ [user, item, 0] for user, items in all_neg_users_items.items() for item in items ]

In [25]:
all_neg_users_items_df = pd.DataFrame(np.array(all_neg_users_items_df), columns=["user", "movie", "rating"])

### Concat test and all negative dataframe

In [26]:
full_test_set = pd.concat([ test[["user", "movie", "rating"]], all_neg_users_items_df])


# Build models for hyperparams

In [30]:
factors = [40, 60, 80]
epochs = [ 30, 40]
lr_all = [0.5, 0.05]

In [31]:
reader = Reader(rating_scale=(0, 1))

In [32]:
train_dataset = Dataset.load_from_df(df_full[["user", "movie", "rating"]], reader).build_full_trainset()

In [35]:
factor_models = [ SVDpp(n_factors=n).fit(train_dataset) for n in factors ]

In [None]:
lr_models = [ SVDpp(lr_all=n).fit(train_dataset) for n in lr_all ]

### Predict for the negative dataset plus the test set

In [39]:
test_dataset = Dataset.load_from_df(full_test_set, Reader(rating_scale=(0, 5))).build_full_trainset()

In [40]:
test_built = test_dataset.build_testset()

In [32]:
predictions = svdpp.test(test_dataset.build_testset())

In [60]:
sum([ 1 for pred in predictions if pred.uid == 1])

1420

In [37]:
def get_user_top_k(user, predictions, n):
    user_preds = [pred for pred in predictions if pred.uid == user]
    return sorted(user_preds, key=lambda p: p.est, reverse=True)[:n]

In [35]:
get_user_top_k(1, predictions, 20)

[Prediction(uid=1, iid=202, r_ui=5.0, est=0.6648243571671495, details={'was_impossible': False}),
 Prediction(uid=1, iid=117, r_ui=3.0, est=0.653949742281041, details={'was_impossible': False}),
 Prediction(uid=1, iid=265, r_ui=4.0, est=0.5096672609515405, details={'was_impossible': False}),
 Prediction(uid=1, iid=318, r_ui=0.0, est=0.45854859301776063, details={'was_impossible': False}),
 Prediction(uid=1, iid=273, r_ui=0.0, est=0.43992448623791525, details={'was_impossible': False}),
 Prediction(uid=1, iid=367, r_ui=0.0, est=0.43672348905696623, details={'was_impossible': False}),
 Prediction(uid=1, iid=423, r_ui=0.0, est=0.4365136034032773, details={'was_impossible': False}),
 Prediction(uid=1, iid=739, r_ui=0.0, est=0.4226789151022645, details={'was_impossible': False}),
 Prediction(uid=1, iid=385, r_ui=0.0, est=0.42195475370497193, details={'was_impossible': False}),
 Prediction(uid=1, iid=944, r_ui=0.0, est=0.42123584009291837, details={'was_impossible': False}),
 Prediction(uid=

In [38]:
def recall_prec_user_at_k(user, preds, n, est_thr=0, rating_thr=0):
    top_preds = get_user_top_k(user, preds, n)
    relevant_recommended = sum([ 1 for pred in top_preds if pred.r_ui > 0])
    return {
        'prec': relevant_recommended / n,
        'recall': relevant_recommended / n,
    }

In [51]:
test_users = test['user'].unique()

In [52]:
users_predictions = dict([ (user, []) for user in test_users ])

In [53]:
for pred in predictions:
    users_predictions[pred.uid].append(pred)

In [54]:
metrics_k_5 = [ recall_prec_user_at_k(user, preds, 5) for user, preds in users_predictions.items() ]

In [55]:
metrics_k_10 = [ recall_prec_user_at_k(user, preds, 10) for user, preds in users_predictions.items() ]

In [61]:
sum(list(map( lambda m: m['prec'], metrics_k_5 )))

73.80000000000018

In [62]:
sum(list(map( lambda m: m['recall'], metrics_k_5 )))

73.80000000000018

In [63]:
sum(list(map( lambda m: m['prec'], metrics_k_10 )))

39.700000000000124

In [64]:
sum(list(map( lambda m: m['recall'], metrics_k_10 )))

39.700000000000124

In [41]:
for model in factor_models:
    users_predictions = dict([ (user, []) for user in test_users ])
    for pred in model.predict(test_built):
        users_predictions[pred.uid].append(pred)
        [ recall_prec_user_at_k(user, preds, 10) for user, preds in users_predictions.items() ]

NameError: name 'test_users' is not defined