In [1]:
import pickle
import pandas as pd
from surprise.model_selection import cross_validate
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import SVDpp
import pickle

In [2]:
# Pickled training interactions; unpickled further down and filtered on its "labels" column.
train_file = "../datasets/AOW-private/TRAIN_AOW_100negative"
# Tab-separated test interactions, read further down as (id_user, id_item, labels).
test_file = "../datasets/AOW-private/test.txt"

In [8]:
# Location the fitted model is pickled to and later reloaded from.
saved_model_file = "models/svdpp_AOW_data_cleaned.pkl"

In [9]:
# Unpickle the training frame. The context manager closes the file handle as
# soon as loading finishes instead of leaving it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load files from a trusted source.
with open(train_file, "rb") as train_fh:
    df_train_neg = pickle.load(train_fh)

In [10]:
# Keep only the positive interactions (labels == 1) as training data.
train_is_positive = df_train_neg["labels"] == 1
df_train = df_train_neg[train_is_positive]

In [11]:
# Preview the first rows of the positives-only training frame.
df_train.head(5)

Unnamed: 0,id_user,id_item,labels
0,1,241,1
1,1,1066,1
2,1,954,1
3,1,161,1
4,1,1503,1


In [12]:
# Read the header-less, tab-separated test file and name its three columns.
df_test = pd.read_csv(
    test_file,
    sep="\t",
    header=None,
    names=["id_user", "id_item", "labels"],
)

In [13]:
# Restrict the test frame to positive interactions (labels == 1).
test_is_positive = df_test["labels"] == 1
df_test = df_test[test_is_positive]

In [14]:
# Preview the first two rows of the filtered test frame.
df_test.head(2)

Unnamed: 0,id_user,id_item,labels
0,1,375,1
1,1,412,1


In [15]:
# Sanity check: select any rows whose label is not 1 and show how many
# elements that selection holds; 0 means the filter above left none behind.
non_positive_rows = df_test[df_test['labels'] != 1]
non_positive_rows.size

0

## Building model

In [8]:
# Rating scale handed to Surprise. Lower and upper bound are both 1, matching
# the training frame above, which keeps only rows with labels == 1.
reader = Reader(rating_scale=(1, 1))

In [9]:
# load_from_df expects exactly three columns in the order (user, item, rating).
# Select them explicitly, as the other load_from_df calls in this notebook do,
# so an extra or reordered column in the unpickled frame cannot be misread.
data = Dataset.load_from_df(df_train[["id_user", "id_item", "labels"]], reader)

In [10]:
# SVD++ model constructed with no arguments, i.e. the library's defaults.
# NOTE(review): no seed is passed, so repeated fits may not be reproducible — confirm whether that is intended.
svdpp = SVDpp()

In [12]:
# Train on every interaction in `data` (no hold-out split). The object
# returned by fit() is what gets pickled afterwards.
full_trainset = data.build_full_trainset()
svdpp_fitted = svdpp.fit(full_trainset)

### Careful: running the next cell overwrites the saved model

In [14]:
# Persist the fitted model, replacing any existing file at saved_model_file.
# The context manager guarantees the file is flushed and closed, so the pickle
# on disk is complete before a later cell reads it back.
with open(saved_model_file, "wb") as model_fh:
    pickle.dump(svdpp_fitted, model_fh)

### Testing

In [4]:
# Reload the pickled model; the context manager closes the file handle once
# unpickling is done instead of leaking it.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files from a trusted source.
with open(saved_model_file, 'rb') as model_fh:
    saved_model = pickle.load(model_fh)

In [5]:
# Display the reloaded object to confirm that unpickling produced a model.
saved_model

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f89e459b048>

### Prepare test dataframe with unwatched movies for each user

In [6]:
# NOTE(review): never referenced again in the visible notebook — possibly a leftover; confirm before removing.
users_non_watched_list = {}

In [16]:
# Stack the positive train and test interactions into one frame.
full_interactions_df = pd.concat([df_train, df_test])

In [17]:
# Distinct user ids across train and test.
# NOTE(review): not referenced again in the visible notebook.
all_users = set(full_interactions_df['id_user'].unique())

In [18]:
# Distinct item ids across train and test; the universe from which each user's
# unseen items are derived below.
all_items = set(full_interactions_df['id_item'].unique())

In [19]:
# user id -> set of item ids that user interacted with (filled by the loop below).
all_user_items = {}

In [29]:
# Record, for every user, the set of items they interacted with (train + test).
items_by_user = full_interactions_df.groupby('id_user')['id_item']
for user_id, item_series in items_by_user:
    all_user_items[user_id] = set(item_series.values)

In [31]:
# user id -> set of item ids the user has NOT interacted with (filled by the loop below).
all_neg_users_items = {}

In [32]:
# A user's unseen items are all known items minus the ones they interacted with.
all_neg_users_items.update(
    (user_id, all_items - seen_items)
    for user_id, seen_items in all_user_items.items()
)

In [None]:
# Flatten the per-user unseen sets into [user, item, 0] rows; the trailing 0
# is the value later placed in the "labels" column.
all_neg_users_items_pairs = [
    [user_id, item_id, 0]
    for user_id, unseen_items in all_neg_users_items.items()
    for item_id in unseen_items
]

In [None]:
# Build the negatives frame straight from the list of [user, item, label] rows.
# pandas accepts the nested list directly, so numpy (which this notebook never
# imports, making np.array(...) a NameError) is not needed.
all_neg_users_items_df = pd.DataFrame(
    all_neg_users_items_pairs,
    columns=["id_user", "id_item", "labels"],
)

### Concatenate the test set with the all-negatives dataframe

In [None]:
# Positives from the test file followed by every unseen (user, item) pair.
# The test frame in this notebook is named `df_test`; the name `test` is never
# defined and would raise NameError.
full_test_set = pd.concat([df_test[["id_user", "id_item", "labels"]], all_neg_users_items_df])

### Predict for the negative dataset plus the test set

In [None]:
# Wrap the positives + negatives frame in a Surprise trainset so a test set can be built from it.
# NOTE(review): the following cell assigns test_dataset again, replacing this value.
test_dataset = Dataset.load_from_df(full_test_set, Reader(rating_scale=(0, 5))).build_full_trainset()

In [None]:
# Positives-only evaluation set, built from `df_test` (the name `test` is
# never defined in this notebook and would raise NameError).
# NOTE(review): this rebinds test_dataset, discarding the positives + negatives
# set built in the preceding cell — run only the variant you intend to evaluate.
test_dataset = Dataset.load_from_df(
    df_test[["id_user", "id_item", "labels"]],
    Reader(rating_scale=(1, 5)),
).build_full_trainset()

In [None]:
# Score with the model reloaded from disk (`saved_model`) rather than the
# in-memory `svdpp`, so this section also runs on a fresh kernel where the
# training cells were skipped.
predictions = saved_model.test(test_dataset.build_testset())

In [None]:
# Peek at the first five prediction records.
predictions[:5]

In [None]:
def get_user_top_k(user, predictions, n):
    """Return the ``n`` highest-scored predictions belonging to ``user``.

    Args:
        user: User id compared against each prediction's ``uid``.
        predictions: Iterable of objects exposing ``uid`` and ``est``.
        n: Number of predictions to keep.

    Returns:
        List of at most ``n`` predictions for ``user``, ordered by ``est``
        from highest to lowest; ties keep their original relative order.
    """
    mine = []
    for candidate in predictions:
        if candidate.uid == user:
            mine.append(candidate)
    # In-place stable sort, highest estimate first.
    mine.sort(key=lambda candidate: candidate.est, reverse=True)
    return mine[:n]

In [None]:
def recall_prec_user_at_k(user, preds, n, est_thr=0.8, rating_thr=0):
    """Compute precision@n and recall@n for a single user.

    An item is *relevant* when its true rating ``r_ui`` is strictly greater
    than ``rating_thr``. It counts as a *hit* when it is relevant, sits among
    the user's ``n`` highest-scored predictions, and its estimate ``est`` is
    at least ``est_thr``.

    Args:
        user: User id whose predictions are evaluated.
        preds: Iterable of prediction objects exposing ``uid``, ``est`` and
            ``r_ui``. Predictions belonging to other users are ignored.
        n: Cut-off rank (the "k" in precision@k / recall@k).
        est_thr: Minimum estimate for a top-n item to count as a hit.
        rating_thr: True ratings strictly above this value are relevant.

    Returns:
        dict with keys ``'prec'`` (hits / n), ``'recall'`` (hits / number of
        relevant items) and ``'relevant'`` (number of relevant items for this
        user). A ratio whose denominator is zero is reported as 0.0.
    """
    # Restrict both counts to this user so a caller passing the full
    # prediction list does not inflate the number of relevant items.
    user_preds = [pred for pred in preds if pred.uid == user]
    top_preds = get_user_top_k(user, user_preds, n)
    relevant_recommended = sum(
        1 for pred in top_preds
        if pred.est >= est_thr and pred.r_ui > rating_thr
    )
    # Use the same strict comparison as the hit count: with the default
    # rating_thr=0, `>=` would count every negative (rating 0) as relevant.
    relevant_items = sum(1 for pred in user_preds if pred.r_ui > rating_thr)
    return {
        # Guard the divisions: n == 0 or a user without relevant items would
        # otherwise raise ZeroDivisionError.
        'prec': relevant_recommended / n if n else 0.0,
        'recall': relevant_recommended / relevant_items if relevant_items else 0.0,
        'relevant': relevant_items
    }

In [None]:
# Distinct users present in the test frame. The frame is named `df_test` in
# this notebook; `test` is never defined and would raise NameError.
test_users = df_test['id_user'].unique()

In [None]:
# Start every test user with an empty list of predictions.
users_predictions = {user_id: [] for user_id in test_users}

In [None]:
# Group predictions by user. setdefault creates the bucket on first sight, so
# a prediction for a user that is not in test_users (e.g. when the evaluation
# set also contains the generated negatives) no longer raises KeyError.
for pred in predictions:
    users_predictions.setdefault(pred.uid, []).append(pred)

In [None]:
# Per-user precision/recall at a cut-off of 5, from each user's own predictions.
metrics_k_5 = [
    recall_prec_user_at_k(user_id, user_preds, 5)
    for user_id, user_preds in users_predictions.items()
]

In [None]:
# Per-user precision/recall at a cut-off of 10. Pass each user's own
# predictions (`preds`), as the k=5 cell does; passing the global
# `predictions` list rescans every user's predictions for every user and
# lets other users' items leak into the relevant-item count.
metrics_k_10 = [
    recall_prec_user_at_k(user, preds, 10)
    for user, preds in users_predictions.items()
]

In [None]:
# Mean precision@5 across users.
sum(user_metrics['prec'] for user_metrics in metrics_k_5) / len(metrics_k_5)

In [None]:
# Mean recall@5 across users.
sum(user_metrics['recall'] for user_metrics in metrics_k_5) / len(metrics_k_5)