In [45]:
import logging
import numpy as np
import pandas as pd
import papermill as pm

from reco_utils.common.timer import Timer
from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from reco_utils.recommender.sar import SAR


In [46]:
# top k items to recommend
TOP_K = 10

In [47]:
data= pd.read_csv(r'C:\Users\Sheetal Sekhar\Fakedata_5000.csv')

In [48]:
data.head()

Unnamed: 0.1,Unnamed: 0,userID,itemID,productRating,timestamp
0,0,3212,3985,0.3,979462903
1,1,2244,3820,7.4,379440791
2,2,1863,4506,7.2,796061032
3,3,3024,571,9.0,246813744
4,4,2793,2048,2.1,814789007


In [49]:
data['productRating'] = data['productRating'].astype(np.float32)

data.head()

Unnamed: 0.1,Unnamed: 0,userID,itemID,productRating,timestamp
0,0,3212,3985,0.3,979462903
1,1,2244,3820,7.4,379440791
2,2,1863,4506,7.2,796061032
3,3,3024,571,9.0,246813744
4,4,2793,2048,2.1,814789007


In [50]:
train, test = python_stratified_split(data, ratio=0.75, col_user='userID', col_item='itemID', seed=42)

In [51]:
print("""
Train:
Total Ratings: {train_total}
Unique Users: {train_users}
Unique Items: {train_items}

Test:
Total Ratings: {test_total}
Unique Users: {test_users}
Unique Items: {test_items}
""".format(
    train_total=len(train),
    train_users=len(train['userID'].unique()),
    train_items=len(train['itemID'].unique()),
    test_total=len(test),
    test_users=len(test['userID'].unique()),
    test_items=len(test['itemID'].unique()),
))


Train:
Total Ratings: 4614
Unique Users: 3171
Unique Items: 3025

Test:
Total Ratings: 386
Unique Users: 384
Unique Items: 376



In [52]:
model = SAR(
    col_user="userID",
    col_item="itemID",
    col_rating="productRating",
    col_timestamp="timestamp",
    similarity_type="jaccard", 
    time_decay_coefficient=30, 
    timedecay_formula=True
)

In [53]:

with Timer() as train_time:
    model.fit(train)

print("Took {} seconds for training.".format(train_time.interval))

Took 0.4147471000001133 seconds for training.


In [54]:
with Timer() as test_time:
    top_k = model.recommend_k_items(test, remove_seen=True)

print("Took {} seconds for prediction.".format(test_time.interval))

Took 0.08016079999970316 seconds for prediction.


In [61]:
eval_precision = precision_at_k(test, top_k, col_user='userID', col_item='itemID', col_rating='productRating', k=TOP_K)

In [63]:
eval_recall = recall_at_k(test, top_k, col_user='userID', col_item='itemID', col_rating='productRating', k=TOP_K)

In [65]:
top_k.head()

Unnamed: 0,userID,itemID,prediction
0,6,991,0.0
1,6,2765,0.0
2,6,3473,0.0
3,6,2583,0.0
4,6,3463,0.0


In [67]:
eval_map = map_at_k(test, top_k, col_user='userID', col_item='itemID', col_rating='productRating', k= TOP_K)

In [68]:
eval_ndcg = ndcg_at_k(test, top_k, col_user='userID', col_item='itemID', col_rating='productRating', k=TOP_K)

In [69]:
print("Model:\t",
      "Top K:\t%d" % TOP_K,
      "MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

Model:	
Top K:	10
MAP:	0.000000
NDCG:	0.000000
Precision@K:	0.000000
Recall@K:	0.000000


In [70]:
# Now let's look at the results for a specific user
user_id = 876

ground_truth = test[test['userID']==user_id].sort_values(by='productRating', ascending=False)[:TOP_K]
prediction = model.recommend_k_items(pd.DataFrame(dict(userID=[user_id])), remove_seen=True) 
pd.merge(ground_truth, prediction, on=['userID', 'itemID'], how='left')

ValueError: SAR cannot score users that are not in the training set