In [1]:
%load_ext autoreload
%autoreload 2

# set the environment path to find Recommenders
import sys
sys.path.append("../../")

import logging
import numpy as np
import pandas as pd
import scrapbook as sb
from sklearn.preprocessing import minmax_scale

from reco_utils.common.python_utils import binarize
from reco_utils.common.timer import Timer
from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.evaluation.python_evaluation import (
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    rmse,
    mae,
    logloss,
    rsquared,
    exp_var,
    get_top_k_items
)
from reco_utils.recommender.fbt.fbt import FBT

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))

System version: 3.6.11 | packaged by conda-forge | (default, Nov 27 2020, 18:51:43) 
[GCC Clang 11.0.0]
Pandas version: 1.1.5


In [2]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

### 1.1 Download and use the MovieLens Dataset

In [3]:
col_user = 'user_id'
col_item = 'item_id'
col_item_name = f'{col_item}_name'
data = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=(col_user, col_item),
    title_col=col_item_name
)
data.head()

100%|██████████| 4.81k/4.81k [00:01<00:00, 3.56kKB/s]


Unnamed: 0,user_id,item_id,item_id_name
0,196,242,Kolya (1996)
1,63,242,Kolya (1996)
2,226,242,Kolya (1996)
3,154,242,Kolya (1996)
4,306,242,Kolya (1996)


### 1.2 Split the data using the python random splitter provided in utilities:

We split the full dataset into a `train` and `test` dataset to evaluate performance of the algorithm against a held-out set not seen during training. Because FBT generates recommendations based on user preferences, all users that are in the test set must also exist in the training set. For this case, we can use the provided `python_stratified_split` function which holds out a percentage (in this case 25%) of items from each user, but ensures all users are in both `train` and `test` datasets. Other options are available in the `dataset.python_splitters` module which provide more control over how the split occurs.

In [4]:
train, test = python_stratified_split(data, 
                                      ratio=0.75, 
                                      col_user=col_user, 
                                      col_item=col_item, 
                                      seed=42)

In [5]:
print("""
Train:
Total Ratings: {train_total}
Unique Users: {train_users}
Unique Items: {train_items}

Test:
Total Ratings: {test_total}
Unique Users: {test_users}
Unique Items: {test_items}
""".format(
    train_total=len(train),
    train_users=len(train[col_user].unique()),
    train_items=len(train[col_item].unique()),
    test_total=len(test),
    test_users=len(test[col_user].unique()),
    test_items=len(test[col_item].unique()),
))


Train:
Total Ratings: 74992
Unique Users: 943
Unique Items: 1601

Test:
Total Ratings: 25008
Unique Users: 943
Unique Items: 1532



# 2 Train the FBT Model

In [6]:
logging.basicConfig(level=logging.DEBUG, 
                    format='%(asctime)s %(levelname)-8s %(message)s')

col_score = 'score'
model = FBT(
    col_user=col_user,
    col_item=col_item,
    col_score=col_score,
    num_recos=10
)

In [7]:
with Timer() as train_time:
    model.fit(train)

print("Took {} seconds for training.".format(train_time.interval))

2021-05-24 12:07:26,533 INFO     Check dataframe is of the type, schema we expect
2021-05-24 12:07:26,561 INFO     De-duplicating the user-item counts
2021-05-24 12:07:29,271 INFO     Done training
Took 2.766911569982767 seconds for training.


In [8]:
with Timer() as test_time:
    topk_remove_seen = model.recommend_k_items(test=test, 
                                               top_k=10, 
                                               remove_seen=True, 
                                               train=train)

print("Took {} seconds for prediction.".format(test_time.interval))

2021-05-24 12:07:41,444 INFO     Calculating recommendation scores
2021-05-24 12:07:45,845 INFO     De-duplicating the user-item counts
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
Took 5.924376523995306 seconds for prediction.


In [9]:
with Timer() as test_time:
    topk_keep_seen = model.recommend_k_items(test=test, top_k=10, remove_seen=False)
print("Took {} seconds for prediction.".format(test_time.interval))

2021-05-24 12:07:57,584 INFO     Calculating recommendation scores
Took 4.818754572013859 seconds for prediction.


In [10]:
topk_remove_seen.head()

Unnamed: 0,user_id,item_id,score,rank
0,1,98,55.149254,1
1,1,56,50.283582,2
2,1,69,47.925373,3
3,1,423,47.720588,4
4,1,204,46.970149,5


In [11]:
topk_keep_seen.head()

Unnamed: 0,user_id,item_id,score,rank
0,1,50,66.735294,1
1,1,181,61.397059,2
2,1,174,59.544118,3
3,1,1,56.985294,4
4,1,98,55.149254,5


In [12]:
with Timer() as predict_time:
    all_recos = model.predict(test)
print("Took {} seconds for prediction.".format(predict_time.interval))
all_recos.head()

2021-05-24 12:08:15,401 INFO     Calculating recommendation scores
Took 3.4837634110008366 seconds for prediction.


Unnamed: 0,user_id,item_id,score
0,1,1,56.985294
1,1,2,23.757576
2,1,3,16.815385
3,1,4,35.537313
4,1,5,17.621212


In [13]:
print(model.item_frequencies)

         item_id  score
0              1    395
1488           2     95
2829           3     73
4128           4    161
5548           5     68
...          ...    ...
1597039     1673      1
1597196     1676      1
1597274     1678      1
1597355     1679      1
1597436     1680      1

[1601 rows x 2 columns]


In [14]:
model._model_df

Unnamed: 0,item_id,item_id_paired,score
1,1,2,61
2,1,3,51
3,1,4,91
4,1,5,41
5,1,6,7
...,...,...,...
1597431,1680,1313,1
1597432,1680,1395,1
1597433,1680,1607,1
1597434,1680,1678,1


In [15]:
topk_remove_seen_with_titles = (
    topk_remove_seen.merge((
        data.loc[:, [col_item, col_item_name]]
            .drop_duplicates()
            .set_index(col_item)
    ), on=col_item, how='inner')
    .sort_values(by=[col_user, col_score], ascending=[True, False])
    .reset_index(drop=True)
)
        
display(topk_remove_seen_with_titles.head(10))

Unnamed: 0,user_id,item_id,score,rank,item_id_name
0,1,98,55.149254,1,"Silence of the Lambs, The (1991)"
1,1,56,50.283582,2,Pulp Fiction (1994)
2,1,69,47.925373,3,Forrest Gump (1994)
3,1,423,47.720588,4,E.T. the Extra-Terrestrial (1982)
4,1,204,46.970149,5,Back to the Future (1985)
5,1,288,46.941176,6,Scream (1996)
6,1,117,44.597015,7,"Rock, The (1996)"
7,1,294,43.166667,8,Liar Liar (1997)
8,1,183,42.939394,9,Alien (1979)
9,1,238,42.358209,10,Raising Arizona (1987)


### 2.3. Evaluate how well FBT performs

We evaluate how well FBT performs for a few common ranking metrics provided in the `python_evaluation` module in reco_utils. We will consider the Mean Average Precision (MAP), Normalized Discounted Cumalative Gain (NDCG), Precision, and Recall for the top-k items per user we computed with FBT. User and item column names are specified in each evaluation method. DInce FBT does not have ratings information, we create a dummy column with all values set to 1.0 so as to conform to the metrics signature.

In [18]:
test['rating'] = 1
eval_map_k = map_at_k(test, topk_remove_seen, col_user=col_user, col_item=col_item, col_prediction=col_score,k=TOP_K)
eval_map_k

0.044028595315000515

In [19]:
eval_ndcg = ndcg_at_k(test, topk_remove_seen, col_user=col_user, col_item=col_item, col_prediction=col_score, k=TOP_K)
eval_ndcg

0.2443432424633656

In [20]:
eval_precision = precision_at_k(test, topk_remove_seen, col_user=col_user, col_item=col_item, col_prediction=col_score, k=TOP_K)
eval_precision

0.2292682926829268

In [21]:
eval_recall = recall_at_k(test, topk_remove_seen, col_user=col_user, col_item=col_item, col_prediction=col_score, k=TOP_K)
eval_recall

0.09436047878760673

In [22]:
eval_rmse = rmse(test, topk_remove_seen, col_user=col_user, col_item=col_item, col_prediction=col_score)
eval_rmse

65.1847870130108

In [23]:
eval_mae = mae(test, topk_remove_seen, col_user=col_user, col_item=col_item, col_prediction=col_score)
eval_mae

62.78606036179992

In [24]:
print("Model:\t",
      "Top K:\t%d" % TOP_K,
      "MAP:\t%f" % eval_map_k,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall,
      "RMSE:\t%f" % eval_rmse,
      "MAE:\t%f" % eval_mae,
      sep='\n')

Model:	
Top K:	10
MAP:	0.044029
NDCG:	0.244343
Precision@K:	0.229268
Recall@K:	0.094360
RMSE:	65.184787
MAE:	62.786060


In [25]:
# Now let's look at the results for a specific user
user_id = 1

ground_truth = test[test[col_user]==user_id]
prediction = topk_remove_seen[topk_remove_seen[col_user]==user_id].sort_values(by=col_score, ascending=False)[:TOP_K]
test_user_movie_watched_prediction = (
    pd.merge(ground_truth, prediction, on=[col_user, col_item], how='left')
      .drop(columns=['rating'])
)
display(test_user_movie_watched_prediction.head())

Unnamed: 0,user_id,item_id,item_id_name,score,rank
0,1,49,I.Q. (1994),,
1,1,69,Forrest Gump (1994),47.925373,3.0
2,1,221,Breaking the Waves (1996),,
3,1,5,Copycat (1995),,
4,1,139,"Love Bug, The (1969)",,


Above, we see that one of the movies from the test set was recovered by the model's top-k recommendations, however the others were not. Offline evaluations are difficult as they can only use what was seen previously in the test set and may not represent the user's actual preferences across the entire set of items. Adjustments to how the data is split, algorithm is used and hyper-parameters can improve the results here. 

In [27]:
# Record results with papermill for tests - ignore this cell
sb.glue("map", eval_map_k)
sb.glue("ndcg", eval_ndcg)
sb.glue("precision", eval_precision)
sb.glue("recall", eval_recall)
sb.glue("train_time", train_time.interval)
sb.glue("test_time", test_time.interval)