In [1]:
# Mount Google Drive at /content/gdrive so files stored on Drive are reachable from this runtime
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# List the current directory to confirm the Drive mount point (gdrive) is present
! ls

gdrive	sample_data


In [3]:
%cd gdrive/My Drive/Project

/content/gdrive/My Drive/Project


In [4]:
! git clone https://github.com/Microsoft/Recommenders

fatal: destination path 'Recommenders' already exists and is not an empty directory.


In [5]:
! git pull

fatal: not a git repository (or any parent up to mount point /content)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


In [6]:
pip install papermill

Collecting papermill
  Downloading https://files.pythonhosted.org/packages/2f/9b/13bc32699675dbb5fa12bc8f046c3a57a4b4f43eb5fe1f1e52034f23bb7f/papermill-2.2.2-py3-none-any.whl
Collecting ansiwrap
  Downloading https://files.pythonhosted.org/packages/03/50/43e775a63e0d632d9be3b3fa1c9b2cbaf3b7870d203655710a3426f47c26/ansiwrap-0.8.4-py2.py3-none-any.whl
Collecting black
[?25l  Downloading https://files.pythonhosted.org/packages/dc/7b/5a6bbe89de849f28d7c109f5ea87b65afa5124ad615f3419e71beb29dc96/black-20.8b1.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 20.0MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting tenacity
  Downloading https://files.pythonhosted.org/packages/b5/05/ff089032442058bd3386f9cd991cd88ccac81dca1494d78751621ee35e62/tenacity-6.2.0-py2.py3-none-any.whl
Collecting textwrap3>=0.9.2
  Downloading https://files.pythonhosted.org

In [7]:
import os
# Work from the root of the Recommenders checkout.
# NOTE(review): presumably this is what lets the `reco_utils` imports in the next cell resolve — confirm
os.chdir('/content/gdrive/My Drive/Project/Recommenders')

In [8]:
import sys
sys.path.append("../../")
import time
import os
import itertools
import pandas as pd
import numpy as np
import papermill as pm
import torch, fastai
from fastai.collab import EmbeddingDotBias, collab_learner, CollabDataBunch, load_learner

from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.recommender.fastai.fastai_utils import cartesian_product, score
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from reco_utils.evaluation.python_evaluation import rmse, mae, rsquared, exp_var

# Report interpreter, library and GPU details, one line per (template, value) pair
environment_report = (
    ("System version: {}", sys.version),
    ("Pandas version: {}", pd.__version__),
    ("Fast AI version: {}", fastai.__version__),
    ("Torch version: {}", torch.__version__),
    ("Cuda Available: {}", torch.cuda.is_available()),
    ("CuDNN Enabled: {}", torch.backends.cudnn.enabled),
)
for template, value in environment_report:
    print(template.format(value))

System version: 3.6.9 (default, Oct  8 2020, 12:12:24) 
[GCC 8.4.0]
Pandas version: 1.1.4
Fast AI version: 1.0.61
Torch version: 1.7.0+cu101
Cuda Available: True
CuDNN Enabled: True


In [9]:
# Column names shared by loading, splitting, scoring and evaluation
USER = 'UserId'
ITEM = 'MovieId'
RATING = 'Rating'
TIMESTAMP = 'Timestamp'
PREDICTION = 'Prediction'
TITLE = 'Title'

In [10]:
# Number of top-ranked items per user considered by the ranking metrics (passed as k)
TOP_K = 10

# MovieLens data size to load: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# Model parameters: embedding size given to collab_learner and the number of
# epochs given to fit_one_cycle
N_FACTORS = 40
EPOCHS = 5

In [11]:

# Load the MovieLens ratings and cast both ID columns to strings in one step, so the
# raw IDs are less likely to be confused with embedding ids
ratings_df = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=[USER, ITEM, RATING, TIMESTAMP],
).astype({USER: 'str', ITEM: 'str'})

# Preview the first ten ratings
ratings_df.head(10)

100%|██████████| 4.81k/4.81k [00:00<00:00, 21.8kKB/s]


Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596
5,298,474,4.0,884182806
6,115,265,2.0,881171488
7,253,465,5.0,891628467
8,305,451,3.0,886324817
9,6,86,3.0,883603013


In [12]:
# Split the ratings into a train/validation frame and a held-out test frame.
# NOTE(review): ratio=0.75 presumably sends 75% of the ratings to the first frame, and
# filter_by="item" with min_rating=1 presumably drops items with fewer than one rating —
# confirm against python_stratified_split's documentation.
train_valid_df, test_df = python_stratified_split(
    ratings_df, 
    ratio=0.75, 
    min_rating=1, 
    filter_by="item", 
    col_user=USER, 
    col_item=ITEM
)

In [13]:
# Spot-check: look up the rating of movie 221 by user 321 in the training split
train_valid_df.loc[
    train_valid_df['MovieId'].eq('221') & train_valid_df['UserId'].eq('321')
]

Unnamed: 0,UserId,MovieId,Rating,Timestamp
82400,321,221,5.0,879438793


In [14]:
## Training

In [15]:
# Seed NumPy and PyTorch (default generator and all CUDA devices) with one shared
# value so repeated runs are reproducible
SEED = 101
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [16]:
# Time how long it takes to wrap the training frame in a fastai data bunch
start_time = time.time()

# valid_pct=0: no rows are held out for validation (the training log shows no validation loss)
data = CollabDataBunch.from_df(train_valid_df, user_name=USER, item_name=ITEM, rating_name=RATING, valid_pct=0)

# Kept so it can be added to the reported training time after fitting
preprocess_time = time.time() - start_time

In [17]:
# Peek at a few (user, item, target) rows as the data bunch serves them
data.show_batch()

UserId,MovieId,target
48,423,4.0
210,187,5.0
219,303,4.0
1,124,5.0
92,925,3.0


In [18]:
# Build the collaborative-filtering learner with N_FACTORS-dimensional embeddings.
# NOTE(review): y_range=[0,5.5] presumably bounds predictions slightly beyond the
# observed rating scale, and wd is presumably weight decay — confirm against the fastai docs.
learn = collab_learner(data, n_factors=N_FACTORS, y_range=[0,5.5], wd=1e-1)
# Display the model architecture
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(944, 40)
  (i_weight): Embedding(1683, 40)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1683, 1)
)

In [19]:
# Fit with the one-cycle schedule and measure the wall-clock time of the fit alone
start_time = time.time()
learn.fit_one_cycle(EPOCHS, max_lr=5e-3)
fit_elapsed = time.time() - start_time

# The reported training time also includes the earlier data-bunch construction
train_time = fit_elapsed + preprocess_time
print("Took {} seconds for training.".format(train_time))

epoch,train_loss,valid_loss,time
0,0.937871,#na#,00:08
1,0.877238,#na#,00:08
2,0.772738,#na#,00:08
3,0.652344,#na#,00:07
4,0.536503,#na#,00:08


Took 40.370789527893066 seconds for training.


In [20]:
# Save the trained learner to movielens_model.pkl so it can be reloaded for inference
learn.export('movielens_model.pkl')

In [21]:
# Reload the exported learner from the current directory; the scoring cells below use this reloaded copy
learner = load_learner(path=".", file = 'movielens_model.pkl')

In [22]:
# Vocabularies of user and item IDs known to the learner; relies on `classes`
# yielding the user column before the item column.
user_classes, item_classes = learner.data.train_ds.x.classes.values()

# Drop the first entry of each vocabulary — presumably fastai's placeholder for
# unknown/missing values (TODO confirm)
total_users = user_classes[1:]
total_items = item_classes[1:]

In [23]:
# Show the user and item ID vocabularies extracted from the learner
print(total_users,total_items)

['1' '10' '100' '101' ... '96' '97' '98' '99'] ['1' '10' '100' '1000' ... '996' '997' '998' '999']


In [24]:
# Users that appear in the test split and are also known to the trained model
test_users = np.intersect1d(test_df[USER].unique(), total_users)

In [25]:
# Every (test user, known item) pair, as a two-column frame of candidates to score
users_items = pd.DataFrame(
    cartesian_product(np.array(test_users), np.array(total_items)),
    columns=[USER, ITEM],
)

In [26]:
# Left-join the candidate pairs onto the (stringified) training ratings; pairs with
# no matching training row come back with a missing rating
candidates = users_items.merge(train_valid_df.astype(str), on=[USER, ITEM], how='left')

# Keep only the pairs not seen during training, reduced to the ID columns
training_removed = candidates.loc[candidates[RATING].isna(), [USER, ITEM]]

In [27]:
# Time the scoring of every candidate (user, item) pair that was not seen in training
start_time = time.time()

# NOTE(review): despite the name, no top-k argument is passed here, so this frame
# presumably holds a prediction for every candidate pair — confirm against score()
top_k_scores = score(learner, 
                     test_df=training_removed,
                     user_col=USER, 
                     item_col=ITEM, 
                     prediction_col=PREDICTION)

test_time = time.time() - start_time
print("Took {} seconds for {} predictions.".format(test_time, len(training_removed)))

Took 2.0667576789855957 seconds for 1511060 predictions.


In [28]:
# Spot-check one prediction: the score given to movie 318 for user 2
top_k_scores.loc[
    top_k_scores['UserId'].eq('2') & top_k_scores['MovieId'].eq('318')
]

Unnamed: 0,UserId,MovieId,Prediction
187628,2,318,4.797019


In [29]:
# MAP of the predictions against the held-out test ratings, using the top TOP_K items per user
eval_map = map_at_k(
    test_df,
    top_k_scores,
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
    relevancy_method="top_k",
    k=TOP_K,
)

In [30]:
# NDCG of the predictions against the held-out test ratings, using the top TOP_K items per user
eval_ndcg = ndcg_at_k(
    test_df,
    top_k_scores,
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
    relevancy_method="top_k",
    k=TOP_K,
)

In [31]:
# Precision@K of the predictions against the held-out test ratings, with K = TOP_K
eval_precision = precision_at_k(
    test_df,
    top_k_scores,
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
    relevancy_method="top_k",
    k=TOP_K,
)

In [32]:

# Recall@K of the predictions against the held-out test ratings, with K = TOP_K
eval_recall = recall_at_k(
    test_df,
    top_k_scores,
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
    relevancy_method="top_k",
    k=TOP_K,
)

In [33]:

# Summarise the ranking metrics, one line per entry
ranking_report = [
    "Model:\t" + type(learn).__name__,
    "Top K:\t%d" % TOP_K,
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]
print(*ranking_report, sep='\n')

Model:	CollabLearner
Top K:	10
MAP:	0.027680
NDCG:	0.158812
Precision@K:	0.139661
Recall@K:	0.057563


In [34]:
# Predict ratings for the observed test pairs only; these feed the rating-prediction metrics.
# NOTE(review): the frame is passed as a copy, presumably so score() cannot modify test_df — confirm
scores = score(learner, 
               test_df=test_df.copy(), 
               user_col=USER, 
               item_col=ITEM, 
               prediction_col=PREDICTION)

In [35]:

# Column mapping shared by every rating-prediction metric in this cell
rating_metric_cols = dict(col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)

# Compare predicted and actual ratings on the test split
eval_rmse = rmse(test_df, scores, **rating_metric_cols)
eval_mae = mae(test_df, scores, **rating_metric_cols)
eval_exp_var = exp_var(test_df, scores, **rating_metric_cols)
eval_r2 = rsquared(test_df, scores, **rating_metric_cols)

# Summarise the rating-prediction metrics, one line per entry
regression_report = [
    "Model:\t" + type(learn).__name__,
    "RMSE:\t%f" % eval_rmse,
    "MAE:\t%f" % eval_mae,
    "Explained variance:\t%f" % eval_exp_var,
    "R squared:\t%f" % eval_r2,
]
print(*regression_report, sep='\n')

Model:	CollabLearner
RMSE:	0.902230
MAE:	0.712558
Explained variance:	0.346533
R squared:	0.345887


In [36]:
pip install nteract-scrapbook

Collecting nteract-scrapbook
[?25l  Downloading https://files.pythonhosted.org/packages/2d/06/c026c536ee7f671540836ba44e686edfbb1d50981db774fd16d336515664/nteract_scrapbook-0.4.1-py3-none-any.whl (265kB)
[K     |█▎                              | 10kB 21.8MB/s eta 0:00:01[K     |██▌                             | 20kB 26.4MB/s eta 0:00:01[K     |███▊                            | 30kB 19.7MB/s eta 0:00:01[K     |█████                           | 40kB 19.5MB/s eta 0:00:01[K     |██████▏                         | 51kB 20.1MB/s eta 0:00:01[K     |███████▍                        | 61kB 22.1MB/s eta 0:00:01[K     |████████▋                       | 71kB 23.6MB/s eta 0:00:01[K     |█████████▉                      | 81kB 25.0MB/s eta 0:00:01[K     |███████████                     | 92kB 23.7MB/s eta 0:00:01[K     |████████████▍                   | 102kB 25.0MB/s eta 0:00:01[K     |█████████████▋                  | 112kB 25.0MB/s eta 0:00:01[K     |██████████████▉         

In [37]:
import scrapbook as sc
# Record the evaluation metrics and timings with scrapbook for tests
recorded_results = (
    ("map", eval_map),
    ("ndcg", eval_ndcg),
    ("precision", eval_precision),
    ("recall", eval_recall),
    ("rmse", eval_rmse),
    ("mae", eval_mae),
    ("exp_var", eval_exp_var),
    ("rsquared", eval_r2),
    ("train_time", train_time),
    ("test_time", test_time),
)
for result_name, result_value in recorded_results:
    sc.glue(result_name, result_value)