# Running SAR on MovieLens (Single Node)

SAR (Smart Adaptive Recommendations) is a fast, scalable, adaptive algorithm for personalized recommendations based on user transaction history and item descriptions. It produces easily explainable / interpretable recommendations and handles "cold item" and "semi-cold user" scenarios.

In [1]:
# set the environment path to find Recommenders
import sys
sys.path.append("..")

from utilities.recommender.sar.sar_singlenode import SARSingleNodeReference
from utilities.dataset.url_utils import maybe_download
from utilities.dataset.splitters_python import pandas_random_split
from utilities.evaluation.python_evaluation import PythonRatingEvaluation, PythonRankingEvaluation

import itertools
import pandas as pd

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


### 1. Download the MovieLens dataset

In [2]:
# Fetch the MovieLens 100k ratings file and load it from the path handed back
# by the download helper, so the file name is not repeated as a second literal.
# NOTE(review): assumes `maybe_download` returns the local path of the file
# (as the `filepath` name indicates) — confirm against the helper.
filepath = maybe_download("http://files.grouplens.org/datasets/movielens/ml-100k/u.data", "ml-100k.data")
# The file is read as tab-separated with no header row; column names are supplied here.
data = pd.read_csv(filepath, sep="\t", names=["UserId", "MovieId", "Rating", "Timestamp"])

In [3]:
# Peek at the first rows to check that the four named columns were parsed.
display(data.head())

Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### Next, we split the data using the random splitter function provided in `utilities`

In [4]:
# Split the ratings into train and test partitions.
# NOTE(review): no ratio or seed is passed, so the splitter's defaults apply —
# confirm the default seed is fixed if the metric below must be reproducible.
train, test = pandas_random_split(data)

In [5]:
# Column-name mapping shared with the model: each keyword names a role,
# each value names the matching column of the ratings DataFrame.
header = dict(
    col_user="UserId",
    col_item="MovieId",
    col_rating="Rating",
    col_timestamp="Timestamp",
)

# Build the single-node reference SAR model with Jaccard item similarity and
# the time-decay formula enabled; the column names in `header` are forwarded
# as keyword arguments.
# NOTE(review): the unit of `time_decay_coefficient` is not visible here — confirm.
model = SARSingleNodeReference(
    remove_seen=True,
    similarity_type="jaccard",
    time_decay_coefficient=30,
    time_now=None,
    timedecay_formula=True,
    **header
)

In [6]:
# Distinct user and item IDs taken from the full `data` frame (not from a
# single split), looked up through the column names in `header`.
unique_users, unique_items = (
    data[header[key]].unique() for key in ("col_user", "col_item")
)

In [7]:
# Map every raw user / item ID to its position in the unique-ID arrays.
# The positions are discrete but contiguous integers, which keeps the
# matrices held in memory as small as possible.
# Each enumeration is tee'd into two iterators: the first copy builds the
# ID -> position maps here, the second copy is left unconsumed.
enumerate_users_1, enumerate_users_2 = itertools.tee(enumerate(unique_users))
enumerate_items_1, enumerate_items_2 = itertools.tee(enumerate(unique_items))
user_map_dict = {user_id: position for position, user_id in enumerate_users_1}
item_map_dict = {item_id: position for position, item_id in enumerate_items_1}

In [8]:
# the reverse of the dictionary above - array index to actual ID
# Built straight from the unique-ID arrays instead of from one-shot iterators,
# so re-running this cell yields the same mappings rather than empty dicts.
index2user = dict(enumerate(unique_users))
index2item = dict(enumerate(unique_items))

In [9]:
# SAR's matrix operations work on indexed train and test sets, so hand the
# model the unique IDs together with both directions of the ID <-> index maps.
model.set_index(
    unique_users,
    unique_items,
    user_map_dict,
    item_map_dict,
    index2user,
    index2item,
)

In [10]:
# Train on the training split, then ask the fitted model for recommendations
# for the test split. Order matters: `fit` must run before `recommend_k_items`.
model.fit(train)
# NOTE(review): no `k` is passed, so the method's default applies — confirm its value.
top_k = model.recommend_k_items(test)

INFO:utilities.recommender.sar.sar_singlenode:Collecting user affinity matrix...
INFO:utilities.recommender.sar.sar_singlenode:Calculating time-decayed affinities...
INFO:utilities.recommender.sar.sar_singlenode:Creating index columns...
INFO:utilities.recommender.sar.sar_singlenode:Building user affinity sparse matrix...
INFO:utilities.recommender.sar.sar_singlenode:Calculating item cooccurrence...
INFO:utilities.recommender.sar.sar_singlenode:Calculating item similarity...
INFO:utilities.recommender.sar.sar_singlenode:Calculating jaccard...
  return np.true_divide(self.todense(), other)
  return np.true_divide(self.todense(), other)
INFO:utilities.recommender.sar.sar_singlenode:Calculating recommendation scores...
INFO:utilities.recommender.sar.sar_singlenode:done training
INFO:utilities.recommender.sar.sar_singlenode:Converting to dense matrix...
INFO:utilities.recommender.sar.sar_singlenode:Removing seen items...
INFO:utilities.recommender.sar.sar_singlenode:Getting top K...
INFO:u

In [11]:
# TODO: remove this call when the model returns same type as input
# Cast the user-ID column of the recommendations to a numeric dtype. The column
# name is taken from `header` so it stays in step with the names given to the model.
top_k[header["col_user"]] = pd.to_numeric(top_k[header["col_user"]])

In [12]:
# Show the first recommendation rows: user, recommended item and its prediction score.
display(top_k.head())

Unnamed: 0,UserId,MovieId,prediction
1016,796,82,185.718675
1018,796,568,179.945713
1015,796,174,175.274134
1017,796,69,175.131597
3177,551,79,171.731459


### Next, we evaluate how well SAR performs

In [13]:
# Inspect the held-out split that the recommendations are evaluated against.
# NOTE(review): the output shows a `hashedUsers` column that is not in `data`;
# presumably an earlier model call added it to `test` in place — confirm.
display(test.head())

Unnamed: 0,UserId,MovieId,Rating,Timestamp,hashedUsers
80146,630,282,3,885666804,632
28253,256,174,4,882164406,117
96160,943,50,4,875501835,937
28788,386,117,5,877655028,383
65108,582,748,3,882960601,581


In [14]:
# Set up ranking evaluation of the recommendations in `top_k` against the
# held-out `test` split. User / item / rating column names are read from
# `header` so they cannot drift from the names the model was configured with;
# "prediction" is the score column shown in the `top_k` output.
rank_eval = PythonRankingEvaluation(
    test,
    top_k,
    col_user=header["col_user"],
    col_item=header["col_item"],
    col_rating=header["col_rating"],
    col_prediction="prediction",
    relevancy_method="top_k",
)

In [15]:
# Compute the precision-at-k metric; as the cell's last expression, its value
# is shown as the cell output.
rank_eval.precision_at_k()

0.31943069306930694