# Running SAR on MovieLens (pySpark)

SAR is a fast, scalable, adaptive algorithm for personalized recommendations based on user transaction history and item descriptions. It produces easily explainable / interpretable recommendations and handles "cold item" and "semi-cold user" scenarios.

This notebook provides an example of how to utilize and evaluate SAR's pySpark implementation, meant for large-scale distributed datasets. We use a smaller dataset in this example to run SAR efficiently on a Data Science Virtual Machine.

In [1]:
# set the environment path to find Recommenders
import sys
sys.path.append("../../")
import os
from zipfile import ZipFile

from reco_utils.recommender.sar.sar_pyspark import SARpySparkReference
from reco_utils.dataset.url_utils import maybe_download
from reco_utils.dataset.spark_splitters import spark_random_split
from reco_utils.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation
from reco_utils.common.notebook_utils import is_jupyter, is_databricks

import numpy as np

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType

# Report the Python interpreter and Spark versions this notebook is running against.
version_lines = (
    "System version: {}".format(sys.version),
    "Spark version: {}".format(pyspark.__version__),
)
print(*version_lines, sep="\n")


System version: 3.6.0 | packaged by conda-forge | (default, Feb 10 2017, 07:08:35) 
[GCC 4.2.1 Compatible Apple LLVM 7.3.0 (clang-703.0.31)]
Spark version: 2.3.1


Set the default parameters.

In [2]:
# Number of top items to recommend; passed as top_k to the recommender and as k to the ranking evaluator.
TOP_K = 10
# Whether items already seen in training may be recommended.
# False means: do not recommend items which appear in the training set.
RECOMMEND_SEEN = False
# MovieLens dataset size; the download cell accepts "100k", "1m", "10m" or "20m".
MOVIELENS_DATA_SIZE = "100k"

### 0. Set up Spark context

The following settings work well for debugging locally on a VM - change them when running on a cluster. We set up a giant single executor with many threads and specify a memory cap.

In [3]:
# The following settings work well for debugging locally on a VM - change them when running on a cluster.
# They set up one giant executor with many threads and specify a memory cap.
spark = (
    SparkSession.builder
    .appName("SAR pySpark")
    .master("local[*]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.memoryOverhead", "3g")
    .config("spark.memory.fraction", "0.9")
    # Fixed key: the Spark property is "spark.memory.storageFraction". The previous
    # "spark.memory.stageFraction" is not a documented Spark setting, so the 0.3 value
    # was presumably never applied.
    .config("spark.memory.storageFraction", "0.3")
    .config("spark.executor.instances", 1)
    # Very long heartbeat / network timeouts, kept from the original local-debugging setup.
    .config("spark.executor.heartbeatInterval", "36000s")
    .config("spark.network.timeout", "10000000s")
    .config("spark.driver.maxResultSize", "50g")
    .config("spark.sql.shuffle.partitions", "10")
    # Reuses an already-running session if one exists, otherwise creates a new one.
    .getOrCreate()
)


### 1. Download the MovieLens dataset

In [4]:
# Each MovieLens release ships its ratings in a different layout.
# Maps dataset size -> (field separator, ratings file path inside the archive, file has a header row).
movielens_formats = {
    "100k": ("\t", "ml-100k/u.data", False),
    "1m": ("::", "ml-1m/ratings.dat", False),
    "10m": ("::", "ml-10M100K/ratings.dat", False),
    "20m": (",", "ml-20m/ratings.csv", True),
}

if MOVIELENS_DATA_SIZE not in movielens_formats:
    raise ValueError("Invalid data size. Should be one of {100k, 1m, 10m, or 20m}")

separator, data_path, data_header = movielens_formats[MOVIELENS_DATA_SIZE]

# Download dataset zip file and decompress if haven't done yet
dest_folder = "."
dest_file = data_path

if is_databricks():
    # Handle local file I/O APIs on Databricks
    dest_folder = "/dbfs/tmp"
    dest_file = os.path.join(dest_folder, dest_file)
    data_path = "dbfs:/tmp/" + data_path
    
if not os.path.exists(dest_file):
    filename = "ml-" + MOVIELENS_DATA_SIZE + ".zip"
    filepath = maybe_download(
        "http://files.grouplens.org/datasets/movielens/" + filename, filename
    )

    with ZipFile(filepath, "r") as zf:
        zf.extractall(dest_folder)

    # remove zip file we already used
    os.remove(filepath)

# Force the file to be flushed to persistent storage
if is_databricks():
    # In Databricks, passing python variable to shell command like "!sync {dest_file}" does not work.
    if MOVIELENS_DATA_SIZE == "100k":
        !sync "/dbfs/tmp/ml-100k/u.data"
    elif MOVIELENS_DATA_SIZE == "1m":
        !sync "/dbfs/tmp/ml-1m/ratings.dat"
    elif MOVIELENS_DATA_SIZE == "10m":
        !sync "/dbfs/tmp/ml-10M100K/ratings.dat"
    elif MOVIELENS_DATA_SIZE == "20m":
        !sync "/dbfs/tmp/ml-20m/ratings.csv"
else:
    !sync {dest_file}

In [5]:
# Explicit schema for the ratings file: integer user and item ids, float rating, long timestamp.
# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.
rating_fields = (
    StructField("UserId", IntegerType()),
    StructField("MovieId", IntegerType()),
    StructField("Rating", FloatType()),
    StructField("Timestamp", LongType()),
)
schema = StructType(rating_fields)

# pySpark's read csv currently doesn't support multi-character delimiters,
# so files using one are read as plain text and split manually.
if len(separator) > 1:
    raw_data = spark.sparkContext.textFile(data_path)
    # NOTE(review): the original comment here was cut off mid-sentence ("In databricks (or maybe in
    # multi-cluster machines), somehow file"). take(1) presumably forces an early read of the
    # file - confirm before removing it.
    raw_data.take(1)

    def parse_rating_line(line):
        """Split one raw text line on `separator` and cast its first four fields to int, int, float, int."""
        fields = line.split(separator)
        return [int(fields[0]), int(fields[1]), float(fields[2]), int(fields[3])]

    data_rdd = raw_data.map(parse_rating_line)
    data = spark.createDataFrame(data_rdd, schema)
else:
    data = spark.read.csv(data_path, schema=schema, sep=separator, header=data_header)

data.show()

+------+-------+------+---------+
|UserId|MovieId|Rating|Timestamp|
+------+-------+------+---------+
|   196|    242|   3.0|881250949|
|   186|    302|   3.0|891717742|
|    22|    377|   1.0|878887116|
|   244|     51|   2.0|880606923|
|   166|    346|   1.0|886397596|
|   298|    474|   4.0|884182806|
|   115|    265|   2.0|881171488|
|   253|    465|   5.0|891628467|
|   305|    451|   3.0|886324817|
|     6|     86|   3.0|883603013|
|    62|    257|   2.0|879372434|
|   286|   1014|   5.0|879781125|
|   200|    222|   5.0|876042340|
|   210|     40|   3.0|891035994|
|   224|     29|   3.0|888104457|
|   303|    785|   3.0|879485318|
|   122|    387|   5.0|879270459|
|   194|    274|   2.0|879539794|
|   291|   1042|   4.0|874834944|
|   234|   1184|   2.0|892079237|
+------+-------+------+---------+
only showing top 20 rows



### 2. Split the data using the Spark random splitter provided in utilities

In [6]:
# Random train/test split with a fixed seed so the row counts below are reproducible.
train, test = spark_random_split(data, ratio=0.75, seed=123)
for split_label, split_df in (("N train", train), ("N test", test)):
    print(split_label, split_df.count())

N train 75193
N test 24807


In [7]:
# Column-name mapping passed to the SAR model and reused by the cells below.
header = {
    "col_user": "UserId",
    "col_item": "MovieId",
    "col_rating": "Rating",
    "col_timestamp": "Timestamp",
}

# remove_seen is derived from the RECOMMEND_SEEN notebook parameter so that the setting declared
# in the parameters cell actually takes effect (it was previously hard-coded to True here).
# With the default RECOMMEND_SEEN = False this is still remove_seen=True.
# NOTE(review): the unit of time_decay_coefficient is defined by SARpySparkReference - confirm.
model = SARpySparkReference(
    spark=spark,
    remove_seen=not RECOMMEND_SEEN,
    similarity_type="jaccard",
    time_decay_coefficient=30,
    time_now=None,
    timedecay_formula=True,
    **header
)

### 3. Cold User filter
In order to use SAR, we need to hash users and items and make sure there are no cold users.

In [8]:
# Keep only the test rows whose user also occurs in the training split.
user_col = header["col_user"]
train_set_users = {row[0] for row in train.select(user_col).distinct().collect()}
test_set_users = {row[0] for row in test.select(user_col).distinct().collect()}
both_sets = train_set_users & test_set_users
test = test.filter(F.col(user_col).isin(both_sets))

# Report the split sizes after filtering.
print("N train", train.count())
print("N test", test.count())

N train 75193
N test 24807


#### Build uniform index

In [9]:
# All items that will be scored later must be indexed, so train and test are indexed together.
# The 'type' column tags the origin of each row: 1 = train, 0 = test.
train = train.withColumn('type', F.lit(1))
test = test.withColumn('type', F.lit(0))
df_all = train.union(test)
df_all.createOrReplaceTempView("df_all")

# dense_rank over the user column yields row_id, and over the item column yields col_id
query_template = (
    "select {user}, dense_rank() over(partition by 1 order by {user}) as row_id, "
    "{item}, dense_rank() over(partition by 1 order by {item}) as col_id, "
    "{rating}, {timestamp}, type from df_all"
)
query = query_template.format(
    user=header["col_user"],
    item=header["col_item"],
    rating=header["col_rating"],
    timestamp=header["col_timestamp"],
)
print("Running query -- " + query)

# Replace the temp view with the indexed version so later queries see row_id / col_id.
df_all = spark.sql(query)
df_all.createOrReplaceTempView("df_all")

Running query -- select UserId, dense_rank() over(partition by 1 order by UserId) as row_id, MovieId, dense_rank() over(partition by 1 order by MovieId) as col_id, Rating, Timestamp, type from df_all


#### Recover the original data but now with index built-in
Obtain the indexed dataframes.

In [10]:
# Both splits are read back from the indexed df_all view with the same columns;
# only the 'type' filter differs (1 = train, 0 = test).
indexed_columns = "row_id, col_id, " + header["col_rating"] + ", " + header["col_timestamp"]

query = "select " + indexed_columns + " from df_all where type=1"
print("Running query -- " + query)
train_indexed = spark.sql(query)

query = "select " + indexed_columns + " from df_all where type=0"
print("Running query -- " + query)
test_indexed = spark.sql(query)


Running query -- select row_id, col_id, Rating, Timestamp from df_all where type=1
Running query -- select row_id, col_id, Rating, Timestamp from df_all where type=0


Build index mappings: IDs to index and index to IDs.
First we obtain all users and items which are used later in SAR.

In [11]:
# Collect the distinct original user IDs and item IDs from df_all into numpy arrays.
user_col, item_col = header["col_user"], header["col_item"]

distinct_user_rows = df_all.select(user_col).distinct().toLocalIterator()
unique_users = np.array([row[user_col] for row in distinct_user_rows])

distinct_item_rows = df_all.select(item_col).distinct().toLocalIterator()
unique_items = np.array([row[item_col] for row in distinct_item_rows])


Indexing users and items: index all rows and columns, then split again into train and test. We perform the reduction on Spark across keys before calling .collect so this is scalable. The assumption is that we can store at least the full list of unique users and unique items on a single machine (vertical scaling).

We also reverse the dictionaries in order to go the other way. Index to item is used to return the top_k DataFrame later by undoing the index. For performance reasons we could use a vector array to store the index to ID mapping, but we're using a dictionary for convenience (both are O(1) access anyway).


In [12]:
# index -> original ID: reduce on Spark to one (index, ID) pair per key before collecting.
user_pairs = df_all.select(["row_id", header["col_user"]]).rdd.reduceByKey(lambda _, v: v).collect()
index2user = dict(user_pairs)

item_pairs = df_all.select(["col_id", header["col_item"]]).rdd.reduceByKey(lambda _, v: v).collect()
index2item = dict(item_pairs)

# original ID -> index: the same mappings with keys and values swapped.
user_map_dict = dict(zip(index2user.values(), index2user.keys()))
item_map_dict = dict(zip(index2item.values(), index2item.keys()))

Store the index values in the model object.

In [13]:
# Hand the model the unique user/item IDs plus both directions of the ID <-> index mappings.
model.set_index(unique_users, unique_items, user_map_dict, item_map_dict, index2user, index2item)

### 4. Train the SAR model on our training data, and get the top-k recommendations for our testing data

In [14]:
# Fit SAR on the indexed training split, then request TOP_K recommendations,
# passing the indexed test split to the recommender.
model.fit(train_indexed)
top_k = model.recommend_k_items(test_indexed, top_k = TOP_K)

In [15]:
# Preview the first rows of the recommendation DataFrame.
top_k.show()

+------+-------+----------+
|UserId|MovieId|prediction|
+------+-------+----------+
|   551|    195| 155.13187|
|   551|    161| 148.76941|
|   551|    174|  145.6567|
|   551|    202| 144.97148|
|   551|    173| 142.98204|
|   796|     82| 141.05814|
|   796|    174| 140.94711|
|   796|    202| 140.84773|
|   551|    168| 140.36824|
|   551|    144| 140.09683|
|   551|    216| 139.22217|
|   551|    367| 138.58034|
|   551|     98| 138.57614|
|   796|     96| 137.61952|
|   796|    393| 136.54305|
|   796|    176| 136.16814|
|   796|     28| 134.18773|
|   796|    186| 133.72394|
|   796|     58| 132.76549|
|   796|    402| 132.04024|
+------+-------+----------+
only showing top 20 rows



In [16]:
# When the model runs in debug mode, dump its recorded timing entries, one per line.
if model.debug:
    for timing_entry in model.timer_log:
        print(timing_entry)

affinity calculation time (s)	6
item_cooccurrence calculation time (s)	11
item_similarity calculation time (s)	21
scores calculation time (s)	20
masked_scores calculation time (s)	19
top_scores calculation time (s)	27
top_scores calculation time (s)	32


### 5. Evaluate how well SAR performs 

In [17]:
# Preview the test split used for evaluation (it still carries the 'type' tag column added earlier).
test.show()

+------+-------+------+---------+----+
|UserId|MovieId|Rating|Timestamp|type|
+------+-------+------+---------+----+
|     1|      2|   3.0|876893171|   0|
|     1|      3|   4.0|878542960|   0|
|     1|      4|   3.0|876893119|   0|
|     1|      9|   5.0|878543541|   0|
|     1|     11|   2.0|875072262|   0|
|     1|     17|   3.0|875073198|   0|
|     1|     25|   4.0|875071805|   0|
|     1|     28|   4.0|875072173|   0|
|     1|     30|   3.0|878542515|   0|
|     1|     33|   4.0|878542699|   0|
|     1|     43|   4.0|878542869|   0|
|     1|     48|   5.0|875072520|   0|
|     1|     49|   3.0|878542478|   0|
|     1|     52|   4.0|875072205|   0|
|     1|     59|   5.0|876892817|   0|
|     1|     62|   3.0|878542282|   0|
|     1|     65|   4.0|875072125|   0|
|     1|     66|   4.0|878543030|   0|
|     1|     71|   3.0|876892425|   0|
|     1|     78|   1.0|878543176|   0|
+------+-------+------+---------+----+
only showing top 20 rows



In [18]:
# Ranking evaluator comparing the held-out test ratings against the top-k recommendations.
# Column names are taken from `header` (the same mapping given to the model) instead of being
# repeated as literals, so the evaluator cannot drift out of sync with the model's columns.
rank_eval = SparkRankingEvaluation(
    test,
    top_k,
    k=TOP_K,
    col_user=header["col_user"],
    col_item=header["col_item"],
    col_rating=header["col_rating"],
    col_prediction="prediction",
    relevancy_method="top_k",
)

In [19]:
# Build the report line by line, then print it in one go (one metric per line).
metric_lines = [
    "Model:\t" + model.model_str,
    "Top K:\t%d" % rank_eval.k,
    "MAP:\t%f" % rank_eval.map_at_k(),
    "NDCG:\t%f" % rank_eval.ndcg_at_k(),
    "Precision@K:\t%f" % rank_eval.precision_at_k(),
    "Recall@K:\t%f" % rank_eval.recall_at_k(),
]
print("\n".join(metric_lines))

Model:	sar_pyspark
Top K:	10
MAP:	0.110195
NDCG:	0.378840
Precision@K:	0.327176
Recall@K:	0.178576
