# Running SAR on MovieLens (sarplus)

This notebook provides an example of how to utilize and evaluate SAR using sarplus, which is based on PySpark (SQL + C++).

The biggest advantage of sarplus is the scalability of the prediction step: its only limitation is that the similarity matrix needs to fit in memory on each worker.

In [1]:
import itertools
import os
import sys
from zipfile import ZipFile

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType
from pysarplus import SARPlus

# set the environment path to find Recommenders
sys.path.append("../../")

from reco_utils.common.notebook_utils import is_jupyter, is_databricks
from reco_utils.dataset.url_utils import maybe_download
from reco_utils.dataset.spark_splitters import spark_random_split
from reco_utils.evaluation.spark_evaluation import SparkRankingEvaluation

# Record the interpreter and pandas versions this run used
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")

# MovieLens variant to load; the loading cell accepts "100k", "1m", "10m" or "20m"
MOVIELENS_DATA_SIZE = "100k"
# number of items to recommend per user
TOP_K = 10
# Forwarded as `remove_seen` to recommend_k_items, so despite the name,
# True means items that appear in the training set are NOT recommended.
RECOMMEND_SEEN = True


System version: 3.6.0 | packaged by conda-forge | (default, Feb  9 2017, 14:36:55) 
[GCC 4.8.2 20140120 (Red Hat 4.8.2-15)]
Pandas version: 0.23.4


### 0. Start Spark and load sar+

In [2]:
# Arguments handed to the PySpark launcher; `--packages` names the sarplus artifact to load.
# The environment variable has to be in place before the session is created below.
SUBMIT_ARGS = "--packages eisber:sarplus:0.2.2 pyspark-shell"
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS

# Session settings, applied to the builder in the order listed
spark_settings = (
    ("spark.executor.cores", 8),
    ("spark.executor.memory", "48g"),
    ("spark.executor.instances", 1),
    ("spark.sql.shuffle.partitions", "10"),
    ("spark.sql.crossJoin.enabled", True),
    ("spark.ui.enabled", False),
)

session_builder = SparkSession.builder.appName("SAR pySpark").master("local[*]")
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

# Alternative, smaller configuration kept for reference only. It is a bare string,
# not executed code; as the last expression of the cell it is echoed as the cell output.
"""
spark = (
    SparkSession.builder.appName("sample")
    .master("local[*]")
    .config("memory", "16G")
    .config("spark.sql.shuffle.partitions", "10")
    .config("spark.sql.crossJoin.enabled", True)
    .config("spark.ui.enabled", False)
    .getOrCreate()
)
"""

'\nspark = (\n    SparkSession.builder.appName("sample")\n    .master("local[*]")\n    .config("memory", "16G")\n    .config("spark.sql.shuffle.partitions", "10")\n    .config("spark.sql.crossJoin.enabled", True)\n    .config("spark.ui.enabled", False)\n    .getOrCreate()\n)\n'

### 1. Download the MovieLens dataset

In [3]:
# MovieLens data have different data-format for each size of dataset
data_header = False
if MOVIELENS_DATA_SIZE == "100k":
    separator = "\t"
    data_path = "ml-100k/u.data"
elif MOVIELENS_DATA_SIZE == "1m":
    separator = "::"
    data_path = "ml-1m/ratings.dat"
elif MOVIELENS_DATA_SIZE == "10m":
    separator = "::"
    data_path = "ml-10M100K/ratings.dat"
elif MOVIELENS_DATA_SIZE == "20m":
    separator = ","
    data_path = "ml-20m/ratings.csv"
    data_header = True
else:
    raise ValueError("Invalid data size. Should be one of {100k, 1m, 10m, or 20m}")

# Download dataset zip file and decompress if haven't done yet
dest_folder = "."
dest_file = data_path

if is_databricks():
    # Handle local file I/O APIs on Databricks
    dest_folder = "/dbfs/tmp"
    dest_file = os.path.join(dest_folder, dest_file)
    data_path = "dbfs:/tmp/" + data_path
    
if not os.path.exists(dest_file):
    filename = "ml-" + MOVIELENS_DATA_SIZE + ".zip"
    filepath = maybe_download(
        "http://files.grouplens.org/datasets/movielens/" + filename, filename
    )

    with ZipFile(filepath, "r") as zf:
        zf.extractall(dest_folder)

    # remove zip file we already used
    os.remove(filepath)

# Force the file to be flushed to persistent storage
if is_databricks():
    # In Databricks, passing python variable to shell command like "!sync {dest_file}" does not work.
    if MOVIELENS_DATA_SIZE == "100k":
        !sync "/dbfs/tmp/ml-100k/u.data"
    elif MOVIELENS_DATA_SIZE == "1m":
        !sync "/dbfs/tmp/ml-1m/ratings.dat"
    elif MOVIELENS_DATA_SIZE == "10m":
        !sync "/dbfs/tmp/ml-10M100K/ratings.dat"
    elif MOVIELENS_DATA_SIZE == "20m":
        !sync "/dbfs/tmp/ml-20m/ratings.csv"
else:
    !sync {dest_file}
    
# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.
schema = StructType(
    (
        StructField("UserId", IntegerType()),
        StructField("MovieId", IntegerType()),
        StructField("Rating", FloatType()),
        StructField("Timestamp", LongType()),
    )
)

# pySpark's read csv currently doesn't support multi-character delimiter, thus we manually handle that
if len(separator) > 1:
    raw_data = spark.sparkContext.textFile(data_path)
    # In databricks (or maybe in multi-cluster machines), somehow file 
    raw_data.take(1)
    data_rdd = raw_data.map(lambda l: l.split(separator)) \
        .map(lambda c: [int(c[0]), int(c[1]), float(c[2]), int(c[3])])
    data = spark.createDataFrame(data_rdd, schema)
else:
    data = spark.read.csv(data_path, schema=schema, sep=separator, header=data_header)

data.show()

+------+-------+------+---------+
|UserId|MovieId|Rating|Timestamp|
+------+-------+------+---------+
|   196|    242|   3.0|881250949|
|   186|    302|   3.0|891717742|
|    22|    377|   1.0|878887116|
|   244|     51|   2.0|880606923|
|   166|    346|   1.0|886397596|
|   298|    474|   4.0|884182806|
|   115|    265|   2.0|881171488|
|   253|    465|   5.0|891628467|
|   305|    451|   3.0|886324817|
|     6|     86|   3.0|883603013|
|    62|    257|   2.0|879372434|
|   286|   1014|   5.0|879781125|
|   200|    222|   5.0|876042340|
|   210|     40|   3.0|891035994|
|   224|     29|   3.0|888104457|
|   303|    785|   3.0|879485318|
|   122|    387|   5.0|879270459|
|   194|    274|   2.0|879539794|
|   291|   1042|   4.0|874834944|
|   234|   1184|   2.0|892079237|
+------+-------+------+---------+
only showing top 20 rows



### 2. Split the data using the Spark random splitter provided in utilities:

In [4]:
# Random train/test split; no ratio or seed is passed, so the splitter's own defaults apply
df_train, df_test = spark_random_split(data)

# Column-name mapping, unpacked as keyword arguments when the SARPlus model is built
header = dict(
    col_user="UserId",
    col_item="MovieId",
    col_rating="Rating",
    col_timestamp="Timestamp",
)

### 3. Train the SAR model on our training data, and get the top-k recommendations for our testing data

In [5]:
# Build the model with the column names declared in `header`
model = SARPlus(spark, **header)

# Fit on the training split using jaccard similarity and the time-decay options below
fit_options = {
    "similarity_type": "jaccard",
    "time_decay_coefficient": 30,
    "time_now": None,
    "timedecay_formula": True,
}
model.fit(df_train, **fit_options)

# Top-k recommendations for the users in the test split. RECOMMEND_SEEN is passed
# as `remove_seen`; "sarplus_cache" is presumably the cache location sarplus uses
# for the similarity matrix - confirm against the sarplus documentation.
top_k = model.recommend_k_items(
    df_test, "sarplus_cache", top_k=TOP_K, remove_seen=RECOMMEND_SEEN
)

top_k.show()

INFO:sarplus:sarplus.fit 1/2: compute item cooccurences...
INFO:sarplus:sarplus.fit 2/2: compute similiarity metric jaccard...
INFO:sarplus:sarplus.recommend_k_items 1/3: create item index
INFO:sarplus:sarplus.recommend_k_items 2/3: prepare similarity matrix
INFO:sarplus:sarplus.recommend_k_items 3/3: compute recommendations


+------+-------+----------+
|UserId|MovieId|     score|
+------+-------+----------+
|   148|     28| 0.5874176|
|   148|    238|0.58822286|
|   148|    210|  0.593227|
|   148|     79| 0.5953816|
|   148|    191|0.60170484|
|   148|    168|0.61572987|
|   148|    172| 0.6162293|
|   148|    423|0.62143284|
|   148|    195|0.63030726|
|   148|    174|0.63451064|
|   496|     95| 0.5524776|
|   496|     69|0.55337024|
|   496|    403| 0.5535351|
|   496|    238| 0.5562565|
|   496|    173| 0.5616334|
|   496|    176|0.56371635|
|   496|     79|0.56943566|
|   496|    172| 0.5702657|
|   496|    210|0.57208157|
|   496|    423|0.58741516|
+------+-------+----------+
only showing top 20 rows



### 4. Evaluate how well SAR performs 

In [6]:
# Compare the recommendations (`top_k`) against the held-out ratings (`df_test`)
rank_eval = SparkRankingEvaluation(
    df_test,
    top_k,
    k=TOP_K,
    col_user="UserId",
    col_item="MovieId",
    col_rating="Rating",
    col_prediction="score",
    relevancy_method="top_k",
)

# One line per metric, computed in the order they are reported
metric_lines = [
    "Top K:\t%d" % rank_eval.k,
    "MAP:\t%f" % rank_eval.map_at_k(),
    "NDCG:\t%f" % rank_eval.ndcg_at_k(),
    "Precision@K:\t%f" % rank_eval.precision_at_k(),
    "Recall@K:\t%f" % rank_eval.recall_at_k(),
]
print("\n".join(metric_lines))

Top K:	10
MAP:	0.110195
NDCG:	0.378840
Precision@K:	0.327176
Recall@K:	0.178576
