In [1]:
# set the environment path to find Recommenders
import sys
sys.path.append("..")

from utilities.recommender.sar import sar_pyspark
from pyspark.sql import SparkSession
from utilities.common.file_utils import maybe_download
import pyspark.sql.functions as F
import logging
import numpy as np

# needs JDK 1.8 on Windows with pySpark 2.3.1

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

spark = SparkSession \
    .builder \
    .appName("MovieRatingsSARpySpark") \
    .master("local[*]") \
    .config("spark.sql.shuffle.partitions", "4") \
    .config("spark.driver.memory", "6g") \
    .config("spark.executor.memory", "6g") \
    .config("spark.memory.fraction", "0.9")\
    .config("spark.memory.stageFraction", "0.3")\
    .config("spark.executor.instances", 1) \
    .config("spark.executor.cores", "4")\
    .getOrCreate()
    
spark

In [2]:
data_location = 'movielens1m.csv'

maybe_download('http://aka.ms', data_location, data_location, verbose=True)

header = {
    'col_user': "UserId",
    'col_item': "MovieId",
    'col_rating': "Rating",
    'col_timestamp': "Timestamp"
}

# TODO: do we really need a logger for a notebook?
log = logging.getLogger("notebook")

# TODO: sar_pyspark.SARpySparkReference looks strange
model = sar_pyspark.SARpySparkReference(spark, remove_seen=True, similarity_type='jaccard', 
                            time_decay_coefficient=30, timedecay_formula=True, 
                            **header)

data = spark.read.csv(data_location, header=True)

print("Total rows: %d" % data.count())

data.limit(5).toPandas()

found movielens1m.csv at movielens1m.csv
Total rows: 227472


Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,1,68646,10,1381620027
1,1,113277,10,1379466669
2,2,454876,8,1394818630
3,2,790636,7,1389963947
4,2,816711,8,1379963769


In [3]:
# split into two spark dataframes for training and testing
train, test = data.randomSplit([0.8, 0.2], seed=12)

print("Train: %9d" % train.count())
print("Test:  %9d" % test.count())
# left outer join

test.createOrReplaceTempView("df_train")
test.createOrReplaceTempView("df_test")

query = """
SELECT df_test.*
FROM
    df_test JOIN
    (SELECT DISTINCT {col_user} FROM df_train) customers
    ON df_test.{col_user} = customers.{col_user}
CLUSTER BY {col_user}
""".format(col_user = header['col_user'])

test = spark.sql(query)
test.write.mode("overwrite").saveAsTable("df_test_input")
print("Test:  %9d (w/o cold users)" % test.count())

Train:    181769
Test:      45703
Test:      45703 (w/o cold users)


In [None]:
%%time

model._fit(train)

# print(model.scores.count())

# uncomment if you want to predict just the SAR scores for user-item pairs which are in the test
# set only
# predictions = model.predict(test_indexed)
top_k = model._recommend_k_items(test, top_k=10)

top_k.count()

INFO:utilities.recommender.sar.sar_pyspark:Collecting user affinity matrix...
INFO:utilities.recommender.sar.sar_pyspark:Calculating item cooccurrence...
INFO:utilities.recommender.sar.sar_pyspark:Calculating item similarity...
INFO:utilities.recommender.sar.sar_pyspark:Running query -- 
            SELECT i1, i2, value / (M1.margin + M2.margin - value) AS value
            FROM item_cooccurrence A 
                INNER JOIN item_marginal M1 ON A.i1 = M1.i 
                INNER JOIN item_marginal M2 ON A.i2 = M2.i
            
INFO:utilities.recommender.sar.sar_pyspark:Calculating recommendation scores...


Similarity size: 2479043
