In [2]:
# set the environment path to find Recommenders
import sys
sys.path.append("..")

from utilities.recommender.sar import sar_pyspark
from pyspark.sql import SparkSession
from utilities.common.file_utils import maybe_download
import pyspark.sql.functions as F
import logging
import numpy as np
import pandas as pd

# Environment note: on Windows, pySpark 2.3.1 needs JDK 1.8.

# `display` and `HTML` belong to the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Stretch the notebook's `.container` element to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Local Spark session on all available cores (`local[*]`).
# `spark.sql.shuffle.partitions` is set to 1 — presumably because the sample
# data in this notebook is tiny; raise it for real workloads.
spark = (
    SparkSession.builder
    .appName("Sample1")
    .master("local[*]")
    .config("spark.sql.shuffle.partitions", "1")
    .getOrCreate()
)

# Last expression of the cell: renders the session summary.
spark

In [5]:
%%time

# Column-name mapping that is splatted into the SAR model constructor below.
# NOTE(review): "Timestamp" is not a column of the sample frames built in this
# cell — confirm the model ignores `col_timestamp` when timedecay_formula=False.
header = {
    'col_user': "customerID",
    'col_item': "itemID",
    'col_rating': "rating",
    'col_timestamp': "Timestamp"
}

# Hand-written training interactions with alphanumeric user/item IDs.
d_alnum_train = {
    'customerID': ['ua', 'ua', 'ua', 'ub', 'ub', 'uc', 'uc'],
    'itemID':     ['ia', 'ib', 'ic', 'id', 'ie', 'if', 'ia'],
    'rating':     [5, 5, 5, 1, 1, 3, 5]
}
pdf_train = pd.DataFrame(d_alnum_train)

# Hand-written test interactions for the same three users.
d_alnum_test = {
    'customerID': ['ua', 'ua', 'ub', 'ub', 'uc', 'uc'],
    'itemID':     ['id', 'ie', 'ia', 'ie', 'if', 'ia'],
    'rating':     [1, 1, 5, 5, 5, 5]
}
pdf_test = pd.DataFrame(d_alnum_test)

# Convert to Spark DataFrames and tag each row with a literal "type" column:
# 1 for the training frame, 0 for the test frame.
# (`F` is pyspark.sql.functions, imported once in the notebook's import cell.)
df = spark.createDataFrame(pdf_train).withColumn("type", F.lit(1))
df_test = spark.createDataFrame(pdf_test).withColumn("type", F.lit(0))

# Build the reference SAR model with Jaccard similarity; the column names come
# from `header`.
model = sar_pyspark.SARpySparkReference(
    spark,
    remove_seen=True,
    similarity_type='jaccard',
    time_decay_coefficient=30,
    timedecay_formula=False,
    **header
)

# Fit on the training frame and print the model's `scores` table.
model.fit(df)
model.scores.show()

# Ask for recommendations against the test frame; the second argument (2) is
# presumably the number of items per user — verify against the model's signature.
top_k_items = model.recommend_k_items(df_test, 2)
top_k_items.show()

+------+------+-----+
|userID|itemID|score|
+------+------+-----+
|    ua|    ia| 10.0|
|    ua|    ib| 12.5|
|    ua|    ic| 12.5|
|    ua|    if|  2.5|
|    ub|    id|  2.0|
|    ub|    ie|  2.0|
|    uc|    ia|  6.5|
|    uc|    if|  5.5|
|    uc|    ib|  2.5|
|    uc|    ic|  2.5|
+------+------+-----+

+------+------+-----+
|userID|itemID|score|
+------+------+-----+
|    ua|    ib| 12.5|
|    ua|    ic| 12.5|
|    ub|    id|  2.0|
|    uc|    ib|  2.5|
|    uc|    ic|  2.5|
+------+------+-----+

Wall time: 12.6 s


In [2]:
# Dump every row of the `top_scores_full` view.
# NOTE(review): no visible cell registers `top_scores_full`; presumably the SAR
# model creates it as a temp view during fit/recommend — verify, and run the
# model cell first on a fresh kernel.
top_scores_full_df = spark.sql("SELECT * FROM top_scores_full")
top_scores_full_df.show()

+------+------+-----+
|userID|itemID|score|
+------+------+-----+
|    ua|    ic| 12.5|
|    ua|    if|  2.5|
|    ua|    ib| 12.5|
|    ua|    ia| 10.0|
|    uc|    ic|  2.5|
|    uc|    ib|  2.5|
|    uc|    ia|  6.5|
|    uc|    if|  5.5|
|    ub|    ie|  2.0|
|    ub|    id|  2.0|
+------+------+-----+



In [4]:
# Left-outer-join every scored (user, item) row against `df_test` on both the
# user and the item. All rows of `top_scores_full` are kept; `existingItemID`
# is the matching test itemID, or null when the pair has no row in `df_test`.
# NOTE(review): both `top_scores_full` and `df_test` are referenced as SQL
# views that no visible cell registers — presumably the SAR model creates them;
# verify before relying on this cell from a fresh kernel.
scores_vs_test_query = """ 
                SELECT ts.*, df_test.itemID existingItemID
                FROM top_scores_full ts LEFT OUTER JOIN df_test
                    ON ts.userID = df_test.customerID AND ts.itemID = df_test.itemID
"""
scores_vs_test = spark.sql(scores_vs_test_query)
scores_vs_test.show()

+------+------+-----+--------------+
|userID|itemID|score|existingItemID|
+------+------+-----+--------------+
|    uc|    ic|  2.5|          null|
|    uc|    ib|  2.5|          null|
|    ua|    ic| 12.5|          null|
|    ub|    ie|  2.0|            ie|
|    ua|    if|  2.5|          null|
|    ua|    ib| 12.5|          null|
|    ub|    id|  2.0|          null|
|    uc|    ia|  6.5|            ia|
|    ua|    ia| 10.0|          null|
|    uc|    if|  5.5|            if|
+------+------+-----+--------------+

