In [1]:
import os
import sys
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
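# The following two star-imports are needed in order to use avg (note: the
# pyspark.sql.functions star-import shadows Python built-ins such as sum, min, max, abs and round)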
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import lit
from pyspark.sql.functions import desc
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

In [2]:
# Initialize a spark session.
def init_spark():
    """Build the SparkSession used by the notebook, or reuse an existing one.

    Returns:
        The session returned by the builder's ``getOrCreate()``, configured with
        the application name and the config option set below.
    """
    builder = SparkSession.builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

In [3]:
spark = init_spark()  # Initialize the Spark session shared by the cells below

In [4]:
# Read the Yelp review dataset into a PySpark DataFrame.
# REVIEW_LIMIT caps the number of rows that are kept, for faster experimentation;
# set it to None to work on the full dataset.
# NOTE(review): limit() is applied without an ordering, so which rows are kept is
# up to Spark — confirm this is acceptable for the analysis.
path = 'data/yelp_academic_dataset_review.json'
REVIEW_LIMIT = 100000

df_review = spark.read.json(path)
if REVIEW_LIMIT is not None:
    df_review = df_review.limit(REVIEW_LIMIT)

In [5]:
# from pyspark.sql.functions import col,isnan, when, count
# df_review.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_review.columns]
#    ).show()

In [6]:
# ALS only needs the user id, the business id and the rating (stars),
# so keep just those three columns.
df_review = df_review.select('user_id', 'business_id', 'stars')

In [8]:
# Encode the string ids as numeric indices (<column>_index) for use as the ALS
# user/item columns.
#
# The id columns are listed explicitly instead of being derived from
# set(df_review.columns): set iteration order is not stable, and on a re-run of
# this cell the set would also contain the *_index columns created below.
id_columns = ['user_id', 'business_id']
index_columns = [column + "_index" for column in id_columns]

# Dropping a column that does not exist is a no-op, so this only has an effect
# when the cell is re-run and the index columns are already present.
df_review = df_review.drop(*index_columns)

# One fitted StringIndexer model per id column, in the order of id_columns.
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index").fit(df_review)
            for column in id_columns]

# The stages are already fitted, so the pipeline only chains their transforms.
pipeline = Pipeline(stages=indexers)
df_review = pipeline.fit(df_review).transform(df_review)

df_review.show()

+--------------------+--------------------+-----+-------------+-----------------+
|             user_id|         business_id|stars|user_id_index|business_id_index|
+--------------------+--------------------+-----+-------------+-----------------+
|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|      10043.0|           1224.0|
|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|       7020.0|            658.0|
|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|      21783.0|           1894.0|
|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...|  5.0|      50901.0|            562.0|
|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...|  4.0|       3085.0|           2701.0|
|eUta8W_HdHMXPzLBB...|04UD14gamNjLY0IDY...|  1.0|      56584.0|            364.0|
|r3zeYsv1XFBRA4dJp...|gmjsEdUsKpj9Xxu6p...|  5.0|      69889.0|            153.0|
|yfFzsLmaWF2d4Sr0U...|LHSTtnW3YHCeUkRDG...|  5.0|      77959.0|            835.0|
|wSTuiTk-sKNdcFypr...|B5XSoSG3SfvQGtKEG...|  3.0|      75577.0|           4904.0|
|59MxRhNVhU9MYnd

In [9]:
# Number of distinct users in the review sample.
unique_users = df_review.select('user_id').distinct()
unique_users.count()

79345

In [10]:
# Cache the prepared reviews before splitting: the DataFrame is lazy, so without
# the cache every model fit and evaluation on the splits would recompute it from
# the source file. Caching also helps keep both splits based on the same rows
# (the upstream limit() has no ordering, so a recomputation is not guaranteed to
# pick the same rows).
df_review.cache()

# 80% of the rows for training, 20% held out for evaluation; fixed seed for
# a repeatable split.
df_train, df_test = df_review.randomSplit([0.8, 0.2], seed=0)

In [11]:
# Evaluators for the two reported metrics; both compare the "stars" label
# against the "prediction" column produced by the ALS model.
eval_rmse = RegressionEvaluator(metricName="rmse", labelCol="stars", predictionCol="prediction")
eval_mae = RegressionEvaluator(metricName="mae", labelCol="stars", predictionCol="prediction")

# Hyper-parameter grid: every combination of the three lists is trained once.
maxIter = [8,10,13,17]
regParam = [0.2,0.3,0.4,0.5]
rank = [10,20,70]

# One (maxIter, regParam, rank, rmse, mae) tuple per trained configuration.
# NOTE(review): the grid is scored on df_test, so the best score found here is an
# optimistic estimate; a separate validation split would be needed for an unbiased one.
grid_results = []

for mi in maxIter:
    for rp in regParam:
        for r in rank:
            # coldStartStrategy="drop" removes predictions for users/businesses that
            # were not seen during training, so the metrics only cover the remaining rows.
            als = ALS(maxIter=mi, regParam=rp, rank=r, userCol="user_id_index",
                      itemCol="business_id_index",
                      ratingCol="stars",seed=0, nonnegative=True, coldStartStrategy="drop")
            # Fit the model on the training set.
            recom_model = als.fit(df_train)
            # Cache the predictions so the two evaluators below do not each
            # recompute the transform.
            prediction = recom_model.transform(df_test).cache()
            rmse = eval_rmse.evaluate(prediction)
            mae = eval_mae.evaluate(prediction)
            # Release the cached predictions before the next configuration.
            prediction.unpersist()
            grid_results.append((mi, rp, r, rmse, mae))
            print('max iter: %d  regParam: %.1f  rank: %d  rmse: %.2f  mae: %.2f'%(mi,rp,r,rmse,mae))

# Report the configuration with the lowest RMSE. sorted() is used instead of min()
# because `from pyspark.sql.functions import *` shadows the built-in min.
best_mi, best_rp, best_r, best_rmse, best_mae = sorted(grid_results, key=lambda result: result[3])[0]
print('best by rmse -> max iter: %d  regParam: %.1f  rank: %d  rmse: %.2f  mae: %.2f'
      % (best_mi, best_rp, best_r, best_rmse, best_mae))

max iter: 8  regParam: 0.2  rank: 10  rmse: 1.85  mae: 1.51
max iter: 8  regParam: 0.2  rank: 20  rmse: 1.85  mae: 1.53
max iter: 8  regParam: 0.2  rank: 70  rmse: 1.85  mae: 1.54
max iter: 8  regParam: 0.3  rank: 10  rmse: 1.80  mae: 1.46
max iter: 8  regParam: 0.3  rank: 20  rmse: 1.80  mae: 1.49
max iter: 8  regParam: 0.3  rank: 70  rmse: 1.82  mae: 1.50
max iter: 8  regParam: 0.4  rank: 10  rmse: 1.79  mae: 1.46
max iter: 8  regParam: 0.4  rank: 20  rmse: 1.80  mae: 1.48
max iter: 8  regParam: 0.4  rank: 70  rmse: 1.81  mae: 1.50
max iter: 8  regParam: 0.5  rank: 10  rmse: 1.80  mae: 1.48
max iter: 8  regParam: 0.5  rank: 20  rmse: 1.81  mae: 1.50
max iter: 8  regParam: 0.5  rank: 70  rmse: 1.82  mae: 1.52
max iter: 10  regParam: 0.2  rank: 10  rmse: 1.80  mae: 1.47
max iter: 10  regParam: 0.2  rank: 20  rmse: 1.80  mae: 1.48
max iter: 10  regParam: 0.2  rank: 70  rmse: 1.80  mae: 1.49
max iter: 10  regParam: 0.3  rank: 10  rmse: 1.76  mae: 1.43
max iter: 10  regParam: 0.3  rank: 2