In [10]:
import pyspark
from pyspark import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, hash
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

In [11]:
# Build the session exactly once, with every option set BEFORE getOrCreate().
# Creating a bare session first would make the configured builder below return
# that existing session, and options that are only read when the JVM /
# SparkContext starts (driver/executor memory, default parallelism, max result
# size) would never be applied.
spark = SparkSession.builder\
    .appName('collaborative filtering')\
    .config('spark.driver.memory', '16g')\
    .config('spark.executor.memory','16g')\
    .config('spark.driver.maxResultSize','0')\
    .config('spark.sql.autoBroadcastJoinThreshold','-1')\
    .config('spark.sql.broadcastTimeout','1200')\
    .config('spark.default.parallelism','8')\
    .getOrCreate()

########## create sql context ##########
# `spark_conf` and `spark_context` are not used in this cell; they are kept
# only so that any other cell referring to these names keeps working.
spark_conf = pyspark.SparkConf()
spark_context = pyspark.SparkContext
# Reuse the context owned by the session above instead of asking for
# "whatever context exists", so `sc`, `sqlContext` and `spark` are guaranteed
# to share the same configuration.
sc = spark.sparkContext
sqlContext = SQLContext(sc)


In [None]:
# Folder holding the Yelp JSON dumps. It is concatenated directly with the
# file name, so a non-empty value must end with a path separator.
data_dir = ""

# Load the business table and keep only what the recommender needs:
# the string id, a display name, and an integer surrogate id for ALS.
# NOTE(review): hash() yields a 32-bit integer, so two different business_id
# strings can map to the same business_index -- confirm this is acceptable.
business_path = data_dir + "yelp_academic_dataset_business.json"
business_raw = sqlContext.read.json(business_path)
business = business_raw.select(
    "business_id",
    "name",
    hash("business_id").alias("business_index"),
)
business.take(5)

In [13]:
# Load the reviews and derive an integer surrogate id for each user.
# NOTE(review): hash() yields a 32-bit integer, so two different user_id
# strings can map to the same user_index -- confirm this is acceptable.
review_path = data_dir + "yelp_academic_dataset_review.json"
reviews = sqlContext.read.json(review_path).select(
    "user_id",
    "business_id",
    "stars",
    hash("user_id").alias("user_index"),
)

# Attach the business name and business_index to every review. A left join
# keeps reviews even when their business_id has no match in `business`.
rating = reviews.join(business, on=["business_id"], how="left")
rating.take(5)

[Row(business_id='-36nnCT71XE0InJXKBIpuw', user_id='5hjAPltdXiHM3Ng4iw94_Q', stars=1.0, user_index=-1768349507, name='Safeway', business_index=-1364680400),
 Row(business_id='-36nnCT71XE0InJXKBIpuw', user_id='WiVyzMjKiOOyF2kK1_pqjw', stars=5.0, user_index=-175602607, name='Safeway', business_index=-1364680400),
 Row(business_id='-36nnCT71XE0InJXKBIpuw', user_id='tU52QQnMGbJA8o89zYAw2w', stars=1.0, user_index=-538754549, name='Safeway', business_index=-1364680400),
 Row(business_id='-36nnCT71XE0InJXKBIpuw', user_id='Xg7RVVk9zNXTXyvCZB5olA', stars=1.0, user_index=-1633983391, name='Safeway', business_index=-1364680400),
 Row(business_id='-36nnCT71XE0InJXKBIpuw', user_id='wKBjsz3O0DYzXIxr8UQ5dQ', stars=2.0, user_index=-649294825, name='Safeway', business_index=-1364680400)]

In [14]:
def get_sparsity(rating):
    """Print how sparse the user x business rating matrix is, as a percentage.

    Sparsity is ``(1 - observed_ratings / (distinct_users * distinct_businesses)) * 100``.

    Parameters
    ----------
    rating : DataFrame-like
        Must expose ``select(...)``, ``distinct()`` and ``count()`` and contain
        the columns ``stars``, ``user_id`` and ``business_id``.

    Returns
    -------
    None
        The result is printed, not returned. If there are no users or no
        businesses (e.g. an empty DataFrame) a notice is printed instead of
        raising ``ZeroDivisionError``.
    """
    # Number of observed ratings: one per row of the DataFrame.
    count_nonzero = rating.select("stars").count()

    # Size of the full user x business matrix.
    n_users = rating.select("user_id").distinct().count()
    n_businesses = rating.select("business_id").distinct().count()
    total_elements = n_users * n_businesses

    # An empty matrix has no meaningful sparsity; avoid dividing by zero.
    if total_elements == 0:
        print("The rating dataframe is empty; sparsity is undefined.")
        return

    # Fraction of matrix cells with no rating, expressed as a percentage.
    sparsity = (1.0 - (count_nonzero * 1.0) / total_elements) * 100
    print("The rating dataframe is", "%.6f" % sparsity + "% sparse.")
# Print the sparsity of the joined `rating` DataFrame.
get_sparsity(rating)

The rating dataframe is 99.997544% sparse.


In [None]:
# One seed for the train/test split, the ALS factor initialisation and the
# cross-validation fold assignment, so a fresh "Restart & Run All" reproduces
# the same models and metrics.
seed = 2020

# Hold out 20% of the ratings for the final RMSE estimate.
(train, test) = rating.randomSplit([0.8, 0.2], seed=seed)

# Create ALS model
als = ALS(
    userCol="user_index",
    itemCol="business_index",
    ratingCol="stars",
    nonnegative=True,
    implicitPrefs=False,
    # Drop rows whose user or item was unseen during fitting; otherwise their
    # NaN predictions would turn the RMSE into NaN.
    coldStartStrategy="drop",
    seed=seed,
)

# 4 ranks x 4 regularisation strengths = 16 candidate models.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 50, 100, 150])
    .addGrid(als.regParam, [.01, .05, .1, .15])
    .build()
)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="stars",
    predictionCol="prediction",
)
print("Num models to be tested: ", len(param_grid))

cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    seed=seed,
)
# Fit cross validator to the 'train' dataset
model = cv.fit(train)
# Extract best model from the cv model above and persist it
best_model = model.bestModel
best_model.write().overwrite().save("cf_model")

# Score the held-out test set
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

# Recover the winning grid entry through the public API: avgMetrics[i] is the
# cross-validated metric of param_grid[i], and the best entry is the smallest
# (or largest, depending on the evaluator) average.
avg_metrics = model.avgMetrics
pick_best = max if evaluator.isLargerBetter() else min
best_index = pick_best(range(len(avg_metrics)), key=avg_metrics.__getitem__)
best_params = param_grid[best_index]

print("**Best Model**")
# Print "Rank"
print("  Rank:", best_model.rank)
# Print "MaxIter" (not part of the grid, so every candidate used the estimator's value)
print("  MaxIter:", als.getMaxIter())
# Print "RegParam"
print("  RegParam:", best_params[als.regParam])

# Top-10 businesses per user. Each row holds `user_index` plus a
# `recommendations` array of (business_index, rating) structs.
nrecommendations = best_model.recommendForAllUsers(10)

nrecommendations.limit(10).show()

Num models to be tested:  16
