In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg
import hashlib
from pyspark.sql.functions import udf

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

from pyspark.sql.types import *
from pyspark.sql import Row

from pyspark.ml.feature import StringIndexer

In [2]:
conf = SparkConf().setAppName("test").setMaster("*")
spark = SparkSession.builder.getOrCreate()

In [4]:
def fix_ids(s):
    return int(int(hashlib.sha1(s).hexdigest(), 16) % 15487469)

fix_ids_udf = udf(fix_ids)

def fix_decimal_values(s):
    return round(s,2)

fix_round_udf = udf(fix_decimal_values)

def conv_to_int(v):
    return int(v)

to_int_udf = udf(conv_to_int)

In [5]:
df = spark.read.json('data/dataset/review.json')
rating_df = df.select(df['user_id'],df['stars'].cast('float'),df['business_id'],df['date'].cast('date') )

In [6]:
# indexer = StringIndexer(inputCol="user_id", outputCol="userId")
# indexed = indexer.fit(rating_df).transform(rating_df)
# rating_df = indexed.withColumn('userId',to_int_udf(indexed['userId']))
#rating_df.show()

In [7]:
# indexer = StringIndexer(inputCol="business_id", outputCol="businessId")
# indexed = indexer.fit(rating_df).transform(rating_df)
# rating_df = indexed.withColumn('businessId',to_int_udf(col('businessId')))
#rating_df.show()

In [8]:
# rating_df = rating_df.select('userId','businessId','stars','date')
# rating_df.show(5)

In [9]:
rating_df= rating_df.withColumn('userId',fix_ids_udf(rating_df['user_id']).cast('int'))
rating_df= rating_df.withColumn('businessId',fix_ids_udf(rating_df['business_id']).cast('int'))

In [10]:
(training,test,spill) = rating_df.randomSplit([0.20,0.02,.78])

In [11]:
gavg=training.groupBy().avg().collect()[0][0]
print(gavg)

3.72645666644


In [12]:
training.show(5)

+--------------------+-----+--------------------+----------+--------+----------+
|             user_id|stars|         business_id|      date|  userId|businessId|
+--------------------+-----+--------------------+----------+--------+----------+
|--GwB-sktmoAOPBsb...|  5.0|tMxzAzHSFJWjSu6CE...|2014-08-27| 7702973|   8901694|
|--JSMB52zXJr_LBlk...|  1.0|rNvY082kkM9paQvf3...|2017-06-14|11569471|   1747320|
|--NIc98RMssgy0mSZ...|  3.0|gZvwCOaMhxFXXNvy1...|2017-08-08|11611259|   8465188|
|--NIc98RMssgy0mSZ...|  4.0|tyAv9CRnAOjk-sWEo...|2016-08-04|11611259|   5741675|
|-0AyZxS5C--WySnbW...|  1.0|YtfQQHdcQ_bGl2V0A...|2014-09-28|  114039|   6414367|
+--------------------+-----+--------------------+----------+--------+----------+
only showing top 5 rows



In [15]:
df_user = training.groupby(col('userId')).avg().select(col('userId'),col('avg(stars)'))
df_user = df_user.select(col('userId'),df_user['avg(stars)'].alias('user-mean').cast('float'))
df_item = training.groupby(col('businessId')).avg().select(col('businessId'),col('avg(stars)'))
df_item = df_item.select(col('businessId'),df_item['avg(stars)'].alias('item-mean').cast('float'))

training = training.join(df_user,'userId')
training = training.join(df_item,'businessId')

training = training.withColumn('user-item-interaction',training.stars-(training['user-mean']+\
    training['item-mean']-gavg))

training = training.select(training.userId,training.businessId,training.stars,training['user-mean'],training['item-mean'],\
training['user-item-interaction'].cast('float'))
training = training.withColumn('user-mean',fix_round_udf(training['user-mean']))
training = training.withColumn('user-mean',fix_round_udf(training['item-mean']))
training = training.withColumn('user-mean',fix_round_udf(training['user-item-interaction']))
training.show(20)

+--------+----------+-----+---------+---------+---------------------+
|  userId|businessId|stars|user-mean|item-mean|user-item-interaction|
+--------+----------+-----+---------+---------+---------------------+
|85306473|    272391|  5.0|    -0.23|      5.0|          -0.23134898|
|43941930|    533542|  4.0|    -0.27|      4.0|          -0.27301595|
|58058439|    919571|  1.0|     0.06|      1.0|          0.060317054|
| 5959682|    919571|  1.0|     2.73|      1.0|             2.726984|
|50575816|    919571|  1.0|    -0.02|      1.0|         -0.023015961|
|17809023|    919571|  1.0|     0.73|      1.0|             0.726984|
|84355756|    919571|  1.0|     0.39|      1.0|           0.39365104|
|72852787|   1088310|  5.0|    -1.27|      5.0|            -1.273016|
|45421067|   1166132|  1.0|     2.73|      1.0|             2.726984|
|79393828|   1192456|  2.0|    -0.27|      2.0|          -0.27301595|
|89514075|   1210943|  4.0|     0.52|      4.0|            0.5235939|
|72529602|   1342122

In [20]:
test_user = test.groupby('userId').avg().select('userId','avg(stars)')
test_user = test_user.select('userId',test_user['avg(stars)'].alias('user-mean').cast('float'))
test_item = test.groupby('businessId').avg().select('businessId','avg(stars)')
test_item = test_item.select('businessId',test_item['avg(stars)'].alias('item-mean').cast('float'))
test_df = test.join(test_user,'userId')
df_test = test_df.join(test_item,'businessId')

df_test = df_test.withColumn('user-mean',fix_round_udf(df_test['user-mean']))
df_test = df_test.withColumn('user-mean',fix_round_udf(df_test['item-mean']))
df_test.show(20)

+----------+--------+--------------------+-----+--------------------+----------+---------+---------+
|businessId|  userId|             user_id|stars|         business_id|      date|user-mean|item-mean|
+----------+--------+--------------------+-----+--------------------+----------+---------+---------+
|  60200681|87631795|--NIc98RMssgy0mSZ...|  4.0|xidr6_d3fwKSb_XaB...|2017-08-08|     3.67|3.6666667|
|  54932015|63928832|--xdSgqUJmcvJot-3...|  4.0|64A4CTvJ2uRQVPkJ5...|2013-09-30|     2.89|2.8888888|
|  17399192|60851158|-04zuZ0tQoGpgG49P...|  5.0|KxlRX3ORVZ2R80icu...|2013-05-18|      5.0|      5.0|
|  59385589|87437728|-0Hbf-cgvSsu8749n...|  4.0|Ljknr0VF5Ia2DlTzE...|2016-05-14|     4.33|4.3333335|
|  35747616|98918747|-0SqALqeWmInVftG_...|  4.0|rMrymOj6RcBBddGuO...|2011-10-23|      3.8|      3.8|
|  26310381|89642291|-4BEUkLvHQntN6qPf...|  4.0|55X2pom73IhiP19UF...|2009-06-22|     3.71|3.7142856|
|  88094466|63833796|-57uOzAWlx__p6QlX...|  3.0|aGDo7GDN5YLvpYykp...|2010-12-03|      4.4| 

### rank=40

In [13]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="businessId", ratingCol='stars',
          coldStartStrategy="drop",rank=40)
model = als.fit(training)

In [14]:
predictions = model.transform(test)
predictions.show(5)

+--------------------+-----+--------------------+----------+--------+----------+----------+
|             user_id|stars|         business_id|      date|  userId|businessId|prediction|
+--------------------+-----+--------------------+----------+--------+----------+----------+
|Yej5B4nd8PqpHMQcm...|  2.0|mv_J_UsHqy9CgwWqq...|2007-10-30| 9722800|      1088|0.44882596|
|Isf8G6HPbNqEisKDj...|  5.0|dU-Nt1-LjV9mAgFOV...|2016-08-16| 8833276|    104880|-2.7993865|
|8FWtB83j6xAkDkqka...|  3.0|s-26LoWzHdmX2woBE...|2016-01-10| 2865671|    189488|0.46369928|
|rcTn6KPGYLsG8D8-_...|  5.0|5-DkRuxdG5EzmnOvf...|2015-03-10| 6695219|    310188|  -0.86885|
|kmE8w5Y785eZmodsx...|  3.0|ml9WFLPNAIryG6zQn...|2008-09-02|14200281|    411771|   5.15891|
+--------------------+-----+--------------------+----------+--------+----------+----------+
only showing top 5 rows



In [15]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="stars",predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

In [17]:
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 3.73802926753


### rank = 30

In [20]:
als_30 = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="businessId", ratingCol='stars',
          coldStartStrategy="drop",rank=30)
model_30 = als_30.fit(training)

In [21]:
predictions_30 = model_30.transform(test)
evaluator_30 = RegressionEvaluator(metricName="rmse", labelCol="stars",predictionCol="prediction")
rmse_30 = evaluator_30.evaluate(predictions_30)
print("Root-mean-square error = " + str(rmse_30))

Root-mean-square error = 3.90351821367


### rank = 50

In [22]:
als_50 = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="businessId", ratingCol='stars',
          coldStartStrategy="drop",rank=50)
model_50 = als_50.fit(training)

In [23]:
predictions_50 = model_50.transform(test)
evaluator_50 = RegressionEvaluator(metricName="rmse", labelCol="stars",predictionCol="prediction")
rmse_50 = evaluator_50.evaluate(predictions_50)
print("Root-mean-square error = " + str(rmse_50))

Root-mean-square error = 3.62734765498


### rank = 60

In [24]:
als_60 = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="businessId", ratingCol='stars',
          coldStartStrategy="drop",rank=60)
model_60 = als_60.fit(training)

In [25]:
predictions_60 = model_60.transform(test)
evaluator_60 = RegressionEvaluator(metricName="rmse", labelCol="stars",predictionCol="prediction")
rmse_60 = evaluator_60.evaluate(predictions_60)
print("Root-mean-square error = " + str(rmse_60))

Root-mean-square error = 3.51783058839


### rank = 70

In [26]:
als_70 = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="businessId", ratingCol='stars',
          coldStartStrategy="drop",rank=70)
model_70 = als_70.fit(training)

In [27]:
predictions_70 = model_70.transform(test)
evaluator_70 = RegressionEvaluator(metricName="rmse", labelCol="stars",predictionCol="prediction")
rmse_70 = evaluator_70.evaluate(predictions_70)
print("Root-mean-square error = " + str(rmse_70))

Root-mean-square error = 3.48522977122


### rank = 80

In [28]:
als_80 = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="businessId", ratingCol='stars',
          coldStartStrategy="drop",rank=80)
model_80 = als_80.fit(training)

In [29]:
predictions_80 = model_80.transform(test)
evaluator_80 = RegressionEvaluator(metricName="rmse", labelCol="stars",predictionCol="prediction")
rmse_80 = evaluator_80.evaluate(predictions_80)
print("Root-mean-square error = " + str(rmse_80))

Root-mean-square error = 3.43358260054


### rank = 90

In [30]:
als_90 = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="businessId", ratingCol='stars',
          coldStartStrategy="drop",rank=90)
model_90 = als_90.fit(training)

In [31]:
predictions_90 = model_90.transform(test)
evaluator_90 = RegressionEvaluator(metricName="rmse", labelCol="stars",predictionCol="prediction")
rmse_90 = evaluator_90.evaluate(predictions_90)
print("Root-mean-square error = " + str(rmse_90))

Root-mean-square error = 3.44112323274
