## Recommendation System using PySpark

In [24]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
import pyspark.sql.types as T
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Recommendation")
    .getOrCreate()
)

In [25]:
# Load the raw review records; the schema is inferred from the JSON file.
df = spark.read.json('../movies.json')
# Print the inferred column names and types.
df.printSchema()

root
 |-- helpfulness: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- profile_name: string (nullable = true)
 |-- review: string (nullable = true)
 |-- score: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)



In [26]:
# Keep only the columns used downstream: the two string ids, the review text and the rating.
df_clean1 = df.select('user_id', 'product_id', 'review', 'score')
df_clean1.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- review: string (nullable = true)
 |-- score: double (nullable = true)



In [27]:
def _hex_to_bucket(hex_digest):
    """Map a hexadecimal digest string to an integer in the range [0, 10**8)."""
    return int(hex_digest, 16) % 10**8


# ALS needs numeric user/item ids, so each string id is hashed with SHA-1
# (a 160-bit digest rendered as 40 hexadecimal characters) and reduced modulo 10**8.
# NOTE: the modulo can map distinct ids to the same integer; with many distinct ids
# some collisions are expected and will merge unrelated users/products.
hex_to_big = F.udf(_hex_to_bucket, T.LongType())
df_clean2 = (
    df_clean1
    .withColumn("user_id_int", hex_to_big(F.sha1('user_id')))
    .withColumn("product_id_int", hex_to_big(F.sha1('product_id')))
)
df_clean2.show(20)
# Fixed seed so the train/test split (and every metric computed from it) is repeatable.
train, test = df_clean2.randomSplit([0.8, 0.2], seed=42)


+--------------+----------+--------------------+-----+-----------+--------------+
|       user_id|product_id|              review|score|user_id_int|product_id_int|
+--------------+----------+--------------------+-----+-----------+--------------+
|A141HP4LYPWMSR|B003AI2VGA|Synopsis: On the ...|  3.0|    5460385|      51259877|
|A328S9RN3U5M68|B003AI2VGA|THE VIRGIN OF JUA...|  3.0|   64843361|      51259877|
|A1I7QGUDP043DG|B003AI2VGA|The scenes in thi...|  5.0|    1480848|      51259877|
|A1M5405JH9THP9|B003AI2VGA|THE VIRGIN OF JUA...|  3.0|   81925650|      51259877|
| ATXL536YX71TR|B003AI2VGA|Informationally, ...|  3.0|   40460580|      51259877|
|A3QYDL5CDNYN66|B003AI2VGA|The murders in Ju...|  2.0|   81231482|      51259877|
| AQJVNDW6YZFQS|B003AI2VGA|Mexican men are m...|  1.0|   70150525|      51259877|
| AD4CDZK7D31XP|B00006HAXW|Over the past few...|  5.0|   97008113|      40695676|
|A3Q4S5DFVPB70D|B00006HAXW|I recvd this vide...|  5.0|   26641678|      40695676|
|A2P7UB02HAVEPB|

In [28]:

# Base ALS estimator. regParam and maxIter are starting values only; the parameter
# grid passed to CrossValidator overrides them (and rank) for each candidate model.
# coldStartStrategy='drop' discards rows whose prediction is NaN (user or item not
# seen during fitting) so that the evaluation metric is not NaN.
# A fixed seed makes the random factor initialisation, and therefore the fit, repeatable.
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol='user_id_int',
    itemCol='product_id_int',
    ratingCol='score',
    coldStartStrategy='drop',
    seed=42,
)


In [29]:
# Hyper-parameter grid for the search: 2 ranks x 2 regularisation strengths
# x 2 iteration counts = 8 candidate ALS configurations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 20])
    .addGrid(als.regParam, [0.01, 0.1])
    .addGrid(als.maxIter, [5, 10])
    .build()
)


In [30]:
# Define the evaluator in this cell so the notebook runs top-to-bottom on a fresh
# kernel: CrossValidator needs it at construction time. RMSE between the true
# 'score' and the model's 'prediction' is the metric used to pick the best model.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='score',
    predictionCol='prediction',
)

# 3-fold cross-validation over every parameter combination in paramGrid.
# The seed fixes the fold assignment so repeated runs select the same model.
crossval = CrossValidator(
    estimator=als,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)


In [31]:
# Run the cross-validated grid search on the training split.
model = crossval.fit(train)

# NOTE(review): to score a single ad-hoc (user, product) pair, build a one-row
# DataFrame with 'user_id_int' and 'product_id_int' columns and pass it to
# model.transform -- not exercised in this notebook.

# Predict ratings for the held-out split and preview the first 100 rows.
predict = model.transform(test)
predict.show(100)

+--------------+----------+--------------------+-----+-----------+--------------+----------+
|       user_id|product_id|              review|score|user_id_int|product_id_int|prediction|
+--------------+----------+--------------------+-----+-----------+--------------+----------+
|A3EOYN87MMNHL1|0790747324|I first viewed th...|  5.0|   16261090|      62577830|0.40065345|
|A19ZXK9HHVRV1X|B00022VM5I|"The Human Stain"...|  3.0|   37666668|      43077444| 2.9135137|
|A2VXLLBBT7CL89|B0001G6PZC|I went to see thi...|  4.0|   19582301|      58302865| 2.7188632|
| ABLOQZIL42W7I|B0016OLXN2|Wow!  This is why...|  5.0|    4272300|      86369239| 1.1787786|
|A2EWC48FRNO3YP|B0001G6PZC|We are huge fans ...|  5.0|   88408752|      58302865|0.94796866|
|A3LYHNFYXYS1RP|B0002V7TJM|For my taste this...|  5.0|   27867313|      61205283| 4.9212995|
|A1FNES0QEBJZD1|B0001G6PZC|I have loved this...|  4.0|   71882095|      58302865|   3.66851|
|A2N7N3OPJSAJJD|B002OHDRF2|T2 is by far one ...|  1.0|   33787831|    

In [32]:

# Compare the true 'score' with the model's 'prediction' using root-mean-square error.
evaluator = RegressionEvaluator(
    labelCol='score',
    predictionCol='prediction',
    metricName='rmse',
)

# RMSE over the held-out predictions.
rmse = evaluator.evaluate(predict)
print("Root mean Square Error Value:", rmse)

Root mean Square Error Value: 1.8867533073905294
