## Recommendation System using Pyspark

In [20]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
import pyspark.sql.types as T
spark=SparkSession.builder.appName("Recommendation").getOrCreate()

In [6]:
df=spark.read.json('movies.json')
df.printSchema()

root
 |-- helpfulness: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- profile_name: string (nullable = true)
 |-- review: string (nullable = true)
 |-- score: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)



In [10]:
df_clean1=df.select(F.col('user_id'),F.col('product_id'),F.col('review'),F.col('score'))
df_clean1.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- review: string (nullable = true)
 |-- score: double (nullable = true)



In [33]:
hex_to_big=F.udf(lambda x: int(x,16)%10**8,T.LongType())#converts the hexadecimal to LongType
df_clean2=df_clean1.withColumn("user_id_int",hex_to_big(F.sha1('user_id'))) #sha converts alphanumeric to 40 bit hexadecimal
df_clean2=df_clean2.withColumn("product_id_int",hex_to_big(F.sha1('product_id')))
df_clean2.show(20)



+--------------+----------+--------------------+-----+-----------+--------------+
|       user_id|product_id|              review|score|user_id_int|product_id_int|
+--------------+----------+--------------------+-----+-----------+--------------+
|A141HP4LYPWMSR|B003AI2VGA|Synopsis: On the ...|  3.0|    5460385|      51259877|
|A328S9RN3U5M68|B003AI2VGA|THE VIRGIN OF JUA...|  3.0|   64843361|      51259877|
|A1I7QGUDP043DG|B003AI2VGA|The scenes in thi...|  5.0|    1480848|      51259877|
|A1M5405JH9THP9|B003AI2VGA|THE VIRGIN OF JUA...|  3.0|   81925650|      51259877|
| ATXL536YX71TR|B003AI2VGA|Informationally, ...|  3.0|   40460580|      51259877|
|A3QYDL5CDNYN66|B003AI2VGA|The murders in Ju...|  2.0|   81231482|      51259877|
| AQJVNDW6YZFQS|B003AI2VGA|Mexican men are m...|  1.0|   70150525|      51259877|
| AD4CDZK7D31XP|B00006HAXW|Over the past few...|  5.0|   97008113|      40695676|
|A3Q4S5DFVPB70D|B00006HAXW|I recvd this vide...|  5.0|   26641678|      40695676|
|A2P7UB02HAVEPB|

In [41]:

als=ALS( maxIter=10,regParam=0.1,userCol='user_id_int',itemCol='product_id_int',ratingCol='score')
model=als.fit(df_clean2)
#user_input=int(input('Enter user id for prediction'))
#product_input=int(input('Enter product id for prediction'))
#inputdf=spark.createDataFrame([(user_input,product_input)])
#inputdf.show()
predict=model.transform(df_clean2)
predict.show(100)

+--------------+----------+--------------------+-----+-----------+--------------+----------+
|       user_id|product_id|              review|score|user_id_int|product_id_int|prediction|
+--------------+----------+--------------------+-----+-----------+--------------+----------+
|A2IMLPUXYQJTSY|B00004CQT3|As owners of a Ye...|  5.0|   77910611|      88926809| 4.9416175|
| AOIT2QUH8GRGA|B0071AD95K|The whole Passion...|  5.0|   14385231|      77653941| 4.9412055|
|A2ON9S58W4AZ1Z|B00004CQT4|This is a great m...|  5.0|   10738950|      87356226| 4.9457555|
|A34KFDQ5KBHZA5|B00004CQT3|well,this just go...|  5.0|   97928400|      88926809| 4.9416175|
|A31YQQYX9PU04Z|B0071AD95K|Great encourageme...|  4.0|   32226354|      77653941|  3.952964|
|A29DLKCN8QWO7B|B000063W1R|This is one of my...|  5.0|   49381282|      87115994|  4.864229|
|A2M4H4R4Z7UDZX|B004BH1TN0|I purchased two D...|  5.0|   28985877|       1474096| 4.9314938|
| AD4CDZK7D31XP|B00006HAXW|Over the past few...|  5.0|   97008113|    

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator=RegressionEvaluator()

evaluator.setLabelCol('score').setPredictionCol('prediction').setMetricName('rmse')

rmse=evaluator.evaluate(predict)
print("Root mean Square Error Value:",rmse)

Root mean Square Error Value: 0.20145840066372264
