In [1]:
import findspark
findspark.init()


In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('Recommendation_Beauty').getOrCreate()

In [4]:
data = spark.read.json("Beauty_5.json")

In [5]:
data.show(5,truncate=True)

+----------+-------+-------+--------------------+-----------+--------------+------------+--------------------+--------------+
|      asin|helpful|overall|          reviewText| reviewTime|    reviewerID|reviewerName|             summary|unixReviewTime|
+----------+-------+-------+--------------------+-----------+--------------+------------+--------------------+--------------+
|7806397051| [3, 4]|    1.0|Very oily and cre...|01 30, 2014|A1YJEY40YUW4SE|      Andrea|Don't waste your ...|    1391040000|
|7806397051| [1, 1]|    3.0|This palette was ...|04 18, 2014| A60XNB876KYML|  Jessica H.|         OK Palette!|    1397779200|
|7806397051| [0, 1]|    4.0|The texture of th...| 09 6, 2013|A3G6XNM240RMWA|       Karen|       great quality|    1378425600|
|7806397051| [2, 2]|    2.0|I really can't te...| 12 8, 2013|A1PQFP6SAJ6D80|       Norah|Do not work on my...|    1386460800|
|7806397051| [0, 0]|    3.0|It was a little s...|10 19, 2013|A38FVHZTNQ271F|   Nova Amor|          It's okay.|    1382

In [6]:
data_sub = data.select(['asin', 'overall', 'reviewerID'])

In [7]:
data_sub.count()

198502

In [8]:
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import isnan, when, count, col

In [9]:
data_sub.show(5, truncate=True)

+----------+-------+--------------+
|      asin|overall|    reviewerID|
+----------+-------+--------------+
|7806397051|    1.0|A1YJEY40YUW4SE|
|7806397051|    3.0| A60XNB876KYML|
|7806397051|    4.0|A3G6XNM240RMWA|
|7806397051|    2.0|A1PQFP6SAJ6D80|
|7806397051|    3.0|A38FVHZTNQ271F|
+----------+-------+--------------+
only showing top 5 rows



In [10]:
data_sub.select([count(when(col(c).isNull(), c)).alias(c) for c in
                 data_sub.columns]).toPandas().T

Unnamed: 0,0
asin,0
overall,0
reviewerID,0


In [11]:
# Distinct users and movies
users = data_sub.select("reviewerID").distinct().count()
products = data_sub.select("asin").distinct().count()
numerator = data_sub.count()

In [12]:
display(numerator, users, products)

198502

22363

12101

In [13]:
# Number of ratings matrix could contain if no empty cells
denominator = users * products
denominator

270614663

In [14]:
#Calculating sparsity
sparsity = 1 - (numerator*1.0 / denominator)
print ("Sparsity: "), sparsity

Sparsity: 


(None, 0.9992664772935825)

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [16]:
# Converting String to index
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [17]:
# Create an indexer
indexer = StringIndexer(inputCol='asin', 
 outputCol='asin_idx')
# Indexer identifies categories in the data
indexer_model = indexer.fit(data_sub)
# Indexer creates a new column with numeric index values
data_indexed = indexer_model.transform(data_sub)
# Repeat the process for the other categorical feature
indexer1 = StringIndexer(inputCol='reviewerID', 
 outputCol='reviewerID_idx')
indexer1_model = indexer1.fit(data_indexed)
data_indexed = indexer1_model.transform(data_indexed)

In [18]:
data_indexed.show(5, truncate=True)

+----------+-------+--------------+--------+--------------+
|      asin|overall|    reviewerID|asin_idx|reviewerID_idx|
+----------+-------+--------------+--------+--------------+
|7806397051|    1.0|A1YJEY40YUW4SE|  6194.0|       16983.0|
|7806397051|    3.0| A60XNB876KYML|  6194.0|       10399.0|
|7806397051|    4.0|A3G6XNM240RMWA|  6194.0|        5985.0|
|7806397051|    2.0|A1PQFP6SAJ6D80|  6194.0|       11765.0|
|7806397051|    3.0|A38FVHZTNQ271F|  6194.0|        5910.0|
+----------+-------+--------------+--------+--------------+
only showing top 5 rows



In [19]:
data_indexed.select([count(when(col(c).isNull(), c)).alias(c) for c in
                     data_indexed.columns]).toPandas().T


Unnamed: 0,0
asin,0
overall,0
reviewerID,0
asin_idx,0
reviewerID_idx,0


In [20]:
# Smaller dataset so we will use 0.8 / 0.2
(training, test) = data_indexed.randomSplit([0.8, 0.2])

In [21]:
# Creating ALS model and fitting data
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [22]:
als = ALS(maxIter=5, 
 regParam=0.09, 
 rank = 25,
 userCol="reviewerID_idx", 
 itemCol="asin_idx", 
 ratingCol="overall", 
 coldStartStrategy="drop",
 nonnegative=True)
model = als.fit(training)

In [23]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)

In [24]:
predictions.select(["asin_idx", "reviewerID_idx", 
                    "overall", "prediction"]).show(5)

+--------+--------------+-------+----------+
|asin_idx|reviewerID_idx|overall|prediction|
+--------+--------------+-------+----------+
|   993.0|        7880.0|    5.0|   3.48921|
|  3992.0|        1829.0|    5.0| 2.1406472|
|  4896.0|        8086.0|    5.0| 4.7142878|
|  2929.0|       16503.0|    1.0|  3.712359|
|  2136.0|       13623.0|    5.0| 3.4046779|
+--------+--------------+-------+----------+
only showing top 5 rows



In [25]:
evaluator = RegressionEvaluator(metricName="rmse", 
 labelCol="overall",
 predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.3473034170704994


In [26]:
# On average, this model is ~ 1.35 from perfect recommendations.

### Providing Recommendations: for all users


In [27]:
# get 20 recommendations which have highest rating.
user_recs = model.recommendForAllUsers(20) 

In [28]:
for user in user_recs.head(5):
    print(user)
    print("\n")

Row(reviewerID_idx=28, recommendations=[Row(asin_idx=7372, rating=6.619761943817139), Row(asin_idx=11623, rating=6.41925048828125), Row(asin_idx=7721, rating=6.409900665283203), Row(asin_idx=4508, rating=6.317165374755859), Row(asin_idx=8940, rating=6.299079895019531), Row(asin_idx=5009, rating=6.297138690948486), Row(asin_idx=10394, rating=6.292103290557861), Row(asin_idx=11205, rating=6.254537105560303), Row(asin_idx=5115, rating=6.230419635772705), Row(asin_idx=8701, rating=6.213785648345947), Row(asin_idx=5972, rating=6.210915565490723), Row(asin_idx=10457, rating=6.19164514541626), Row(asin_idx=7099, rating=6.182777404785156), Row(asin_idx=11044, rating=6.175568103790283), Row(asin_idx=11525, rating=6.173839092254639), Row(asin_idx=10540, rating=6.166039943695068), Row(asin_idx=11653, rating=6.1565985679626465), Row(asin_idx=4771, rating=6.154527187347412), Row(asin_idx=10838, rating=6.141219615936279), Row(asin_idx=9199, rating=6.138522148132324)])


Row(reviewerID_idx=31, recomm

### Converting back to string form


In [29]:
import pandas as pd
recs=model.recommendForAllUsers(10).toPandas()
nrecs=recs.recommendations.apply(pd.Series) \
 .merge(recs, right_index = True, left_index = True) \
 .drop(["recommendations"], axis = 1) \
 .melt(id_vars = ['reviewerID_idx'], value_name = "recommendation") \
 .drop("variable", axis = 1) \
 .dropna() 
nrecs=nrecs.sort_values('reviewerID_idx')
nrecs=pd.concat([nrecs['recommendation']\
 .apply(pd.Series), nrecs['reviewerID_idx']], axis = 1)
nrecs.columns = [ 
 'ProductID_index',
 'Rating',
 'UserID_index' 
 ]

In [30]:
md=data_indexed.select(['reviewerID', 'reviewerID_idx', 'asin', 'asin_idx'])
md=md.toPandas()
dict1 =dict(zip(md['reviewerID_idx'],md['reviewerID']))
dict2=dict(zip(md['asin_idx'],md['asin']))
nrecs['reviewerID']=nrecs['UserID_index'].map(dict1)
nrecs['asin']=nrecs['ProductID_index'].map(dict2)
nrecs=nrecs.sort_values('reviewerID')
nrecs.reset_index(drop=True, inplace=True)
new=nrecs[['reviewerID','asin','Rating']]
new['recommendations'] = list(zip(new.asin, new.Rating))
res=new[['reviewerID','recommendations']] 
res_new=res['recommendations'].groupby([res.reviewerID]).apply(list).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['recommendations'] = list(zip(new.asin, new.Rating))


In [31]:
res_new

Unnamed: 0,reviewerID,recommendations
0,A00414041RD0BXM6WK0GX,"[(B000VOHH56, 4.343129634857178), (B000QAVLT8,..."
1,A00473363TJ8YSZ3YAGG9,"[(B001AL2CNG, 3.9331510066986084), (B00G0EJYFW..."
2,A00700212KB3K0MVESPIY,"[(B000XE961C, 6.301753520965576), (B001H3JQ0E,..."
3,A0078719IR14X3NNUG0F,"[(B000TD2QXC, 8.510390281677246), (B000P7X1MC,..."
4,A01198201H0E3GHV2Z17I,"[(B006L6U0JI, 5.881620407104492), (B0009MMK5M,..."
...,...,...
22355,AZZNK89PXD006,"[(B0012NQBQG, 3.8481853008270264), (B00384TLQE..."
22356,AZZQXL8VDCFTV,"[(B000OY3WNO, 5.566639423370361), (B001F51TS6,..."
22357,AZZT1ERHBSNQ8,"[(B000PKZFTQ, 6.2380266189575195), (B001CJJ5QS..."
22358,AZZU6NXB8YJN9,"[(B000QAVLT8, 5.407864093780518), (B000P7X1MC,..."
