In [1]:
import findspark
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

In [4]:
sc =SparkContext()

In [5]:
spark = SparkSession(sc)

In [10]:
# Use Spark to read in the Ecommerce Customers csv file.
data = spark.read.json("Du lieu cung cap/reviews_Toys_and_Games_5.json.gz")

In [12]:
data.show(5,truncate=True)

+----------+-------+-------+--------------------+-----------+--------------+--------------+--------------------+--------------+
|      asin|helpful|overall|          reviewText| reviewTime|    reviewerID|  reviewerName|             summary|unixReviewTime|
+----------+-------+-------+--------------------+-----------+--------------+--------------+--------------------+--------------+
|0439893577| [0, 0]|    5.0|I like the item p...|01 29, 2014|A1VXOAVRGKGEAK|         Angie|      Magnetic board|    1390953600|
|0439893577| [1, 1]|    4.0|Love the magnet e...|03 28, 2014| A8R62G708TSCM|       Candace|it works pretty g...|    1395964800|
|0439893577| [1, 1]|    5.0|Both sides are ma...|01 28, 2013|A21KH420DK0ICA|capemaychristy|          love this!|    1359331200|
|0439893577| [0, 0]|    5.0|Bought one a few ...| 02 8, 2014| AR29QK6HPFYZ4|          dcrm|   Daughters love it|    1391817600|
|0439893577| [1, 1]|    4.0|I have a stainles...| 05 5, 2014| ACCH8EOML6FN5|          DoyZ|Great to have

In [13]:
data_sub = data.select(['asin', 'overall', 'reviewerID'])

In [14]:
data_sub.count()

167597

In [15]:
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import isnan, when, count, col

In [16]:
data_sub.show(5, truncate=True)

+----------+-------+--------------+
|      asin|overall|    reviewerID|
+----------+-------+--------------+
|0439893577|    5.0|A1VXOAVRGKGEAK|
|0439893577|    4.0| A8R62G708TSCM|
|0439893577|    5.0|A21KH420DK0ICA|
|0439893577|    5.0| AR29QK6HPFYZ4|
|0439893577|    4.0| ACCH8EOML6FN5|
+----------+-------+--------------+
only showing top 5 rows



In [17]:
data_sub.select([count(when(col(c).isNull(), c)).alias(c) for c in
                 data_sub.columns]).toPandas().T

Unnamed: 0,0
asin,0
overall,0
reviewerID,0


In [18]:
# Distinct users and movies
users = data_sub.select("reviewerID").distinct().count()
products = data_sub.select("asin").distinct().count()
numerator = data_sub.count()

In [19]:
display(numerator, users, products)

167597

19412

11924

In [20]:
# Number of ratings matrix could contain if no empty cells
denominator = users * products
denominator

231468688

In [21]:
#Calculating sparsity
sparsity = 1 - (numerator*1.0 / denominator)
print ("Sparsity: "), sparsity

Sparsity: 


(None, 0.9992759409428199)

In [22]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [23]:
# Converting String to index
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col


In [27]:
# Create an indexer
indexer = StringIndexer(inputCol='asin',  outputCol='asin_idx')
# Indexer identifies categories in the data
indexer_model = indexer.fit(data_sub)
# Indexer creates a new column with numeric index values
data_indexed = indexer_model.transform(data_sub)
# Repeat the process for the other categorical feature
indexer1 = StringIndexer(inputCol='reviewerID', outputCol='reviewerID_idx')
indexer1_model = indexer1.fit(data_indexed)
data_indexed = indexer1_model.transform(data_indexed)

In [28]:
data_indexed.show(5, truncate=True)

+----------+-------+--------------+--------+--------------+
|      asin|overall|    reviewerID|asin_idx|reviewerID_idx|
+----------+-------+--------------+--------+--------------+
|0439893577|    5.0|A1VXOAVRGKGEAK|  2524.0|       14349.0|
|0439893577|    4.0| A8R62G708TSCM|  2524.0|       18115.0|
|0439893577|    5.0|A21KH420DK0ICA|  2524.0|        4454.0|
|0439893577|    5.0| AR29QK6HPFYZ4|  2524.0|       18990.0|
|0439893577|    4.0| ACCH8EOML6FN5|  2524.0|        2769.0|
+----------+-------+--------------+--------+--------------+
only showing top 5 rows



In [26]:
data_indexed.select([count(when(col(c).isNull(), c)).alias(c) for c in data_indexed.columns]).toPandas().T

Unnamed: 0,0
asin,0
overall,0
reviewerID,0
asin_idx,0
reviewerID_idx,0


In [31]:
# Smaller dataset so we will use 0.8 / 0.2
(training, test) = data_indexed.randomSplit([0.8, 0.2])

In [32]:
# Creating ALS model and fitting data
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [33]:
als = ALS(maxIter=5, 
          regParam=0.09, 
          rank = 25,
          userCol="reviewerID_idx", 
          itemCol="asin_idx", 
          ratingCol="overall", 
          coldStartStrategy="drop",
          nonnegative=True)
model = als.fit(training)

In [34]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)

In [35]:
predictions.select(["asin_idx", "reviewerID_idx", 
                    "overall", "prediction"]).show(5)

+--------+--------------+-------+----------+
|asin_idx|reviewerID_idx|overall|prediction|
+--------+--------------+-------+----------+
|  8043.0|       13285.0|    5.0|  3.567395|
|  4738.0|        1088.0|    5.0| 3.8006392|
|  2530.0|        2659.0|    4.0| 3.8889747|
|  1906.0|         148.0|    5.0| 5.2485595|
|  1703.0|        1238.0|    5.0|  4.703676|
+--------+--------------+-------+----------+
only showing top 5 rows



In [36]:
evaluator = RegressionEvaluator(metricName="rmse",  
                                labelCol="overall",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.1185318181554769


In [None]:
# On average, this model is ~ 1.2 from perfect recommendations.

### Providing Recommendations: for all users

In [37]:
# get 20 recommendations which have highest rating.
user_recs = model.recommendForAllUsers(20)

In [38]:
for user in user_recs.head(5):
    print(user)
    print("\n")

Row(reviewerID_idx=28, recommendations=[Row(asin_idx=11132, rating=5.692352771759033), Row(asin_idx=10791, rating=5.682418346405029), Row(asin_idx=4625, rating=5.561408996582031), Row(asin_idx=9330, rating=5.5600972175598145), Row(asin_idx=8905, rating=5.490021228790283), Row(asin_idx=9518, rating=5.42319917678833), Row(asin_idx=7858, rating=5.411526679992676), Row(asin_idx=7599, rating=5.406836986541748), Row(asin_idx=5296, rating=5.403281211853027), Row(asin_idx=10928, rating=5.4004011154174805), Row(asin_idx=10731, rating=5.395941734313965), Row(asin_idx=10644, rating=5.393740177154541), Row(asin_idx=7867, rating=5.390657424926758), Row(asin_idx=6836, rating=5.357697010040283), Row(asin_idx=11525, rating=5.3529052734375), Row(asin_idx=8158, rating=5.349387168884277), Row(asin_idx=11601, rating=5.347063064575195), Row(asin_idx=11870, rating=5.315193176269531), Row(asin_idx=5074, rating=5.301178932189941), Row(asin_idx=6696, rating=5.295960903167725)])


Row(reviewerID_idx=31, recomme

### Converting back to string form


In [39]:
import pandas as pd
recs=model.recommendForAllUsers(10).toPandas()
nrecs=recs.recommendations.apply(pd.Series) \
 .merge(recs, right_index = True, left_index = True) \
 .drop(["recommendations"], axis = 1) \
 .melt(id_vars = ['reviewerID_idx'], value_name = "recommendation") \
 .drop("variable", axis = 1) \
 .dropna() 
nrecs=nrecs.sort_values('reviewerID_idx')
nrecs=pd.concat([nrecs['recommendation'].apply(pd.Series), 
 nrecs['reviewerID_idx']], axis = 1)
nrecs.columns = [ 
 'ProductID_index',
 'Rating',
 'UserID_index' 
 ]

In [41]:
md=data_indexed.select(['reviewerID', 'reviewerID_idx', 'asin', 'asin_idx'])
md=md.toPandas()
dict1=dict(zip(md['reviewerID_idx'],md['reviewerID']))
dict2=dict(zip(md['asin_idx'],md['asin']))
nrecs['reviewerID']=nrecs['UserID_index'].map(dict1)
nrecs['asin']=nrecs['ProductID_index'].map(dict2)
nrecs=nrecs.sort_values('reviewerID')
nrecs.reset_index(drop=True, inplace=True)
new=nrecs[['reviewerID','asin','Rating']]
new['recommendations'] = list(zip(new.asin, new.Rating))
res=new[['reviewerID','recommendations']] 
res_new=res['recommendations'].groupby([res.reviewerID]).apply(list).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['recommendations'] = list(zip(new.asin, new.Rating))


In [42]:
res_new

Unnamed: 0,reviewerID,recommendations
0,A012468118FTQAINEI0OQ,"[(B00946ZOT8, 6.211315631866455), (B0007XIZ0C,..."
1,A0182108CPDLPRCXQUZQ,"[(B007WYU7R8, 4.920167922973633), (B0039KIW9E,..."
2,A026961431MGW0616BRS3,"[(B0042TBGC6, 4.5203471183776855), (B0037UT1RI..."
3,A034597326Z83X79S50FI,"[(B00G69PTKK, 6.04032564163208), (B002LE8YYW, ..."
4,A04295422T2ZG087R17FX,"[(B0001Y6IIS, 4.8696393966674805), (B00E1B7LV0..."
...,...,...
19404,AZZLI36GZV6ZD,"[(B00ECDRBHQ, 5.5645599365234375), (B0001Y6IIS..."
19405,AZZOZQ8AAHT64,"[(B0001Y6IIS, 5.636949062347412), (B0001VUYUE,..."
19406,AZZT1ERHBSNQ8,"[(B00EVEMMRU, 6.476414203643799), (B009Y943F6,..."
19407,AZZTH6DJ0KSIP,"[(B004I0FVHA, 5.683483600616455), (B003AUD9BE,..."
