# Using ALS 

In [17]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('recommendation').getOrCreate()

In [18]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [19]:
from pyspark.sql.types import *


In [30]:
data = spark.read.option("header", "true").csv('./Ratings-New.csv')



In [31]:
data.show()

+-------+-----------+-----------+
|User-ID|unique_isbn|Book-Rating|
+-------+-----------+-----------+
| 276726| 0155061224|          5|
| 276729| 052165615X|          3|
| 276729| 0521795028|          6|
| 276744| 038550120X|          7|
| 276747| 0060517794|          9|
| 276747| 0671537458|          9|
| 276747| 0679776818|          8|
| 276747| 0943066433|          7|
| 276747| 1885408226|          7|
| 276748| 0747558167|          6|
| 276751| 3596218098|          8|
| 276754| 0684867621|          8|
| 276755| 0451166892|          5|
| 276762| 0380711524|          5|
| 276762| 3453092007|          8|
| 276772| 0553572369|          7|
| 276772| 3499230933|         10|
| 276772| 3596151465|         10|
| 276774| 3442136644|          9|
| 276786| 8437606322|          8|
+-------+-----------+-----------+
only showing top 20 rows



In [34]:
ratings_df = data.withColumn("User-ID", data['User-ID'].cast(IntegerType())).withColumn("unique_isbn", data['unique_isbn'].cast(IntegerType())).withColumn("Book-Rating",data['Book-Rating'].cast(IntegerType())).na.drop()
ratings_df.show()

+-------+-----------+-----------+
|User-ID|unique_isbn|Book-Rating|
+-------+-----------+-----------+
| 276726|  155061224|          5|
| 276729|  521795028|          6|
| 276747|   60517794|          9|
| 276747|  671537458|          9|
| 276747|  679776818|          8|
| 276747|  943066433|          7|
| 276747| 1885408226|          7|
| 276748|  747558167|          6|
| 276754|  684867621|          8|
| 276755|  451166892|          5|
| 276762|  380711524|          5|
| 276772|  553572369|          7|
| 276788|  345443683|          8|
| 276796|  330332775|          5|
| 276798|    6379702|          5|
| 276800| 1562827898|          7|
| 276804|  440498058|          8|
| 276808|  395547032|         10|
| 276811|  440414121|         10|
| 276814|   91830893|          8|
+-------+-----------+-----------+
only showing top 20 rows



In [23]:
smaller = ratings_df.limit(100000)


In [24]:
smaller.count()

100000

In [35]:
(train, test, validation) = ratings_df.randomSplit([0.6, 0.2, 0.2], seed = 19133025)


In [36]:
## đếm các giá trị null
from pyspark.sql.functions import col,sum
data.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in data.columns)).show()

+-------+-----------+-----------+
|User-ID|unique_isbn|Book-Rating|
+-------+-----------+-----------+
|      0|          0|          0|
+-------+-----------+-----------+



In [47]:
# Create ALS model
als = ALS(
         userCol='User-ID', 
        itemCol='unique_isbn', 
        ratingCol='Book-Rating', 
         nonnegative = True, 
         implicitPrefs = False,
         coldStartStrategy="drop"
)

In [48]:
# Import the requisite packages
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

In [49]:
parameter = ParamGridBuilder() \
                    .addGrid(als.rank, [10]) \
                    .addGrid(als.maxIter, [1, 5, 10]) \
                    .addGrid(als.regParam, [0.01,0.001, 0.1]) \
                    .addGrid(als.alpha, [100.0,300.0, 1000.0, 2000.0]) \
                    .build()

In [50]:
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(
           metricName="rmse", 
           labelCol="Book-Rating", 
           predictionCol="prediction") 
print ("Num models to be tested: ", len(parameter))

Num models to be tested:  36


In [51]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=parameter, evaluator=evaluator, numFolds=3)

In [52]:
model = cv.fit(train)

In [53]:
best_model = model.bestModel

In [54]:
test_predictions = best_model.transform(validation)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

2.4991644099651618


In [57]:
print("**Best Model**")
# Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())
# Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())
# Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())
# Print "Alpha"
print("  Alpha:", best_model._java_obj.parent().getAlpha())

**Best Model**
  Rank: 10
  MaxIter: 10
  RegParam: 0.1
  Alpha: 100.0


In [61]:
als = ALS(maxIter=10, regParam=0.1, rank=10, alpha = 100.0,
         userCol='User-ID', 
        itemCol='unique_isbn', 
        ratingCol='Book-Rating', 
         nonnegative = True, 
         implicitPrefs = False,
         coldStartStrategy="drop"
)

In [62]:
model = als.fit(train)


In [63]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="Book-Rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 2.464188595062904
