# Using ALS 

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('recommendation').getOrCreate()

In [2]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [3]:
from pyspark.sql.types import *


In [4]:
data = spark.read.option("delimiter", ";").option("header", "true").csv('./BX-Book-Ratings.csv')



In [5]:
ratings_df = data.withColumn("User-ID", data['User-ID'].cast(IntegerType())).withColumn("ISBN", data['ISBN'].cast(IntegerType())).withColumn("Book-Rating",data['Book-Rating'].cast(IntegerType())).na.drop()
ratings_df.show()

+-------+----------+-----------+
|User-ID|      ISBN|Book-Rating|
+-------+----------+-----------+
| 276726| 155061224|          5|
| 276727| 446520802|          0|
| 276729| 521795028|          6|
| 276733|2080674722|          0|
| 276737| 600570967|          6|
| 276745| 342310538|         10|
| 276746| 425115801|          0|
| 276746| 449006522|          0|
| 276746| 553561618|          0|
| 276746| 786013990|          0|
| 276746| 786014512|          0|
| 276747|  60517794|          9|
| 276747| 451192001|          0|
| 276747| 609801279|          0|
| 276747| 671537458|          9|
| 276747| 679776818|          8|
| 276747| 943066433|          7|
| 276747|1570231028|          0|
| 276747|1885408226|          7|
| 276748| 747558167|          6|
+-------+----------+-----------+
only showing top 20 rows



In [6]:
smaller = ratings_df.limit(100000)


In [7]:
smaller.count()

100000

In [8]:
(train, test) = smaller.randomSplit([0.8, 0.2], seed = 19133025)


In [9]:
## đếm các giá trị null
from pyspark.sql.functions import col,sum
data.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in data.columns)).show()

+-------+----+-----------+
|User-ID|ISBN|Book-Rating|
+-------+----+-----------+
|      0|   0|          0|
+-------+----+-----------+



In [10]:
# Create ALS model
als = ALS(
         userCol='User-ID', 
        itemCol='ISBN', 
        ratingCol='Book-Rating', 
         nonnegative = True, 
         implicitPrefs = False,
         coldStartStrategy="drop"
)

In [11]:
# Import the requisite packages
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

In [12]:
parameter = ParamGridBuilder() \
                    .addGrid(als.rank, [ 8, 12]) \
                    .addGrid(als.maxIter, [5,10]) \
                    .addGrid(als.regParam, [0.01,0.001]) \
                    .addGrid(als.alpha, [2.0,3.0]) \
                    .build()

In [13]:
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(
           metricName="rmse", 
           labelCol="Book-Rating", 
           predictionCol="prediction") 
print ("Num models to be tested: ", len(parameter))

Num models to be tested:  16


In [14]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=parameter, evaluator=evaluator, numFolds=3)

In [16]:
model = cv.fit(train)

In [17]:
best_model = model.bestModel

In [18]:
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

4.725624851872248


In [25]:
print("**Best Model**")
# Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())
# Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())
# Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())


**Best Model**
  Rank: 12
  MaxIter: 10
  RegParam: 0.01


In [20]:
als = ALS(maxIter=10, regParam=0.01, rank=12,
         userCol='User-ID', 
        itemCol='ISBN', 
        ratingCol='Book-Rating', 
         nonnegative = True, 
         implicitPrefs = True,
         coldStartStrategy="drop"
)

In [21]:
(train, test) = ratings_df.randomSplit([0.8, 0.2], seed = 19133025)


In [22]:
model = als.fit(train)


In [23]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="Book-Rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 4.6072299190911545
