In [73]:
import findspark 

findspark.init()

In [93]:
from pyspark import SparkContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

import math
import time

In [94]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("recommendation").getOrCreate()

In [95]:
data = spark.read.csv("ml-latest-small/ratings.csv", inferSchema=True, header= True)

In [96]:
data.head()

Row(userId=1, movieId=31, rating=2.5)

In [97]:
data.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [98]:
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|            userId|           movieId|            rating|
+-------+------------------+------------------+------------------+
|  count|            100004|            100004|            100004|
|   mean| 347.0113095476181|12548.664363425463| 3.543608255669773|
| stddev|195.16383797819535|26369.198968815268|1.0580641091070326|
|    min|                 1|                 1|               0.5|
|    max|               671|            163949|               5.0|
+-------+------------------+------------------+------------------+



In [99]:
(train, validation, test) = data.randomSplit([0.6, 0.2, 0.2], seed =1)

In [82]:
#ALS model on training data
als = ALS(maxIter= 5,
          regParam= 0.01,
          userCol="userId", 
          ratingCol="rating", 
          itemCol="movieId", 
          coldStartStrategy= "drop",
          implicitPrefs= False )# nonnegative=True)

model = als.fit(train_data)

In [100]:
# Grid searching for hyperparameter tuning

def GridSearch(train, valid, num_iteration, reg_param, n_factors):
    min_rmse = float('inf')
    best_n = -1
    best_reg =0
    best_model= None
    
    for n in n_factors:
        for reg in reg_param:
            als = ALS(regParam= reg, 
                      rank=n,
                      userCol="userId", 
                      ratingCol="rating", 
                      itemCol="movieId", 
                      coldStartStrategy= "drop",
                      implicitPrefs= False )
            model = als.fit(train)
            predictions= model.transform(valid)
            evaluator = RegressionEvaluator(metricName= "rmse", labelCol="rating", predictionCol="prediction") #, coldStartStrategy ="drop")
            rmse = evaluator.evaluate(predictions)
            print("{} latent factors and regularization = {}: validation RMSE is {}".format(n, reg, rmse))
            if rmse< min_rmse:
                min_rmse =rmse
                best_n=n
                best_reg= reg
                best_model= model
    
    predictions = best_model.transform(valid)
    evaluator = RegressionEvaluator(metricName= "rmse", labelCol="rating", predictionCol="prediction") #, coldStartStrategy ="drop")
    train_rmse = evaluator.evaluate(predictions)  
    print("\nThe best model has {} latent factors and regularization = {}:".format(best_n, best_reg))
    print("traning RMSE is {}; validation RMSE is {}".format(train_rmse, min_rmse))
    return best_model


    


In [89]:
num_iterations =10
ranks = [6,8,10,12]
reg_params = [0.05, 0.1, 0.2, 0.4, 0.8]

start_time= time.time()
final_model = GridSearch(train,validation, num_iterations, reg_params, ranks)
print("Total Runtime: {:.2f} seconds".format(time.time() - start_time ))

Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "F:\spark\python\pyspark\ml\wrapper.py", line 40, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'ALS' object has no attribute '_java_obj'


6 latent factors and regularization = 0.05: validation RMSE is 1.0156902904383778
6 latent factors and regularization = 0.1: validation RMSE is 0.9489052379032114
6 latent factors and regularization = 0.2: validation RMSE is 0.9238211690110109
6 latent factors and regularization = 0.4: validation RMSE is 0.9844931017261915
6 latent factors and regularization = 0.8: validation RMSE is 1.1971975376992765
8 latent factors and regularization = 0.05: validation RMSE is 1.0252742255333507
8 latent factors and regularization = 0.1: validation RMSE is 0.9468494634914641
8 latent factors and regularization = 0.2: validation RMSE is 0.9218965688157329
8 latent factors and regularization = 0.4: validation RMSE is 0.9832390784054195
8 latent factors and regularization = 0.8: validation RMSE is 1.1971982257844678
10 latent factors and regularization = 0.05: validation RMSE is 1.0396438319222423
10 latent factors and regularization = 0.1: validation RMSE is 0.9517662191927447
10 latent factors and r

In [101]:
num_iterations = 15
ranks = [7, 8, 9]
reg_params = [0.1, 0.2, 0.3]

final_model = GridSearch(train, validation, num_iterations, reg_params, ranks)

7 latent factors and regularization = 0.1: validation RMSE is 0.9467362219330183
7 latent factors and regularization = 0.2: validation RMSE is 0.9214610351653614
7 latent factors and regularization = 0.3: validation RMSE is 0.9443191949284415
8 latent factors and regularization = 0.1: validation RMSE is 0.9468494634914641
8 latent factors and regularization = 0.2: validation RMSE is 0.9218965688157329
8 latent factors and regularization = 0.3: validation RMSE is 0.9447256172021044
9 latent factors and regularization = 0.1: validation RMSE is 0.9499697339615238
9 latent factors and regularization = 0.2: validation RMSE is 0.9217837059475428
9 latent factors and regularization = 0.3: validation RMSE is 0.9453621227799466

The best model has 7 latent factors and regularization = 0.2:
traning RMSE is 0.9214610351653614; validation RMSE is 0.9214610351653614


In [92]:
#traning with finest model 
pred_test = final_model.transform(test)
evaluator = RegressionEvaluator(metricName= "rmse", labelCol="rating", predictionCol="prediction") #, coldStartStrategy ="drop")
test_rmse = evaluator.evaluate(predictions) 
print("The testing RMSE is "+ str(test_rmse))
print("The testing RMSE is "+ str(RMSE(pred_test)))

The testing RMSE is 1.1201958264099678


NameError: name 'RMSE' is not defined