<a href="https://colab.research.google.com/github/ramilchai/capstone/blob/main/ALS_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import json
import pickle


In [2]:
!pip install pyspark
!pip install openjdk-8-jdk-headless -qq
!pip install mlflow

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 67 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 59.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=3e449059925fcb10bd15ed41f0b040a0aa86f23a61ca63b71645dde656f7fc1d
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2
[31mERROR: Could not find a version that satisfies the requirement openjdk-8-jdk-headless (from versions: none)[0m
[31mERROR: No matching distribution found for openjdk-8-jdk-headless[0m
Collecting mlflow
  Downloading m

In [3]:
df = pd.read_pickle('/content/interact')

In [4]:
df

Unnamed: 0,user_id_num,book_id,rating
29039,0,29056083,4
29038,0,33266253,5
29037,0,667488,4
29041,0,820343,5
29040,0,23734628,5
...,...,...,...
47818,880,603260,3
47841,880,33,4
47840,880,43615,5
47828,880,6356190,4


In [6]:
import pyspark
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import feature
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
# import org.apache.spark.sql.functions.col
# import org.apache.spark.sql.types.IntegerType
# import pyspark.sql.functions.col
from pyspark.sql.types import IntegerType

## Importing the Data

In [7]:
spark = SparkSession\
        .builder\
        .appName('bookrec').config('spark.driver.host', 'localhost')\
        .getOrCreate()

In [8]:
df_sp = spark.createDataFrame(df)

In [9]:
df_sp.dtypes

[('user_id_num', 'bigint'), ('book_id', 'bigint'), ('rating', 'bigint')]

## First Simple Model

In [10]:
(training, test) = df_sp.randomSplit([0.8, 0.2])

In [15]:
als = ALS(maxIter=5,rank=4, regParam=0.01, userCol='user_id_num', itemCol='book_id', ratingCol='rating',
          coldStartStrategy='drop')

In [16]:
fsm_model = als.fit(training)

In [17]:
predictions = fsm_model.transform(test)
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print('Root-mean-square error = ' + str(rmse))

Root-mean-square error = 1.623987864634006


## Tuning the Model with Cross-validation

In [18]:
als_model = ALS(userCol='user_id_num', itemCol='book_id', 
                ratingCol='rating', coldStartStrategy='drop')

In [19]:
params = ParamGridBuilder()\
          .addGrid(als_model.regParam, [0.01, 0.001, 0.1])\
          .addGrid(als_model.rank, [4, 10, 50]).build()

In [20]:
cv = CrossValidator(estimator=als_model, 
                    estimatorParamMaps=params,
                    evaluator=evaluator,
                    parallelism=4)

best_als_model = cv.fit(df_sp)    

In [21]:
best_als_model.bestModel.rank

50

In [26]:
best_als_model.bestModel.__dict__

{'_defaultParamMap': {Param(parent='ALS_60d28e8bdf3b', name='blockSize', doc='block size for stacking input data in matrices. Data is stacked within partitions. If block size is more than remaining data in a partition then it is adjusted to the size of this data.'): 4096,
  Param(parent='ALS_60d28e8bdf3b', name='coldStartStrategy', doc="strategy for dealing with unknown or new users/items at prediction time. This may be useful in cross-validation or production scenarios, for handling user/item ids the model has not seen in the training data. Supported values: 'nan', 'drop'."): 'nan',
  Param(parent='ALS_60d28e8bdf3b', name='itemCol', doc='column name for item ids. Ids must be within the integer value range.'): 'item',
  Param(parent='ALS_60d28e8bdf3b', name='predictionCol', doc='prediction column name.'): 'prediction',
  Param(parent='ALS_60d28e8bdf3b', name='userCol', doc='column name for user ids. Ids must be within the integer value range.'): 'user'},
 '_java_obj': JavaObject id=o57

## Best ALS Model

In [37]:
best_als = ALS(maxIter=5,rank=50, regParam=0.1, userCol='user_id_num', itemCol='book_id', 
                ratingCol='rating', coldStartStrategy='drop')

In [38]:
best_als_model = best_als.fit(training)

In [39]:
best_als_predictions = best_als_model.transform(test)
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                predictionCol='prediction')
rmse = evaluator.evaluate(best_als_predictions)
print('Root-mean-square error = ' + str(rmse))

Root-mean-square error = 1.092242472532004
