In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print one path per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# Any results you write to the current directory are saved as output.

Installing PySpark

In [None]:
!pip install pyspark

In [None]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

Starting Spark Session with Context

In [None]:
# Build (or reuse) a local-mode SparkSession. Using getOrCreate() keeps this
# cell safe to re-run: constructing SparkContext('local') a second time in the
# same kernel fails because only one SparkContext may be active at once.
spark = SparkSession.builder.master('local').getOrCreate()

# Keep the underlying SparkContext available under the name `sc`.
sc = spark.sparkContext

To read the data, we first need to design a schema with appropriate data types that match the columns of the input CSV file. Our dataset has three fields, namely userID, songID, and rating.

In [None]:
# Each of the three CSV columns is declared as an integer field with
# nullable=False.
input_schema = StructType(
    [StructField(column, IntegerType(), False) for column in ('userID', 'songID', 'rating')]
)

# Read the songs CSV (its first row is a header) using the explicit schema,
# and mark the resulting DataFrame for caching since it is reused below.
songs_csv_path = '../input/dataset-for-collaborative-filters/songsDataset.csv'
data = (
    spark.read
    .schema(input_schema)
    .option('header', True)
    .csv(songs_csv_path)
    .cache()
)

Now we split the data into training and test sets: 78% for training and 22% for testing.

In [None]:
# Randomly split the ratings into training and test sets using weights
# 0.78 / 0.22. A fixed seed makes the split repeatable, so metrics computed
# from it can be reproduced on a fresh run.
(training, test) = data.randomSplit([0.78, 0.22], seed=42)

Now that we have the required training and test datasets, we build a collaborative-filtering model using the Alternating Least Squares (ALS) implementation offered by PySpark. To handle the cold-start problem, we use the 'drop' strategy. We train the model for 10 iterations.

In [None]:
# Configure the ALS recommender:
#   - maxIter=10 training iterations, regParam=0.01 regularization,
#   - userID / songID / rating columns as user, item and rating inputs,
#   - coldStartStrategy="drop" so the cold-start strategy is "drop",
#   - an explicit seed so repeated fits on the same data are repeatable.
als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol="userID",
    itemCol="songID",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

# Fit the model on the training split.
model = als.fit(training)

Now we make predictions.

In [None]:
# Score the held-out test rows with the fitted ALS model.
# NOTE(review): the estimator was configured with coldStartStrategy="drop",
# so rows that cannot be scored are presumably removed from the output — confirm.
predictions = model.transform(test)

We evaluate our model using RMSE, i.e., the root-mean-square error metric.

In [None]:
# Evaluator that compares the "rating" label column against the "prediction"
# column using the root-mean-square error metric.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

In [None]:
# Compute the configured metric over the test-set predictions and report it.
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

In the recorded run, the RMSE was 5.97; the exact value depends on the random train/test split. We could try increasing the number of iterations to improve this result.