In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (an existing session is reused
# by getOrCreate; otherwise a new one named 'rec-system' is started).
spark = (
    SparkSession.builder
    .appName('rec-system')
    .getOrCreate()
)

In [2]:
from pyspark.ml.recommendation import ALS

In [3]:
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
# Load the ratings CSV: the first row supplies the column names and the column
# types are inferred from the file contents.
data = spark.read.csv(
    '/FileStore/tables/movielens_ratings.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first rows of the loaded ratings (show()'s defaults spelled out).
data.show(n=20, truncate=True)

In [6]:
# Number of ratings per user, listed in ascending userId order (first 100 rows).
(
    data
    .select('userId')
    .groupBy('userId')
    .count()
    .orderBy('userId')
    .show(100)
)

In [7]:
# Number of ratings per user, in whatever order the aggregation returns them
# (first 100 rows).
(
    data
    .select('userId')
    .groupBy('userId')
    .count()
    .show(100)
)

In [8]:
# Summary statistics for the columns of the ratings table.
(
    data
    .describe()
    .show(n=20, truncate=True)
)

In [9]:
# Hold out roughly 20% of the ratings for evaluation. The weights are
# approximate proportions, not exact row counts. A fixed seed keeps the split
# stable across re-runs so that downstream metrics are comparable between runs.
training, test = data.randomSplit([0.8, 0.2], seed=42)

In [10]:
# Alternating Least Squares estimator over the explicit ratings.
# - coldStartStrategy='drop': a user or movie that appears only in the held-out
#   split has no learned factors, so with the default ('nan') its prediction is
#   NaN and any RMSE computed over those predictions is NaN too. Dropping such
#   rows keeps the evaluation metric well-defined.
# - seed: fixes the random initialisation of the factor matrices so repeated
#   runs produce the same model.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    seed=42,
)

In [11]:
# Fit the ALS estimator on the training split.
model = als.fit(dataset=training)

In [12]:
# Score the held-out split; the result carries an added 'prediction' column.
predictions = model.transform(dataset=test)

In [13]:
# Preview the scored rows (show()'s defaults spelled out).
predictions.show(n=20, truncate=True)

In [14]:
# Root-mean-squared-error evaluator comparing the 'prediction' column against
# the true 'rating' column.
evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)

In [15]:
# Compute the RMSE of the model's predictions on the held-out split.
rmse = evaluator.evaluate(dataset=predictions)

In [16]:
# Report the metric: a label line followed by the value on its own line.
print('RMSE', rmse, sep='\n')

In [17]:
# Take the held-out rows belonging to user 11 and keep only the (movieId, userId)
# pairs, i.e. the inputs the model needs in order to score them.
single_user = (
    test
    .where(test.userId == 11)
    .select('movieId', 'userId')
)
single_user.show(n=20, truncate=True)

In [18]:
# Score the selected user's (movieId, userId) pairs with the fitted model.
recommendations = model.transform(dataset=single_user)

In [19]:
# List the scored movies from highest to lowest predicted rating.
(
    recommendations
    .sort(recommendations['prediction'].desc())
    .show(n=20, truncate=True)
)