In [1]:
data = spark.read.csv("gs://rs-movielens-summer2020-1/ratings.csv", header=True, inferSchema=True)

In [2]:
data.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [3]:
# Convert your dataframe into an RDD
ratings = data.rdd.map(list)
ratings.take(2)

[[1, 1, 4.0, 964982703], [1, 3, 4.0, 964981247]]

In [4]:
# Import the Rating object
from pyspark.mllib.recommendation import Rating
# Convert the data into Rating objects
ratings_final = ratings.map(lambda line: Rating(int(line[0]), int(line[1]), float(line[2])));
# This is what a Rating object looks like
ratings_final.take(2)

[Rating(user=1, product=1, rating=4.0), Rating(user=1, product=3, rating=4.0)]

In [5]:
# Split the data into training and test, in 80-20% ratio
training_data, test_data = ratings_final.randomSplit([0.8,0.2]);

In [11]:
# Import the ALS method
from pyspark.mllib.recommendation import ALS
# Build the model based on the training data, with tank = 10 and iterations = 10
model = ALS.train(training_data, rank=10, iterations=10)

In [12]:
# Drop the ratings column
testdata_no_rating = test_data.map(lambda p: (p[0],p[1]))
testdata_no_rating.take(2)

[(1, 70), (1, 157)]

In [13]:
# Predict the model  
predictions = model.predictAll(testdata_no_rating)

In [16]:
# Print the first rows of the RDD
predictions.take(2)

[Rating(user=140, product=1084, rating=3.7536287191506865),
 Rating(user=590, product=1084, rating=3.864350895141766)]

In [21]:
# Prepare ratings data
rates = ratings_final.map(lambda r: ((r[0],r[1]),r[2]));
rates.take(2)

[((1, 1), 4.0), ((1, 3), 4.0)]

In [22]:
# Prepare predictions data
preds = predictions.map(lambda r: ((r[0],r[1]),r[2]))
preds.take(2)

[((140, 1084), 3.7536287191506865), ((590, 1084), 3.864350895141766)]

In [23]:
# Join the ratings data with predictions data
rates_and_preds = rates.join(preds)
rates_and_preds.take(2)

[((1, 333), (5.0, 3.5464377617999965)), ((1, 527), (5.0, 5.115065993718315))]

In [24]:
# Calculate and print MSE
MSE = rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error of the model for the test data = {:.2f}".format(MSE))

Mean Squared Error of the model for the test data = 1.24
