In [2]:
data = spark.read.csv("gs://rs-movielens-1/ratings.csv", header=True, inferSchema=True)

In [3]:
data.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [5]:
print(data.rdd.take(2)) # viewing 2 rows of the raw rdd

# Convert your dataframe into an RDD, and then into a list
ratings = data.rdd.map(list)
ratings.take(2)

[Row(userId=1, movieId=1, rating=4.0, timestamp=964982703), Row(userId=1, movieId=3, rating=4.0, timestamp=964981247)]


[[1, 1, 4.0, 964982703], [1, 3, 4.0, 964981247]]

In [6]:
# Import the Rating object
from pyspark.mllib.recommendation import Rating
# Convert the data into Rating objects
ratings_data = ratings.map(lambda line: Rating(int(line[0]), int(line[1]), float(line[2])));
# This is what a Rating object looks like
ratings_data.take(2)

[Rating(user=1, product=1, rating=4.0), Rating(user=1, product=3, rating=4.0)]

In [7]:
# Split the data into training and test, in 80-20% ratio
training_data, test_data = ratings_data.randomSplit([0.8,0.2]);

In [8]:
# Import the ALS method
from pyspark.mllib.recommendation import ALS
# Build the model based on the training data, with tank = 10 and iterations = 10
model = ALS.train(training_data, rank=10, iterations=10)

In [9]:
# Drop the ratings column
testdata_nr = test_data.map(lambda p: (p[0],p[1]))
testdata_nr.take(2)

[(1, 3), (1, 163)]

In [10]:
# Predict the model  
predictions = model.predictAll(testdata_nr)

In [11]:
# Print the first rows of the RDD
predictions.take(2)

[Rating(user=32, product=1084, rating=4.121777284904251),
 Rating(user=4, product=1084, rating=4.00846783159416)]

In [12]:
# Prepare ratings data
ratings_kv = ratings_data.map(lambda r: ((r[0],r[1]),r[2]));
ratings_kv.take(2)

[((1, 1), 4.0), ((1, 3), 4.0)]

In [13]:
# Prepare predictions data
predictions_kv = predictions.map(lambda r: ((r[0],r[1]),r[2]))
predictions_kv.take(2)

[((32, 1084), 4.121777284904251), ((4, 1084), 4.00846783159416)]

In [14]:
# Join the ratings data with predictions data
ratings_predictions = ratings_kv.join(predictions_kv)
ratings_predictions.take(2)

[((1, 3), (4.0, 4.162075958965319)), ((1, 316), (3.0, 4.537450779249192))]

In [15]:
# Calculate and print MSE
MSE = ratings_predictions.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error of the model for the test data = {:.2f}".format(MSE))

Mean Squared Error of the model for the test data = 1.27
