In [1]:
from pyspark.mllib.recommendation import ALS, Rating
from pyspark import SparkContext
import numpy as np

In [2]:
# Obtain the SparkContext used by every cell below. getOrCreate() hands back
# an already-running context when there is one, so re-executing this cell
# does not try to start a second context in the same kernel.
sc = SparkContext.getOrCreate()



In [3]:
# Load the ratings file. The first three whitespace-separated fields of each
# line are parsed as (user id, product id, rating); any further fields are
# ignored.
data = sc.textFile("u.data")
ratings = data.map(lambda line: line.split()).map(lambda x: Rating(int(x[0]), int(x[1]), float(x[2])))

# 80/20 train/test split, drawn with a fixed seed (42).
train_data, test_data = ratings.randomSplit([0.8, 0.2], 42)

# ALS starts from a randomly initialised factorisation; when no seed is given
# the system time is used, so every run would yield different recommendations
# and a different error figure. Pin the seed to keep the notebook reproducible.
model = ALS.train(train_data, rank=10, iterations=10, seed=42)

# Ask the trained model for the 5 highest-scoring products for one user.
user_id = 225
recommendations = model.recommendProducts(user_id, 5)


In [4]:
# Build an {item id: title} dictionary on the driver from the pipe-delimited
# item file: field 0 is parsed as the integer id, field 1 is used as the title.
item_data = sc.textFile("u.item")
movie_titles = (
    item_data
    .map(lambda line: line.split('|'))
    .map(lambda fields: (int(fields[0]), fields[1]))
    .collectAsMap()
)

# Print the recommendations as a 1-based ranked list; an id with no entry in
# the lookup is shown as "Movie <id>".
print("Top 5 recommendations for User", user_id)
for position, rec in enumerate(recommendations, start=1):
    movie_title = movie_titles.get(rec.product, f"Movie {rec.product}")
    print(f"{position}. {movie_title} (score: {rec.rating:.2f})")

Top 5 recommendations for User 225
1. Warriors of Virtue (1997) (score: 10.31)
2. Mina Tannenbaum (1994) (score: 9.69)
3. Alphaville (1965) (score: 8.94)
4. Barbarella (1968) (score: 8.58)
5. In the Realm of the Senses (Ai no corrida) (1976) (score: 8.54)


In [5]:
def _key_by_user_product(r):
    """Re-shape a rating record into ((user, product), rating) for joining."""
    return ((r.user, r.product), r.rating)


def _squared_error(joined):
    """Squared difference of a joined (key, (actual, predicted)) record."""
    actual, predicted = joined[1]
    return (actual - predicted) ** 2


# Score the (user, product) pairs of the held-out split with the trained model.
test_input = test_data.map(lambda p: (p.user, p.product))
predictions = model.predictAll(test_input).map(_key_by_user_product)

# Pair each held-out rating with its prediction on the (user, product) key,
# then average the squared differences over the joined records.
rates_and_preds = test_data.map(_key_by_user_product).join(predictions)
mse = rates_and_preds.map(_squared_error).mean()
print(f"Model MSE: {mse:.4f}")

# Shut the SparkContext down; cells that use `sc` cannot be re-run after this
# without first re-executing the cell that creates it.
sc.stop()

Model MSE: 1.1706
