In [2]:
# Import the libraries
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# Create a spark session
spark = SparkSession.builder.appName("reco_sys").getOrCreate()

In [4]:
# Read the movie lens dataset
data = spark.read.csv('movielens_ratings.csv',inferSchema=True,header=True)

In [5]:
data.show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
|     23|   1.0|     0|
|     26|   3.0|     0|
|     27|   1.0|     0|
|     28|   1.0|     0|
|     29|   1.0|     0|
|     30|   1.0|     0|
|     31|   1.0|     0|
|     34|   1.0|     0|
|     37|   1.0|     0|
|     41|   2.0|     0|
+-------+------+------+
only showing top 20 rows



In [6]:
data.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)



In [7]:
# Create train and test data
train_data, test_data = data.randomSplit([0.8, 0.2])

In [8]:
als = ALS(maxIter=5, userCol='userId', ratingCol='rating', itemCol='movieId')

In [10]:
# Create a model
model = als.fit(train_data)

In [11]:
# Transform the test data
predictions = model.transform(test_data)
predictions.show()

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|     31|   1.0|    27| 1.6432921|
|     31|   1.0|     4|  1.681254|
|     31|   3.0|     8| 2.3371558|
|     31|   1.0|    24| 1.7270483|
|     31|   1.0|    18| 1.9813324|
|     85|   3.0|     1| 1.5863472|
|     85|   5.0|     8| 2.2623506|
|     85|   1.0|    25|  2.270863|
|     85|   1.0|    29| 1.5524962|
|     85|   3.0|    21|  2.060089|
|     85|   1.0|     2| 2.0120003|
|     65|   2.0|     5|  2.544626|
|     65|   2.0|    15| 1.3021828|
|     53|   1.0|    12| 0.1732964|
|     53|   5.0|     8|  2.751516|
|     53|   1.0|    23|0.26036254|
|     78|   1.0|    19| 0.7334022|
|     78|   1.0|     4| 0.9408511|
|     34|   1.0|    14| 1.1587507|
|     81|   5.0|    28| 1.6210208|
+-------+------+------+----------+
only showing top 20 rows



In [16]:
# Evaluate the model
reg = RegressionEvaluator(predictionCol='prediction', metricName='rmse', labelCol="rating" )

In [17]:
rmse = reg.evaluate(predictions)
print("RMSE: ", rmse)

RMSE:  1.1065689568637513


In [18]:
# Take a single user
single_user = test_data.filter(test_data['userId']==11).select(["movieId", "userId"])

In [19]:
user_pred = model.transform(single_user)

In [22]:
user_pred.orderBy("prediction", ascending=False).show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|     90|    11|  3.683182|
|     94|    11| 2.9359972|
|     64|    11| 2.8253148|
|     19|    11|  2.676443|
|     48|    11| 2.6381097|
|     75|    11| 2.6334357|
|     50|    11| 2.3115864|
|     76|    11|  2.201033|
|     22|    11|  2.199746|
|     36|    11|   1.89679|
|     43|    11| 1.8242611|
|     70|    11| 1.2850212|
|     45|    11| 1.0862978|
|     86|    11| 0.9888788|
+-------+------+----------+

