In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the SparkSession for this notebook (app name 'rec'); getOrCreate()
# hands back an already-running session if one exists instead of starting another.
spark = (
    SparkSession.builder
    .appName('rec')
    .getOrCreate()
)

In [3]:
from pyspark.ml.recommendation import ALS

In [4]:
import os 

In [5]:
# Ratings CSV location, resolved relative to the current working directory:
# <cwd>/data/movielens_ratings.csv
movie_data_file = os.path.join(os.curdir, 'data', 'movielens_ratings.csv')

In [6]:
# Load the ratings CSV: the first row supplies the column names and the
# column types are inferred from the data.
movie_data = spark.read.csv(
    movie_data_file,
    header=True,
    inferSchema=True,
)

In [7]:
# Print the column names and inferred types of the loaded ratings DataFrame.
movie_data.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)



In [9]:
# Peek at the first row as a plain dict (column name -> value).
movie_data.first().asDict()

{'movieId': 2, 'rating': 3.0, 'userId': 0}

In [10]:
# Display summary statistics (count, mean, stddev, min, max) for every column.
movie_data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [11]:
# Split the ratings roughly 80% / 20% into training and test sets.
# A fixed seed makes the split repeatable across "Restart & Run All"; without
# it, every run trains and evaluates on different rows, so the metrics and the
# per-user tables shown below would change from run to run.
train_data, test_data = movie_data.randomSplit([0.8, 0.2], seed=42)

In [14]:
# Configure the ALS (alternating least squares) collaborative-filtering estimator.
#   maxIter / regParam : number of ALS iterations and regularisation strength.
#   userCol / itemCol / ratingCol : columns of the ratings DataFrame to learn from.
#   coldStartStrategy='drop' : a random split can leave a user or movie in the
#       test set that never appears in the training set; by default ALS emits
#       NaN predictions for those rows, which makes the RMSE evaluate to NaN.
#       'drop' removes such rows from the transform() output instead.
#       (Parameter available in Spark 2.2 and later.)
#   seed : fixes the random initialisation of the factor matrices so repeated
#       runs on the same training data produce the same model.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    seed=42,
)

In [15]:
# Fit the ALS estimator on the training split only; the test split stays held out.
model = als.fit(train_data)

In [16]:
# Score the held-out test rows; the result keeps the input columns and adds
# a 'prediction' column with the model's estimated rating.
predictions = model.transform(test_data)

In [17]:
# Display the first 20 scored rows to compare 'rating' with 'prediction'.
predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    19|  0.8806455|
|     31|   3.0|     7|  1.0070136|
|     31|   1.0|     0|  1.9284484|
|     85|   1.0|    28|-0.10224253|
|     85|   1.0|     4|  2.8309598|
|     85|   5.0|     8|  2.2473035|
|     85|   4.0|     7|  3.4535804|
|     65|   2.0|     5|  0.6435286|
|     65|   2.0|    15|-0.38971466|
|     65|   5.0|    23|-0.04279597|
|     53|   3.0|    13| -0.2677133|
|     53|   5.0|     8|   2.040499|
|     78|   1.0|    17|  1.3066877|
|     78|   1.0|    24|   1.266131|
|     78|   1.0|     2| 0.45244995|
|     34|   1.0|    15| -1.2687591|
|     34|   4.0|     2|  1.4674035|
|     81|   3.0|    26|  3.1733673|
|     81|   1.0|     1|  1.2860491|
|     81|   1.0|    21|   2.348854|
+-------+------+------+-----------+
only showing top 20 rows



In [18]:
from pyspark.ml.evaluation import RegressionEvaluator

In [19]:
# Evaluator that scores the 'prediction' column against the true 'rating'
# column using root-mean-squared error.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
    metricName='rmse',
)

In [20]:
# Compute the RMSE of the model's predictions on the test split (lower is better).
# NOTE(review): presumably this becomes NaN if any prediction is NaN — confirm
# against the ALS cold-start documentation.
evaluator.evaluate(predictions)

1.7826264168823995

In [33]:
# Held-out ratings for one example user (userId 11), sorted by movie so the
# rows can be compared against that user's predictions.
single_user = (
    test_data
    .filter(test_data['userId'] == 11)
    .select(['userId', 'movieId', 'rating'])
    .orderBy('movieId')
)

In [29]:
# Display the example user's actual ratings from the test split.
single_user.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    11|      6|   2.0|
|    11|     35|   3.0|
|    11|     37|   2.0|
|    11|     38|   4.0|
|    11|     47|   1.0|
|    11|     48|   5.0|
|    11|     51|   3.0|
|    11|     61|   1.0|
|    11|     67|   1.0|
|    11|     88|   1.0|
+------+-------+------+



In [36]:
# Show the model's predicted ratings for the same example user (userId 11),
# sorted by movie to match the ordering of the actual-ratings table.
(
    predictions
    .filter(predictions['userId'] == 11)
    .select(['userId', 'movieId', 'prediction'])
    .orderBy('movieId')
    .show()
)

+------+-------+-----------+
|userId|movieId| prediction|
+------+-------+-----------+
|    11|      6|  1.0418524|
|    11|     35|-0.14619511|
|    11|     37| -1.8071241|
|    11|     38|  5.2492113|
|    11|     47|  -1.075948|
|    11|     48| -1.2278523|
|    11|     51|  5.8835845|
|    11|     61|   1.529793|
|    11|     67| -3.0346258|
|    11|     88|  3.2776601|
+------+-------+-----------+

