# Recomender Exercise

In [52]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics

In [5]:
spark = SparkSession.builder.appName("Recomender").master("local[*]").getOrCreate()

# Prepare Data

In [20]:
schema = T.StructType([T.StructField("userId", T.IntegerType(), True), 
                       T.StructField("movieId", T.IntegerType(), True), 
                       T.StructField("rating", T.IntegerType(), True),
                       T.StructField("timestamp", T.IntegerType(), True)])

In [21]:
df_ratings = spark.read.csv("data/sample_movielens_ratings.txt", header=True, schema=schema)

In [22]:
df_ratings.show(10)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     0|      2|     3|1424380312|
|     0|      3|     1|1424380312|
|     0|      5|     2|1424380312|
|     0|      9|     4|1424380312|
|     0|     11|     1|1424380312|
|     0|     12|     2|1424380312|
|     0|     15|     1|1424380312|
|     0|     17|     1|1424380312|
|     0|     19|     1|1424380312|
|     0|     21|     1|1424380312|
+------+-------+------+----------+
only showing top 10 rows



In [23]:
train_df, test_df = df_ratings.randomSplit([0.8, 0.2])

# Train Model

In [24]:
als = ALS().setMaxIter(5)\
    .setRegParam(0.01)\
    .setUserCol("userId")\
    .setItemCol("movieId")\
    .setRatingCol("rating")

In [25]:
als_model = als.fit(train_df)

#  Make Predictions

In [26]:
predictions = als_model.transform(test_df)

In [32]:
predictions.filter(F.col("userId")==12).orderBy(F.desc("prediction")).show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|    12|      7|     3|1424380312| 4.1294384|
|    12|     60|     1|1424380312| 3.0038013|
|    12|     16|     4|1424380312| 2.2946267|
|    12|     31|     4|1424380312| 1.8449553|
|    12|     66|     3|1424380312| 1.4081788|
|    12|     14|     1|1424380312| 1.2755138|
|    12|     57|     1|1424380312| 1.2601724|
|    12|     86|     1|1424380312| 1.0170988|
|    12|     21|     1|1424380312| 0.8252791|
|    12|      6|     1|1424380312|0.36230522|
|    12|     98|     2|1424380312|-0.6299471|
|    12|     96|     1|1424380312|-2.1443143|
+------+-------+------+----------+----------+



In [33]:
als_model.recommendForAllUsers(10).selectExpr("userId", "explode(recommendations)").show()

+------+---------------+
|userId|            col|
+------+---------------+
|    28|[92, 5.2121725]|
|    28|[12, 4.7814813]|
|    28| [81, 4.684808]|
|    28|[28, 4.6136627]|
|    28| [49, 4.270647]|
|    28|[91, 4.0818152]|
|    28|[89, 3.9107444]|
|    28| [2, 3.8538032]|
|    28|[82, 3.8029647]|
|    28|[68, 3.8022418]|
|    26| [32, 8.092326]|
|    26| [25, 6.237586]|
|    26|  [94, 5.16172]|
|    26|[23, 5.0169964]|
|    26|[65, 4.9577723]|
|    26|[22, 4.8798947]|
|    26| [7, 4.8716254]|
|    26|[88, 4.7927804]|
|    26|[46, 4.4969087]|
|    26|   [10, 4.4586]|
+------+---------------+
only showing top 20 rows



In [34]:
als_model.recommendForAllItems(10).selectExpr("movieId", "explode(recommendations)").show()

+-------+---------------+
|movieId|            col|
+-------+---------------+
|     31| [14, 2.699049]|
|     31| [6, 2.5747516]|
|     31|  [8, 2.095369]|
|     31| [25, 1.980439]|
|     31|[21, 1.9674375]|
|     31|[12, 1.8449553]|
|     31|  [7, 1.829401]|
|     31|[22, 1.5002669]|
|     31|  [3, 1.440098]|
|     31| [9, 1.3575178]|
|     85|  [4, 4.994902]|
|     85| [8, 4.9413614]|
|     85| [16, 4.914472]|
|     85|    [10, 4.044]|
|     85|[22, 3.5222337]|
|     85| [7, 3.5140991]|
|     85|[14, 3.3588111]|
|     85|[21, 3.3136427]|
|     85| [6, 3.2266011]|
|     85|  [1, 2.870932]|
+-------+---------------+
only showing top 20 rows



# Evaluators

## Regression Metrics

In [48]:
reg_comparison = predictions.select("rating", "prediction")\
    .rdd.map(lambda x: (float(x["rating"]), float(x["prediction"])))
metrics = RegressionMetrics(reg_comparison)

In [51]:
metrics.meanAbsoluteError

1.2369797419345676

## Ranking Metrics

In [59]:
per_user_actual = predictions.filter(F.col("rating") > 2.5).groupBy("userID")\
    .agg(F.expr("collect_set(movieId) as movies"))

In [60]:
per_user_actual.show()

+------+--------------------+
|userID|              movies|
+------+--------------------+
|    28|                [19]|
|    27|                [27]|
|    26|             [6, 24]|
|    12|     [66, 16, 31, 7]|
|    22|    [70, 32, 62, 98]|
|     1|             [9, 21]|
|    13|            [93, 72]|
|    16|     [5, 54, 47, 29]|
|     3|    [52, 88, 36, 29]|
|    20|                [77]|
|     5|            [50, 68]|
|    19|                [90]|
|    17|[56, 17, 90, 22, ...|
|     9|        [49, 64, 43]|
|     4|                [87]|
|     8|[60, 31, 67, 58, 95]|
|    23|            [49, 87]|
|     7|            [31, 25]|
|    24|[63, 52, 32, 72, 90]|
|    29|        [19, 38, 94]|
+------+--------------------+
only showing top 20 rows



In [61]:
per_user_predictions = predictions.orderBy(F.col("userId"), F.desc("prediction"))\
    .groupBy("userId").agg(F.expr("collect_list(movieId) as movies"))

In [62]:
per_user_predictions.show()

+------+--------------------+
|userId|              movies|
+------+--------------------+
|    28|[15, 19, 54, 38, ...|
|    26|[6, 95, 48, 20, 3...|
|    27|[27, 71, 60, 10, ...|
|    12|[7, 60, 16, 31, 6...|
|    22|[70, 32, 55, 62, ...|
|     1|[2, 13, 82, 14, 9...|
|    13|[52, 62, 93, 72, ...|
|     6|[87, 54, 21, 49, ...|
|    16|[2, 93, 72, 29, 6...|
|     3|[66, 36, 52, 65, ...|
|    20|[4, 84, 55, 91, 6...|
|     5|[81, 95, 51, 50, ...|
|    19|[90, 14, 72, 50, ...|
|    15|[25, 17, 2, 14, 5...|
|     9|[43, 49, 25, 83, ...|
|    17|[90, 82, 17, 55, ...|
|     4|[85, 20, 87, 11, ...|
|     8|[58, 67, 95, 60, ...|
|    23|[49, 66, 33, 87, ...|
|     7|[25, 2, 7, 61, 31...|
+------+--------------------+
only showing top 20 rows



In [63]:
per_user_actual_vpred = per_user_actual.join(per_user_predictions, ["userId"])\
    .rdd.map(lambda row: (row[1], row[2][:15]))
ranks = RankingMetrics(per_user_actual_vpred)

In [64]:
ranks.meanAveragePrecision

0.252172827172827

In [65]:
ranks.precisionAt(5)

0.46923076923076934