# Recommendation de films pour les clients du centre commercial 

In [1]:
from pyspark.sql.functions import rand
from pyspark.sql.types import IntegerType

from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
ratings_movies = spark.read.csv('./data/ratings_movies.csv', inferSchema=True, header=True)

In [3]:
ratings_movies.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    110|   1.0|1425941529|
|     1|    147|   4.5|1425942435|
|     1|    858|   5.0|1425941523|
|     1|   1221|   5.0|1425941546|
|     1|   1246|   5.0|1425941556|
|     1|   1968|   4.0|1425942148|
|     1|   2762|   4.5|1425941300|
|     1|   2918|   5.0|1425941593|
|     1|   2959|   4.0|1425941601|
|     1|   4226|   4.0|1425942228|
|     1|   4878|   5.0|1425941434|
|     1|   5577|   5.0|1425941397|
|     1|  33794|   4.0|1425942005|
|     1|  54503|   3.5|1425941313|
|     1|  58559|   4.0|1425942007|
|     1|  59315|   5.0|1425941502|
|     1|  68358|   5.0|1425941464|
|     1|  69844|   5.0|1425942139|
|     1|  73017|   5.0|1425942699|
|     1|  81834|   5.0|1425942133|
+------+-------+------+----------+
only showing top 20 rows



In [4]:
ratings_movies.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [42]:
df1 = ratings_movies.withColumn('userId', (rand()*200+1))

In [43]:
df1 = df1.withColumn('userId', df1['userId'].cast(IntegerType()).alias('userId'))

In [44]:
df1 = df1.withColumn('movieId', df1['movieId'].cast(IntegerType()).alias('movieId'))

In [45]:
df1.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|   128|    110|   1.0|1425941529|
|    61|    147|   4.5|1425942435|
|    43|    858|   5.0|1425941523|
|   184|   1221|   5.0|1425941546|
|    30|   1246|   5.0|1425941556|
|   149|   1968|   4.0|1425942148|
|   133|   2762|   4.5|1425941300|
|   125|   2918|   5.0|1425941593|
|   101|   2959|   4.0|1425941601|
|   122|   4226|   4.0|1425942228|
|    39|   4878|   5.0|1425941434|
|   164|   5577|   5.0|1425941397|
|   117|  33794|   4.0|1425942005|
|    19|  54503|   3.5|1425941313|
|    77|  58559|   4.0|1425942007|
|   120|  59315|   5.0|1425941502|
|    47|  68358|   5.0|1425941464|
|    85|  69844|   5.0|1425942139|
|    68|  73017|   5.0|1425942699|
|    50|  81834|   5.0|1425942133|
+------+-------+------+----------+
only showing top 20 rows



In [46]:
df1.select('userId').describe().show()

+-------+------------------+
|summary|            userId|
+-------+------------------+
|  count|          26024289|
|   mean|100.49032136862606|
| stddev|57.742238530022455|
|    min|                 1|
|    max|               200|
+-------+------------------+



In [47]:
df1.select('movieId').describe().show()

+-------+------------------+
|summary|           movieId|
+-------+------------------+
|  count|          26024289|
|   mean|15849.109677040553|
| stddev|31085.257531391508|
|    min|                 1|
|    max|            176275|
+-------+------------------+



In [48]:
df1.select('rating').describe().show()

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|          26024289|
|   mean|3.5280903543608817|
| stddev|1.0654427636662405|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+



In [12]:
train, test = df1.randomSplit([0.7, 0.3], seed = 500)

In [13]:
print('Shape train data:',(train.count(), len(train.columns)))

Shape train data: (18219215, 4)


In [14]:
print('Shape train data:',(test.count(), len(test.columns)))

Shape train data: (7805074, 4)


In [15]:
model = ALS(userCol='userId', itemCol='movieId', ratingCol='rating').fit(train)

In [16]:
predictions = model.transform(test)

In [17]:
predictions.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|   148|    148|   1.0| 853826411| 2.7822294|
|    31|    148|   3.0| 835810175| 2.7808115|
|    65|    148|   2.0| 976870439| 2.7814708|
|    53|    148|   2.0|1030400425| 2.7806585|
|   108|    148|   3.0| 860111242|  2.777372|
|   155|    148|   3.0| 864869040| 2.7774699|
|    34|    148|   5.0| 832703670| 2.7820292|
|    34|    148|   3.0| 970170090| 2.7820292|
|   193|    148|   3.0| 945124706|  2.782248|
|   193|    148|   4.0|1015193260|  2.782248|
|   101|    148|   3.0| 851452074| 2.7759292|
|   115|    148|   3.0|1058985403| 2.7790785|
|   126|    148|   2.0| 848773886| 2.7834103|
|   126|    148|   4.0|1500217059| 2.7834103|
|    27|    148|   4.0| 828021782| 2.7800539|
|    44|    148|   4.0| 979367598| 2.7807093|
|   103|    148|   4.0| 832057800| 2.7820318|
|    12|    148|   1.0|1047357247|  2.781437|
|   122|    148|   5.0| 842004154|

In [19]:
predictions = predictions.na.drop()

In [20]:
evaluator = RegressionEvaluator(metricName="rmse",labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print('rmse =',rmse)

rmse = 0.9767405099735131


In [23]:
print('Shape of predictions:',(predictions.count(), len(predictions.columns)))

Shape of predictions: (7801510, 5)


In [36]:
movies = spark.read.csv('./data/movies_metadata.csv', inferSchema=True, header=True, sep=',')

In [38]:
movies.show(2)

+-----+---------------------+--------+--------------------+--------------------+----+---------+-----------------+--------------+--------------------+----------+--------------------+--------------------+--------------------+------------+---------+-------+--------------------+--------+--------------------+---------+-----+------------+----------+
|adult|belongs_to_collection|  budget|              genres|            homepage|  id|  imdb_id|original_language|original_title|            overview|popularity|         poster_path|production_companies|production_countries|release_date|  revenue|runtime|    spoken_languages|  status|             tagline|    title|video|vote_average|vote_count|
+-----+---------------------+--------+--------------------+--------------------+----+---------+-----------------+--------------+--------------------+----------+--------------------+--------------------+--------------------+------------+---------+-------+--------------------+--------+--------------------+---

In [61]:
recommendation = model.recommendForItemSubset(test,numUsers=3)

In [66]:
df2 = recommendation.join(movies, movies.id == recommendation.movieId).select(recommendation["*"],movies["title"])

In [68]:
df2.show(truncate=False)

+-------+-----------------------------------------------------+-----------------------------------------------------------------------------------+
|movieId|recommendations                                      |title                                                                              |
+-------+-----------------------------------------------------+-----------------------------------------------------------------------------------+
|148    |[[83, 2.7878337], [169, 2.7877886], [168, 2.787404]] |The Secret Life of Words                                                           |
|471    |[[83, 3.4891572], [169, 3.4891005], [168, 3.4886198]]|Bandyta                                                                            |
|496    |[[83, 3.1753914], [169, 3.17534], [168, 3.1749024]]  |Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan|
|833    |[[83, 2.6024714], [169, 2.6024292], [168, 2.6020706]]|Umberto D.                                       