# Exploration des données de films

In [4]:
from pyspark.sql.functions import rand
from pyspark.sql.types import IntegerType

from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

import os
# Move the working directory one level above the directory the kernel started in.
# The start directory is remembered the first time this cell runs, so executing the
# cell again lands in the same place instead of climbing one more level each time
# (which would break the relative data paths used later).
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, '..'))

In [6]:
# Load the ratings CSV: the first row supplies the column names and the column
# types are inferred from the data.
ratings_movies = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../../data/ratings_movies.csv')
)

In [7]:
# Preview the first 20 rows of the raw ratings.
ratings_movies.show(n=20)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    110|   1.0|1425941529|
|     1|    147|   4.5|1425942435|
|     1|    858|   5.0|1425941523|
|     1|   1221|   5.0|1425941546|
|     1|   1246|   5.0|1425941556|
|     1|   1968|   4.0|1425942148|
|     1|   2762|   4.5|1425941300|
|     1|   2918|   5.0|1425941593|
|     1|   2959|   4.0|1425941601|
|     1|   4226|   4.0|1425942228|
|     1|   4878|   5.0|1425941434|
|     1|   5577|   5.0|1425941397|
|     1|  33794|   4.0|1425942005|
|     1|  54503|   3.5|1425941313|
|     1|  58559|   4.0|1425942007|
|     1|  59315|   5.0|1425941502|
|     1|  68358|   5.0|1425941464|
|     1|  69844|   5.0|1425942139|
|     1|  73017|   5.0|1425942699|
|     1|  81834|   5.0|1425942133|
+------+-------+------+----------+
only showing top 20 rows



In [8]:
# Print the column names, types and nullability that schema inference produced.
ratings_movies.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [9]:
# Replace the real userId with a pseudo-random value drawn from [1, 201).
# Without a seed, rand() picks a fresh seed every time this cell is executed, so
# the synthetic ids (and every statistic or split derived from them) differed from
# run to run. A fixed seed keeps them stable across runs. Spark still documents
# rand as non-deterministic in the general case, e.g. if the input partitioning
# changes.
df1 = ratings_movies.withColumn('userId', rand(seed=500) * 200 + 1)

In [10]:
# Convert the floating-point userId column to an integer column, keeping its name.
df1 = df1.withColumn('userId', df1.userId.cast(IntegerType()))

In [11]:
# Cast movieId to integer under the same column name. The schema printed earlier
# already lists movieId as integer, so this acts as an explicit guarantee.
df1 = df1.withColumn('movieId', df1.movieId.cast(IntegerType()))

In [12]:
# Preview the first 20 rows after the userId replacement and the integer casts.
df1.show(n=20)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|   176|    110|   1.0|1425941529|
|   184|    147|   4.5|1425942435|
|    45|    858|   5.0|1425941523|
|   108|   1221|   5.0|1425941546|
|   150|   1246|   5.0|1425941556|
|    10|   1968|   4.0|1425942148|
|     2|   2762|   4.5|1425941300|
|     2|   2918|   5.0|1425941593|
|   128|   2959|   4.0|1425941601|
|    25|   4226|   4.0|1425942228|
|    53|   4878|   5.0|1425941434|
|   136|   5577|   5.0|1425941397|
|   130|  33794|   4.0|1425942005|
|   125|  54503|   3.5|1425941313|
|    49|  58559|   4.0|1425942007|
|    38|  59315|   5.0|1425941502|
|    42|  68358|   5.0|1425941464|
|   186|  69844|   5.0|1425942139|
|    57|  73017|   5.0|1425942699|
|    62|  81834|   5.0|1425942133|
+------+-------+------+----------+
only showing top 20 rows



In [13]:
# Summary statistics (count, mean, stddev, min, max) for the userId column.
df1.describe('userId').show()

+-------+------------------+
|summary|            userId|
+-------+------------------+
|  count|          26024289|
|   mean|100.47504437104891|
| stddev|57.729545723084655|
|    min|                 1|
|    max|               200|
+-------+------------------+



In [14]:
# Summary statistics (count, mean, stddev, min, max) for the movieId column.
df1.describe('movieId').show()

+-------+------------------+
|summary|           movieId|
+-------+------------------+
|  count|          26024289|
|   mean|15849.109677040553|
| stddev|31085.257531391508|
|    min|                 1|
|    max|            176275|
+-------+------------------+



In [15]:
# Summary statistics (count, mean, stddev, min, max) for the rating column.
df1.describe('rating').show()

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|          26024289|
|   mean|3.5280903543608817|
| stddev|1.0654427636662405|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+



In [16]:
# 70/30 train/test partition of the ratings; the fixed seed pins the sampling.
train, test = df1.randomSplit(weights=[0.7, 0.3], seed=500)

In [17]:
# Report the training split as a (row count, column count) tuple.
train_shape = (train.count(), len(train.columns))
print('Shape train data:', train_shape)

Shape train data: (18219215, 4)


In [18]:
# Report the test split as a (row count, column count) tuple, labelled as the
# test data so it is not confused with the training shape printed by the cell above.
print('Shape test data:', (test.count(), len(test.columns)))

Shape train data: (7805074, 4)
