In [1]:
from pyspark.sql import SparkSession

def build_spark_session(app_name, memory='4g', executors=4):
    """Return a SparkSession configured for this notebook.

    Args:
        app_name: Name passed to the builder's ``appName``.
        memory: Value stored under ``spark.executor.memory``.
        executors: Value stored under ``spark.executor.instances``.

    Returns:
        Whatever ``getOrCreate()`` yields for the configured builder.
    """
    session_builder = SparkSession.builder.appName(app_name)
    session_builder = session_builder.config('spark.executor.memory', memory)
    session_builder = session_builder.config('spark.executor.instances', executors)
    return session_builder.getOrCreate()

In [2]:
# Obtain the session for this notebook, keeping the helper's default executor settings.
spark_session = build_spark_session('ok-google')

from pyspark.sql import functions as f

In [3]:
# MovieLens data: load the ratings CSV, using its first line as the column names.
# NOTE(review): this is an absolute, machine-specific path; the cell will not run
# on another machine without editing it.
df = (
    spark_session.read
    .option('header', True)
    .csv("C:\\Users\\DV66B52\\Documents\\Università\\ERASMUS\\Dauphine\\Corsi\\Big Data\\FinalProject\\ml-latest-small\\ratings.csv")
)

In [5]:
# Fetch the first row of the ratings DataFrame as a quick look at what was loaded.
df.head()

Row(userId='1', movieId='31', rating='2.5', timestamp='1260759144')

In [26]:
# Print column names and types. No schema was given to the CSV reader, so every
# column (including rating and timestamp) comes back as a string.
df.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [4]:
# Unique user and movie identifiers that occur in the ratings table.
users = df.select(df['userId']).distinct()
movies = df.select(df['movieId']).distinct()

In [5]:
# Ratings: list the distinct rating values present (still strings at this point).
(
    df.select(df['rating'])
    .distinct()
    .collect()
)

[Row(rating='1.0'),
 Row(rating='4.5'),
 Row(rating='2.5'),
 Row(rating='3.5'),
 Row(rating='5.0'),
 Row(rating='0.5'),
 Row(rating='4.0'),
 Row(rating='1.5'),
 Row(rating='2.0'),
 Row(rating='3.0')]

In [6]:
# Keep the two id columns as they are and convert the string rating to a double.
rating_as_double = df['rating'].cast('double')
df_0 = df.select('userId', 'movieId', rating_as_double)

In [18]:
# Binarise the ratings: values below 2.5 become 0, everything else becomes 1.
# `show()` only prints and returns None, so it must not be chained onto the
# assignment; otherwise `df_bin` would be bound to None instead of the DataFrame.
# NOTE(review): the derived column keeps Spark's auto-generated name
# ("CASE WHEN ... END"); consider adding an alias before using it downstream.
df_bin = df_0.select(['userId', 'movieId'] + [f.when(df_0['rating'] < 2.5, 0).otherwise(1)])
df_bin.show(n=10)


+------+-------+------------------------------------------+
|userId|movieId|CASE WHEN (rating < 2.5) THEN 0 ELSE 1 END|
+------+-------+------------------------------------------+
|     1|     31|                                         1|
|     1|   1029|                                         1|
|     1|   1061|                                         1|
|     1|   1129|                                         0|
|     1|   1172|                                         1|
|     1|   1263|                                         0|
|     1|   1287|                                         0|
|     1|   1293|                                         0|
|     1|   1339|                                         1|
|     1|   1343|                                         0|
+------+-------+------------------------------------------+
only showing top 10 rows



In [79]:
# Show the first 15 rows of the raw ratings DataFrame (uncast, timestamp included).
df.show(n=15)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     31|   2.5|1260759144|
|     1|   1029|   3.0|1260759179|
|     1|   1061|   3.0|1260759182|
|     1|   1129|   2.0|1260759185|
|     1|   1172|   4.0|1260759205|
|     1|   1263|   2.0|1260759151|
|     1|   1287|   2.0|1260759187|
|     1|   1293|   2.0|1260759148|
|     1|   1339|   3.5|1260759125|
|     1|   1343|   2.0|1260759131|
|     1|   1371|   2.5|1260759135|
|     1|   1405|   1.0|1260759203|
|     1|   1953|   4.0|1260759191|
|     1|   2105|   4.0|1260759139|
|     1|   2150|   3.0|1260759194|
+------+-------+------+----------+
only showing top 15 rows

