In [1]:
from __future__ import print_function
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("analyse-movie-data").getOrCreate()
spark

In [2]:
df = (
        spark.read
        .format("csv")
        .option("header", "true")
        .option("inferSchema","true")
        .option("mode","DROPMALFORMED")
        .load('data.csv')
    )

In [3]:
df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [4]:
df.show(truncate=False)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure-Animati...|
|      2|      Jumanji (1995)|Adventure-Childre...|
|      3|Grumpier Old Men ...|      Comedy-Romance|
|      4|Waiting to Exhale...|Comedy-Drama-Romance|
|      5|    Jumanji-1 (2019)|Adventure-Childre...|
|      6|    Jumanji-2 (2020)|Adventure-Childre...|
+-------+--------------------+--------------------+



In [5]:
import pyspark.sql.functions as f
movie_title = f.regexp_extract(f.col('title'),'^(.+)\s\(([0-9]{4})\)$', 1)
year = f.regexp_extract(f.col('title'),'^(.+)\s\(([0-9]{4})\)$', 2).cast('int')
genres = f.explode(f.split(df.genres,"-"))
decade = f.concat_ws('-', (f.floor(f.col("year")/10)*10), (f.floor(f.col("year")/10)*10) + f.lit(9))
movies = (
    df.withColumn('movie_title', movie_title)
    .withColumn('year', year)
    .withColumn('genres', genres)
    .withColumn("decade", decade)
)

In [6]:
movies.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- decade: string (nullable = false)



In [7]:
movies.show(truncate=False)

+-------+------------------------+---------+-----------------+----+---------+
|movieId|title                   |genres   |movie_title      |year|decade   |
+-------+------------------------+---------+-----------------+----+---------+
|1      |Toy Story (1995)        |Adventure|Toy Story        |1995|1990-1999|
|1      |Toy Story (1995)        |Animation|Toy Story        |1995|1990-1999|
|1      |Toy Story (1995)        |Children |Toy Story        |1995|1990-1999|
|1      |Toy Story (1995)        |Comedy   |Toy Story        |1995|1990-1999|
|1      |Toy Story (1995)        |Fantasy  |Toy Story        |1995|1990-1999|
|2      |Jumanji (1995)          |Adventure|Jumanji          |1995|1990-1999|
|2      |Jumanji (1995)          |Children |Jumanji          |1995|1990-1999|
|2      |Jumanji (1995)          |Fantasy  |Jumanji          |1995|1990-1999|
|3      |Grumpier Old Men (1995) |Comedy   |Grumpier Old Men |1995|1990-1999|
|3      |Grumpier Old Men (1995) |Romance  |Grumpier Old Men |19

In [8]:
movies.groupby('decade','genres').count().orderBy('decade','genres').show()

+---------+-----------+-----+
|   decade|     genres|count|
+---------+-----------+-----+
|         |Horror53133|    1|
|1990-1999|  Adventure|    2|
|1990-1999|  Animation|    1|
|1990-1999|   Children|    2|
|1990-1999|     Comedy|    3|
|1990-1999|      Drama|    1|
|1990-1999|    Fantasy|    2|
|1990-1999|    Romance|    2|
|2010-2019|  Adventure|    1|
|2010-2019|   Children|    1|
|2010-2019|    Fantasy|    1|
|2020-2029|  Adventure|    1|
|2020-2029|   Children|    1|
|2020-2029|    Fantasy|    1|
+---------+-----------+-----+



In [9]:
(
    spark.read
        .format("csv")
        .option("header", "true")
        .option("inferSchema","true")        
        .load('data.csv')
).show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure-Animati...|
|   null|               Them |         Horror53133|
|      2|      Jumanji (1995)|Adventure-Childre...|
|      3|Grumpier Old Men ...|      Comedy-Romance|
|      4|Waiting to Exhale...|Comedy-Drama-Romance|
|      5|    Jumanji-1 (2019)|Adventure-Childre...|
|      6|    Jumanji-2 (2020)|Adventure-Childre...|
+-------+--------------------+--------------------+

