In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, regexp_extract, regexp_replace, explode, split
from pyspark.sql.window import Window

In [2]:
# Create (or reuse, if one is already running) the SparkSession for this
# notebook, registered under the application name "testing".
spark = (
    SparkSession.builder
    .appName("testing")
    .getOrCreate()
)

In [3]:
# Smoke-test DataFrame: 1000 consecutive integers starting at 0, with the
# single column renamed to "number".
myrange = (
    spark.range(1000)
    .toDF("number")
)

In [4]:
# Preview myrange; show() with no arguments prints only the first 20 rows.
myrange.show()

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
|     5|
|     6|
|     7|
|     8|
|     9|
|    10|
|    11|
|    12|
|    13|
|    14|
|    15|
|    16|
|    17|
|    18|
|    19|
+------+
only showing top 20 rows



In [5]:
# Display the SparkSession object created above as this cell's output.
spark

In [6]:
# Count the rows in myrange; expected to be 1000, matching spark.range(1000).
myrange.count()

1000

In [7]:
# Location of the raw movies file, relative to the notebook's working directory.
file_path = "Data/raw/movies.dat"
# Explicit DDL schema: the reader applies these names/types instead of inferring them.
schema = "movie_id INT, title STRING, genres STRING"

# Load movies.dat as a "::"-delimited file (a multi-character separator).
# NOTE(review): the file's text encoding is not specified here, so the reader's
# default is used -- confirm it matches the source file if titles contain
# non-ASCII characters.
movies_data = (
    spark.read
    .schema(schema)
    .option("sep", "::")
    .csv(file_path)
)

In [8]:
# Preview the freshly loaded movies; long cell values are truncated with "...".
movies_data.show()

+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Animation|Childre...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|        Comedy|Drama|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)|Adventure|Children's|
|       9| Sudden Death (1995)|              Action|
|      10|    GoldenEye (1995)|Action|Adventure|...|
|      11|American Presiden...|Comedy|Drama|Romance|
|      12|Dracula: Dead and...|       Comedy|Horror|
|      13|        Balto (1995)|Animation|Children's|
|      14|        Nixon (1995)|               Drama|
|      15|Cutthroat Island ...|Action|Adventure|...|
|      16|       Casino (1995)|      Drama|Thr

In [9]:
# Row count of the raw movies data (one row per movie at this stage).
movies_data.count()

3883

In [10]:
# Echo the DataFrame to display its column names and types (no rows are printed).
movies_data

DataFrame[movie_id: int, title: string, genres: string]

In [11]:
# Extract the four-digit year from the first "(YYYY)" group in the title into a
# new string column 'release_year' (regexp_extract returns "" when nothing matches).
# The pattern is a raw string so that \( and \d reach the regex engine as written
# instead of being flagged by Python as invalid string escape sequences.
# NOTE: run this before the cell that strips "(YYYY)" from the title; re-running
# it afterwards would overwrite release_year with empty strings.
movies_data = movies_data.withColumn(
    "release_year",
    regexp_extract("title", r"\((\d{4})\)", 1),
)

In [12]:
# Preview the data with the newly added release_year column.
movies_data.show()

+--------+--------------------+--------------------+------------+
|movie_id|               title|              genres|release_year|
+--------+--------------------+--------------------+------------+
|       1|    Toy Story (1995)|Animation|Childre...|        1995|
|       2|      Jumanji (1995)|Adventure|Childre...|        1995|
|       3|Grumpier Old Men ...|      Comedy|Romance|        1995|
|       4|Waiting to Exhale...|        Comedy|Drama|        1995|
|       5|Father of the Bri...|              Comedy|        1995|
|       6|         Heat (1995)|Action|Crime|Thri...|        1995|
|       7|      Sabrina (1995)|      Comedy|Romance|        1995|
|       8| Tom and Huck (1995)|Adventure|Children's|        1995|
|       9| Sudden Death (1995)|              Action|        1995|
|      10|    GoldenEye (1995)|Action|Adventure|...|        1995|
|      11|American Presiden...|Comedy|Drama|Romance|        1995|
|      12|Dracula: Dead and...|       Comedy|Horror|        1995|
|      13|

In [13]:
# Remove every "(YYYY)" year marker, together with any whitespace around it,
# from the title. The column is overwritten in place, so the year must already
# have been extracted into release_year before this cell runs.
# The pattern is a raw string so that \s, \( and \d reach the regex engine as
# written instead of being flagged by Python as invalid string escape sequences.
movies_data = movies_data.withColumn(
    "title",
    regexp_replace("title", r"\s*\(\d{4}\)\s*", ""),
)

In [14]:
# Preview the data after the "(YYYY)" suffix has been stripped from the titles.
movies_data.show()

+--------+--------------------+--------------------+------------+
|movie_id|               title|              genres|release_year|
+--------+--------------------+--------------------+------------+
|       1|           Toy Story|Animation|Childre...|        1995|
|       2|             Jumanji|Adventure|Childre...|        1995|
|       3|    Grumpier Old Men|      Comedy|Romance|        1995|
|       4|   Waiting to Exhale|        Comedy|Drama|        1995|
|       5|Father of the Bri...|              Comedy|        1995|
|       6|                Heat|Action|Crime|Thri...|        1995|
|       7|             Sabrina|      Comedy|Romance|        1995|
|       8|        Tom and Huck|Adventure|Children's|        1995|
|       9|        Sudden Death|              Action|        1995|
|      10|           GoldenEye|Action|Adventure|...|        1995|
|      11|American Presiden...|Comedy|Drama|Romance|        1995|
|      12|Dracula: Dead and...|       Comedy|Horror|        1995|
|      13|

In [15]:
# Turn the pipe-delimited genres string into one row per (movie, genre) pair.
# The "|" is escaped because split() treats its pattern as a regular expression.
genre_array = split(col("genres"), "\\|")
movies_data = movies_data.withColumn("genres", explode(genre_array))

In [16]:
# Preview the exploded data: each movie now appears once per genre.
movies_data.show()

+--------+--------------------+----------+------------+
|movie_id|               title|    genres|release_year|
+--------+--------------------+----------+------------+
|       1|           Toy Story| Animation|        1995|
|       1|           Toy Story|Children's|        1995|
|       1|           Toy Story|    Comedy|        1995|
|       2|             Jumanji| Adventure|        1995|
|       2|             Jumanji|Children's|        1995|
|       2|             Jumanji|   Fantasy|        1995|
|       3|    Grumpier Old Men|    Comedy|        1995|
|       3|    Grumpier Old Men|   Romance|        1995|
|       4|   Waiting to Exhale|    Comedy|        1995|
|       4|   Waiting to Exhale|     Drama|        1995|
|       5|Father of the Bri...|    Comedy|        1995|
|       6|                Heat|    Action|        1995|
|       6|                Heat|     Crime|        1995|
|       6|                Heat|  Thriller|        1995|
|       7|             Sabrina|    Comedy|      

In [17]:
# Row count after exploding genres (one row per movie/genre pair, so it is
# larger than the raw movie count).
movies_data.count()

6408

In [97]:
# Shut down the SparkSession; cells that use `spark` will not work after this
# until the session is created again.
spark.stop()