In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, regexp_extract, regexp_replace, explode, split
from pyspark.sql.window import Window

In [3]:
spark = SparkSession.builder.appName("testing").getOrCreate()

In [4]:
spark

In [5]:
# Movies data path and schema
movies_file_path = "d:/PROJECTS/Movie_Analytics/Data/raw/movies.dat"
movies_schema = "movie_id INT, title STRING, genres STRING"

# Users data path and schema
users_file_path = "d:/PROJECTS/Movie_Analytics/Data/raw/users.dat"
users_schema = "UserID INT, Gender STRING, Age INT, Occupation INT, ZipCode INT"

# Ratings data path and schema
ratings_schema = "UserID INT, MovieID INT, Rating INT, Timestamp LONG"


# Reading the data from the movies.dat file
movies_data = spark.read.csv(movies_file_path, sep="::", schema=movies_schema)

# Reading the data from the users.dat file
users_data = spark.read.csv(users_file_path, sep="::",schema=users_schema)

# Reading the data from the ratings.dat files
ratings_data = spark.read.csv("d:/PROJECTS/Movie_Analytics/Data/raw/ratings.dat",sep = "::",schema=ratings_schema)

In [6]:
movies_data.show()

+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Animation|Childre...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|        Comedy|Drama|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)|Adventure|Children's|
|       9| Sudden Death (1995)|              Action|
|      10|    GoldenEye (1995)|Action|Adventure|...|
|      11|American Presiden...|Comedy|Drama|Romance|
|      12|Dracula: Dead and...|       Comedy|Horror|
|      13|        Balto (1995)|Animation|Children's|
|      14|        Nixon (1995)|               Drama|
|      15|Cutthroat Island ...|Action|Adventure|...|
|      16|       Casino (1995)|      Drama|Thr

In [7]:
movies_data.count()

3883

In [8]:
movies_data

DataFrame[movie_id: int, title: string, genres: string]

In [9]:
movies_data.show()

+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Animation|Childre...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|        Comedy|Drama|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)|Adventure|Children's|
|       9| Sudden Death (1995)|              Action|
|      10|    GoldenEye (1995)|Action|Adventure|...|
|      11|American Presiden...|Comedy|Drama|Romance|
|      12|Dracula: Dead and...|       Comedy|Horror|
|      13|        Balto (1995)|Animation|Children's|
|      14|        Nixon (1995)|               Drama|
|      15|Cutthroat Island ...|Action|Adventure|...|
|      16|       Casino (1995)|      Drama|Thr

In [10]:
# Extract the year from the title and create a new column 'year'
movies_data = movies_data.withColumn("release_year", regexp_extract("title", "\((\d{4})\)", 1))

In [11]:
movies_data.show()

+--------+--------------------+--------------------+------------+
|movie_id|               title|              genres|release_year|
+--------+--------------------+--------------------+------------+
|       1|    Toy Story (1995)|Animation|Childre...|        1995|
|       2|      Jumanji (1995)|Adventure|Childre...|        1995|
|       3|Grumpier Old Men ...|      Comedy|Romance|        1995|
|       4|Waiting to Exhale...|        Comedy|Drama|        1995|
|       5|Father of the Bri...|              Comedy|        1995|
|       6|         Heat (1995)|Action|Crime|Thri...|        1995|
|       7|      Sabrina (1995)|      Comedy|Romance|        1995|
|       8| Tom and Huck (1995)|Adventure|Children's|        1995|
|       9| Sudden Death (1995)|              Action|        1995|
|      10|    GoldenEye (1995)|Action|Adventure|...|        1995|
|      11|American Presiden...|Comedy|Drama|Romance|        1995|
|      12|Dracula: Dead and...|       Comedy|Horror|        1995|
|      13|

In [12]:
# Remove the (year) from the title
movies_data = movies_data.withColumn("title", regexp_replace("title", "\s*\(\d{4}\)\s*", ""))

In [13]:
movies_data.show()

+--------+--------------------+--------------------+------------+
|movie_id|               title|              genres|release_year|
+--------+--------------------+--------------------+------------+
|       1|           Toy Story|Animation|Childre...|        1995|
|       2|             Jumanji|Adventure|Childre...|        1995|
|       3|    Grumpier Old Men|      Comedy|Romance|        1995|
|       4|   Waiting to Exhale|        Comedy|Drama|        1995|
|       5|Father of the Bri...|              Comedy|        1995|
|       6|                Heat|Action|Crime|Thri...|        1995|
|       7|             Sabrina|      Comedy|Romance|        1995|
|       8|        Tom and Huck|Adventure|Children's|        1995|
|       9|        Sudden Death|              Action|        1995|
|      10|           GoldenEye|Action|Adventure|...|        1995|
|      11|American Presiden...|Comedy|Drama|Romance|        1995|
|      12|Dracula: Dead and...|       Comedy|Horror|        1995|
|      13|

In [14]:
# Extract and explode genres into separate rows
movies_data = movies_data.withColumn("genres", explode(split("genres", "\\|")))

In [15]:
movies_data.show()

+--------+--------------------+----------+------------+
|movie_id|               title|    genres|release_year|
+--------+--------------------+----------+------------+
|       1|           Toy Story| Animation|        1995|
|       1|           Toy Story|Children's|        1995|
|       1|           Toy Story|    Comedy|        1995|
|       2|             Jumanji| Adventure|        1995|
|       2|             Jumanji|Children's|        1995|
|       2|             Jumanji|   Fantasy|        1995|
|       3|    Grumpier Old Men|    Comedy|        1995|
|       3|    Grumpier Old Men|   Romance|        1995|
|       4|   Waiting to Exhale|    Comedy|        1995|
|       4|   Waiting to Exhale|     Drama|        1995|
|       5|Father of the Bri...|    Comedy|        1995|
|       6|                Heat|    Action|        1995|
|       6|                Heat|     Crime|        1995|
|       6|                Heat|  Thriller|        1995|
|       7|             Sabrina|    Comedy|      

In [16]:
movies_data

DataFrame[movie_id: int, title: string, genres: string, release_year: string]

In [17]:
movies_data.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = false)
 |-- release_year: string (nullable = true)



In [18]:
movies_data.count()

6408

In [19]:
users_data.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- ZipCode: integer (nullable = true)



In [20]:
users_data

DataFrame[UserID: int, Gender: string, Age: int, Occupation: int, ZipCode: int]

In [21]:
users_data.show()

+------+------+---+----------+-------+
|UserID|Gender|Age|Occupation|ZipCode|
+------+------+---+----------+-------+
|     1|     F|  1|        10|  48067|
|     2|     M| 56|        16|  70072|
|     3|     M| 25|        15|  55117|
|     4|     M| 45|         7|   2460|
|     5|     M| 25|        20|  55455|
|     6|     F| 50|         9|  55117|
|     7|     M| 35|         1|   6810|
|     8|     M| 25|        12|  11413|
|     9|     M| 25|        17|  61614|
|    10|     F| 35|         1|  95370|
|    11|     F| 25|         1|   4093|
|    12|     M| 25|        12|  32793|
|    13|     M| 45|         1|  93304|
|    14|     M| 35|         0|  60126|
|    15|     M| 25|         7|  22903|
|    16|     F| 35|         0|  20670|
|    17|     M| 50|         1|  95350|
|    18|     F| 18|         3|  95825|
|    19|     M|  1|        10|  48073|
|    20|     M| 25|        14|  55113|
+------+------+---+----------+-------+
only showing top 20 rows



In [22]:
ratings_data.show()

+------+-------+------+---------+
|UserID|MovieID|Rating|Timestamp|
+------+-------+------+---------+
|     1|   1193|     5|978300760|
|     1|    661|     3|978302109|
|     1|    914|     3|978301968|
|     1|   3408|     4|978300275|
|     1|   2355|     5|978824291|
|     1|   1197|     3|978302268|
|     1|   1287|     5|978302039|
|     1|   2804|     5|978300719|
|     1|    594|     4|978302268|
|     1|    919|     4|978301368|
|     1|    595|     5|978824268|
|     1|    938|     4|978301752|
|     1|   2398|     4|978302281|
|     1|   2918|     4|978302124|
|     1|   1035|     5|978301753|
|     1|   2791|     4|978302188|
|     1|   2687|     3|978824268|
|     1|   2018|     4|978301777|
|     1|   3105|     5|978301713|
|     1|   2797|     4|978302039|
+------+-------+------+---------+
only showing top 20 rows



In [23]:
ratings_data.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- MovieID: integer (nullable = true)
 |-- Rating: integer (nullable = true)
 |-- Timestamp: long (nullable = true)



In [24]:
movies_data.createOrReplaceTempView("movies")
users_data.createOrReplaceTempView("users")
ratings_data.createOrReplaceTempView("ratings")


In [25]:
spark.sql("select * from users").show()

+------+------+---+----------+-------+
|UserID|Gender|Age|Occupation|ZipCode|
+------+------+---+----------+-------+
|     1|     F|  1|        10|  48067|
|     2|     M| 56|        16|  70072|
|     3|     M| 25|        15|  55117|
|     4|     M| 45|         7|   2460|
|     5|     M| 25|        20|  55455|
|     6|     F| 50|         9|  55117|
|     7|     M| 35|         1|   6810|
|     8|     M| 25|        12|  11413|
|     9|     M| 25|        17|  61614|
|    10|     F| 35|         1|  95370|
|    11|     F| 25|         1|   4093|
|    12|     M| 25|        12|  32793|
|    13|     M| 45|         1|  93304|
|    14|     M| 35|         0|  60126|
|    15|     M| 25|         7|  22903|
|    16|     F| 35|         0|  20670|
|    17|     M| 50|         1|  95350|
|    18|     F| 18|         3|  95825|
|    19|     M|  1|        10|  48073|
|    20|     M| 25|        14|  55113|
+------+------+---+----------+-------+
only showing top 20 rows

