In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, regexp_extract, regexp_replace, explode, split
from pyspark.sql.window import Window

In [2]:
spark = SparkSession.builder.appName("testing").getOrCreate()

In [3]:
spark

In [4]:
# Movies data path and schema
movies_file_path = "d:/PROJECTS/Movie_Analytics/Data/raw/movies.dat"
movies_schema = "movie_id INT, title STRING, genres STRING"

# Users data path and schema
users_file_path = "d:/PROJECTS/Movie_Analytics/Data/raw/users.dat"
users_schema = "UserID INT, Gender STRING, Age INT, Occupation INT, ZipCode INT"

# Ratings data path and schema
ratings_schema = "UserID INT, MovieID INT, Rating INT, Timestamp INT"


# Reading the data from the movies.dat file
movies_data = spark.read.csv(movies_file_path, sep="::", schema=movies_schema)

# Reading the data from the users.dat file
users_data = spark.read.csv(users_file_path, sep="::",schema=users_schema)

# Reading the data from the ratings.dat files
ratings_data = spark.read.csv("d:/PROJECTS/Movie_Analytics/Data/raw/ratings.dat",sep = "::",schema=ratings_schema)

In [5]:
movies_data.show()

+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Animation|Childre...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|        Comedy|Drama|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)|Adventure|Children's|
|       9| Sudden Death (1995)|              Action|
|      10|    GoldenEye (1995)|Action|Adventure|...|
|      11|American Presiden...|Comedy|Drama|Romance|
|      12|Dracula: Dead and...|       Comedy|Horror|
|      13|        Balto (1995)|Animation|Children's|
|      14|        Nixon (1995)|               Drama|
|      15|Cutthroat Island ...|Action|Adventure|...|
|      16|       Casino (1995)|      Drama|Thr

In [6]:
movies_data.count()

3883

In [7]:
movies_data

DataFrame[movie_id: int, title: string, genres: string]

In [8]:
movies_data.show()

+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Animation|Childre...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|        Comedy|Drama|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)|Adventure|Children's|
|       9| Sudden Death (1995)|              Action|
|      10|    GoldenEye (1995)|Action|Adventure|...|
|      11|American Presiden...|Comedy|Drama|Romance|
|      12|Dracula: Dead and...|       Comedy|Horror|
|      13|        Balto (1995)|Animation|Children's|
|      14|        Nixon (1995)|               Drama|
|      15|Cutthroat Island ...|Action|Adventure|...|
|      16|       Casino (1995)|      Drama|Thr

In [9]:
# Extract the year from the title and create a new column 'year'
movies_data = movies_data.withColumn("release_year", regexp_extract("title", r"\((\d{4})\)", 1))

In [10]:
movies_data.show()

+--------+--------------------+--------------------+------------+
|movie_id|               title|              genres|release_year|
+--------+--------------------+--------------------+------------+
|       1|    Toy Story (1995)|Animation|Childre...|        1995|
|       2|      Jumanji (1995)|Adventure|Childre...|        1995|
|       3|Grumpier Old Men ...|      Comedy|Romance|        1995|
|       4|Waiting to Exhale...|        Comedy|Drama|        1995|
|       5|Father of the Bri...|              Comedy|        1995|
|       6|         Heat (1995)|Action|Crime|Thri...|        1995|
|       7|      Sabrina (1995)|      Comedy|Romance|        1995|
|       8| Tom and Huck (1995)|Adventure|Children's|        1995|
|       9| Sudden Death (1995)|              Action|        1995|
|      10|    GoldenEye (1995)|Action|Adventure|...|        1995|
|      11|American Presiden...|Comedy|Drama|Romance|        1995|
|      12|Dracula: Dead and...|       Comedy|Horror|        1995|
|      13|

In [11]:
# Remove the (year) from the title
movies_data = movies_data.withColumn("title", regexp_replace("title", "\s*\(\d{4}\)\s*", ""))

In [12]:
movies_data.show()

+--------+--------------------+--------------------+------------+
|movie_id|               title|              genres|release_year|
+--------+--------------------+--------------------+------------+
|       1|           Toy Story|Animation|Childre...|        1995|
|       2|             Jumanji|Adventure|Childre...|        1995|
|       3|    Grumpier Old Men|      Comedy|Romance|        1995|
|       4|   Waiting to Exhale|        Comedy|Drama|        1995|
|       5|Father of the Bri...|              Comedy|        1995|
|       6|                Heat|Action|Crime|Thri...|        1995|
|       7|             Sabrina|      Comedy|Romance|        1995|
|       8|        Tom and Huck|Adventure|Children's|        1995|
|       9|        Sudden Death|              Action|        1995|
|      10|           GoldenEye|Action|Adventure|...|        1995|
|      11|American Presiden...|Comedy|Drama|Romance|        1995|
|      12|Dracula: Dead and...|       Comedy|Horror|        1995|
|      13|

In [13]:
# Extract and explode genres into separate rows
movies_data = movies_data.withColumn("genres", explode(split("genres", "\\|")))

In [40]:
movies_data.show()

+--------+--------------------+----------+------------+
|movie_id|               title|    genres|release_year|
+--------+--------------------+----------+------------+
|       1|           Toy Story| Animation|        1995|
|       1|           Toy Story|Children's|        1995|
|       1|           Toy Story|    Comedy|        1995|
|       2|             Jumanji| Adventure|        1995|
|       2|             Jumanji|Children's|        1995|
|       2|             Jumanji|   Fantasy|        1995|
|       3|    Grumpier Old Men|    Comedy|        1995|
|       3|    Grumpier Old Men|   Romance|        1995|
|       4|   Waiting to Exhale|    Comedy|        1995|
|       4|   Waiting to Exhale|     Drama|        1995|
|       5|Father of the Bri...|    Comedy|        1995|
|       6|                Heat|    Action|        1995|
|       6|                Heat|     Crime|        1995|
|       6|                Heat|  Thriller|        1995|
|       7|             Sabrina|    Comedy|      

In [41]:
movies_data

DataFrame[movie_id: int, title: string, genres: string, release_year: string]

In [42]:
# path = "D:/PROJECTS/Movie_Analytics/Data/transformed/"
# movies_data.write.csv(path, header=True, mode="overwrite")

In [43]:
movies_data.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = false)
 |-- release_year: string (nullable = true)



In [49]:
from pyspark.sql.functions import count
count_dis = movies_data.distinct().count()
print(count_dis)

6408


In [50]:
count = movies_data.count()
print(count)

6408


In [51]:
duplicate_count = count - count_dis
print(duplicate_count)

0


In [17]:
movies_data.count()

6408

In [52]:
movies_data.distinct().count()

6408

In [18]:
users_data.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- ZipCode: integer (nullable = true)



In [19]:
users_data

DataFrame[UserID: int, Gender: string, Age: int, Occupation: int, ZipCode: int]

In [20]:
users_data.show()

+------+------+---+----------+-------+
|UserID|Gender|Age|Occupation|ZipCode|
+------+------+---+----------+-------+
|     1|     F|  1|        10|  48067|
|     2|     M| 56|        16|  70072|
|     3|     M| 25|        15|  55117|
|     4|     M| 45|         7|   2460|
|     5|     M| 25|        20|  55455|
|     6|     F| 50|         9|  55117|
|     7|     M| 35|         1|   6810|
|     8|     M| 25|        12|  11413|
|     9|     M| 25|        17|  61614|
|    10|     F| 35|         1|  95370|
|    11|     F| 25|         1|   4093|
|    12|     M| 25|        12|  32793|
|    13|     M| 45|         1|  93304|
|    14|     M| 35|         0|  60126|
|    15|     M| 25|         7|  22903|
|    16|     F| 35|         0|  20670|
|    17|     M| 50|         1|  95350|
|    18|     F| 18|         3|  95825|
|    19|     M|  1|        10|  48073|
|    20|     M| 25|        14|  55113|
+------+------+---+----------+-------+
only showing top 20 rows



In [21]:
ratings_data.show()

+------+-------+------+---------+
|UserID|MovieID|Rating|Timestamp|
+------+-------+------+---------+
|     1|   1193|     5|978300760|
|     1|    661|     3|978302109|
|     1|    914|     3|978301968|
|     1|   3408|     4|978300275|
|     1|   2355|     5|978824291|
|     1|   1197|     3|978302268|
|     1|   1287|     5|978302039|
|     1|   2804|     5|978300719|
|     1|    594|     4|978302268|
|     1|    919|     4|978301368|
|     1|    595|     5|978824268|
|     1|    938|     4|978301752|
|     1|   2398|     4|978302281|
|     1|   2918|     4|978302124|
|     1|   1035|     5|978301753|
|     1|   2791|     4|978302188|
|     1|   2687|     3|978824268|
|     1|   2018|     4|978301777|
|     1|   3105|     5|978301713|
|     1|   2797|     4|978302039|
+------+-------+------+---------+
only showing top 20 rows



In [22]:
ratings_data.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- MovieID: integer (nullable = true)
 |-- Rating: integer (nullable = true)
 |-- Timestamp: integer (nullable = true)



In [23]:
movies_data.createOrReplaceTempView("movies")
users_data.createOrReplaceTempView("users")
ratings_data.createOrReplaceTempView("ratings")


In [24]:
spark.sql("select * from users").show()

+------+------+---+----------+-------+
|UserID|Gender|Age|Occupation|ZipCode|
+------+------+---+----------+-------+
|     1|     F|  1|        10|  48067|
|     2|     M| 56|        16|  70072|
|     3|     M| 25|        15|  55117|
|     4|     M| 45|         7|   2460|
|     5|     M| 25|        20|  55455|
|     6|     F| 50|         9|  55117|
|     7|     M| 35|         1|   6810|
|     8|     M| 25|        12|  11413|
|     9|     M| 25|        17|  61614|
|    10|     F| 35|         1|  95370|
|    11|     F| 25|         1|   4093|
|    12|     M| 25|        12|  32793|
|    13|     M| 45|         1|  93304|
|    14|     M| 35|         0|  60126|
|    15|     M| 25|         7|  22903|
|    16|     F| 35|         0|  20670|
|    17|     M| 50|         1|  95350|
|    18|     F| 18|         3|  95825|
|    19|     M|  1|        10|  48073|
|    20|     M| 25|        14|  55113|
+------+------+---+----------+-------+
only showing top 20 rows



In [25]:
top_viewed_movies = spark.sql("SELECT MovieID, COUNT(*) as Views FROM ratings GROUP BY MovieID ORDER BY Views DESC LIMIT 10")

In [26]:
top_viewed_movies.show()

+-------+-----+
|MovieID|Views|
+-------+-----+
|   2858| 3428|
|    260| 2991|
|   1196| 2990|
|   1210| 2883|
|    480| 2672|
|   2028| 2653|
|    589| 2649|
|   2571| 2590|
|   1270| 2583|
|    593| 2578|
+-------+-----+



In [27]:
distinct_genres = spark.sql('select distinct genres from movies order by genres')

In [28]:
distinct_genres.show()

+-----------+
|     genres|
+-----------+
|     Action|
|  Adventure|
|  Animation|
| Children's|
|     Comedy|
|      Crime|
|Documentary|
|      Drama|
|    Fantasy|
|  Film-Noir|
|     Horror|
|    Musical|
|    Mystery|
|    Romance|
|     Sci-Fi|
|   Thriller|
|        War|
|    Western|
+-----------+



In [29]:
movies_Pereach_genres = spark.sql('select genres,count(title) as num_movies from movies group by genres')

In [30]:
movies_Pereach_genres.show()

+-----------+----------+
|     genres|num_movies|
+-----------+----------+
|      Crime|       211|
|    Romance|       471|
|   Thriller|       492|
|  Adventure|       283|
| Children's|       251|
|      Drama|      1603|
|        War|       143|
|Documentary|       127|
|    Fantasy|        68|
|    Mystery|       106|
|    Musical|       114|
|  Animation|       105|
|  Film-Noir|        44|
|     Horror|       343|
|    Western|        68|
|     Comedy|      1200|
|     Action|       503|
|     Sci-Fi|       276|
+-----------+----------+



In [31]:
latest_released = spark.sql('select distinct title,release_year from movies order by release_year desc')

In [32]:
latest_released.show()

+--------------------+------------+
|               title|release_year|
+--------------------+------------+
|     Mission to Mars|        2000|
|Butterfly (La Len...|        2000|
|Flintstones in Vi...|        2000|
|           Supernova|        2000|
|  Where the Heart Is|        2000|
|   Bittersweet Motel|        2000|
| Rules of Engagement|        2000|
|        Patriot, The|        2000|
| Remember the Titans|        2000|
|                Bait|        2000|
|     Erin Brockovich|        2000|
|         Down to You|        2000|
|Original Kings of...|        2000|
|               U-571|        2000|
|        Running Free|        2000|
| Way of the Gun, The|        2000|
|     Art of War, The|        2000|
|   Replacements, The|        2000|
|            Kid, The|        2000|
|          Bamboozled|        2000|
+--------------------+------------+
only showing top 20 rows



In [33]:
temp = spark.sql('select * from movies where release_year = 1919')

In [34]:
temp.show()

+--------+--------------------+---------+------------+
|movie_id|               title|   genres|release_year|
+--------+--------------------+---------+------------+
|    2821|     Male and Female|Adventure|        1919|
|    2821|     Male and Female|    Drama|        1919|
|    2823|Spiders, The (Die...|   Action|        1919|
|    2823|Spiders, The (Die...|    Drama|        1919|
|    3132|     Daddy Long Legs|   Comedy|        1919|
+--------+--------------------+---------+------------+



In [35]:
movies_released_per_year = spark.sql('select release_year,count(title) as num_movies from movies group by release_year')

In [36]:
movies_released_per_year.show()

+------------+----------+
|release_year|num_movies|
+------------+----------+
|        1953|        27|
|        1957|        30|
|        1987|       128|
|        1956|        27|
|        1936|         8|
|        1958|        39|
|        1943|        17|
|        1972|        31|
|        1931|        11|
|        1988|       114|
|        1938|        11|
|        1926|         9|
|        1932|        11|
|        1977|        42|
|        1971|        42|
|        1984|       109|
|        1982|        83|
|        1941|        18|
|        1919|         5|
|        2000|       247|
+------------+----------+
only showing top 20 rows



In [37]:
movies_Pereach_genres.show()

+-----------+----------+
|     genres|num_movies|
+-----------+----------+
|      Crime|       211|
|    Romance|       471|
|   Thriller|       492|
|  Adventure|       283|
| Children's|       251|
|      Drama|      1603|
|        War|       143|
|Documentary|       127|
|    Fantasy|        68|
|    Mystery|       106|
|    Musical|       114|
|  Animation|       105|
|  Film-Noir|        44|
|     Horror|       343|
|    Western|        68|
|     Comedy|      1200|
|     Action|       503|
|     Sci-Fi|       276|
+-----------+----------+

