In [62]:
import sys
import os

sys.path.insert(0, os.path.dirname(os.getcwd()))

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, countDistinct, row_number, mean, count
from pyspark.sql.window import Window

from schemas.dataframes import get_basics_df, get_crew_df, get_principals_df, get_ratings_df, get_name_df

In [63]:
spark_session = (SparkSession.builder
                             .master('local')
                             .appName('test app')
                             .config(conf=SparkConf())
                             .getOrCreate())

In [64]:
crew = get_crew_df(spark_session)
basics = get_basics_df(spark_session)
ratings = get_ratings_df(spark_session)
principals = get_principals_df(spark_session)
name = get_name_df(spark_session)

In [65]:
# Director with most movies:
# Who is the director with the most movies, and how many movies have they directed? (Excluding movies with missing values)
result = (
    crew.alias("c").join(basics.alias("b"), col("c.tconst") == col("b.tconst"))
    .filter((col("c.directors") != '\\N') & (col("b.tconst") != '\\N'))
    .groupBy("c.directors")
    .agg(countDistinct("b.tconst").alias("movie_count"))
    .orderBy(desc("movie_count"))
    .limit(1)
)
result.show()


+---------+-----------+
|directors|movie_count|
+---------+-----------+
|nm1203430|      11410|
+---------+-----------+



In [66]:
# Top-3 movies for each genre:
# What are the top three movies in each genre with the highest average ratings, excluding movies with missing values?
result = (
    basics.join(ratings, basics.tconst == ratings.tconst)
    .filter((basics.genres.isNotNull()) & (ratings.averageRating.isNotNull()))
    .groupBy('genres', 'primaryTitle')
    .agg(mean('averageRating').alias('average_rating'))
    .withColumn(
        'rank',
        row_number().over(Window.partitionBy('genres').orderBy(desc('average_rating')))
    )
    .filter(col('rank') <= 3)
)

result.show()


+--------------------+--------------------+-----------------+----+
|              genres|        primaryTitle|   average_rating|rank|
+--------------------+--------------------+-----------------+----+
|              Action|Psychology of Con...|             10.0|   1|
|              Action|      Phoenix Harris|             10.0|   2|
|              Action|In a Dying World ...|             10.0|   3|
|        Action,Adult|        Tourist Trap|8.300000190734863|   1|
|        Action,Adult|Actiongirls.com V...|7.900000095367432|   2|
|        Action,Adult|             Racer X|7.599999904632568|   3|
|Action,Adult,Adve...|San Fernando Jone...|              8.0|   1|
|Action,Adult,Adve...|Laura Crotch, Tom...|7.900000095367432|   2|
|Action,Adult,Adve...|       Kemo Coliseum|7.800000190734863|   3|
|Action,Adult,Anim...|        Kansen Sodom|9.100000381469727|   1|
|Action,Adult,Anim...| Angel Blade Punish!|7.400000095367432|   2|
|Action,Adult,Anim...| Shin Injuu Gakuen 2|6.699999809265137| 

In [67]:
# Popular Genres Over Time
# How has the popularity of different genres changed over the years? Identify the top genre each year.

result = (
    basics.filter((basics.genres.isNotNull()) & (basics.startYear.isNotNull()))
    .groupBy('startYear', 'genres')
    .agg(count('tconst').alias('movie_count'))
    .withColumn(
        'rank',
        row_number().over(Window.partitionBy('startYear').orderBy(desc('movie_count')))
    )
    .filter(col('rank') == 1)
    .orderBy(desc('startYear'))
)

result.show()

+---------+--------------------+-----------+----+
|startYear|              genres|movie_count|rank|
+---------+--------------------+-----------+----+
|     2031|Action,Adventure,...|          1|   1|
|     2030|               Drama|          8|   1|
|     2029|      News,Talk-Show|          6|   1|
|     2028|Adventure,Drama,S...|          1|   1|
|     2027|       Drama,Romance|         11|   1|
|     2026|               Drama|         46|   1|
|     2025|               Drama|         69|   1|
|     2024|               Drama|        790|   1|
|     2023|               Drama|      43943|   1|
|     2022|               Drama|      64477|   1|
|     2021|               Drama|      52401|   1|
|     2020|           Talk-Show|      36871|   1|
|     2019|               Drama|      35374|   1|
|     2018|               Drama|      33745|   1|
|     2017|              Comedy|      34580|   1|
|     2016|              Comedy|      34352|   1|
|     2015|              Comedy|      31661|   1|


In [68]:
# Actor's Most Frequent Job
# For each actor, what is the job they most frequently perform in movies?

result = (
    principals.filter((principals.job != '\\N') & (principals.tconst != '\\N'))
    .groupBy('nconst', 'job')
    .agg(count('tconst').alias('job_count'))
    .withColumn(
        'rank',
        row_number().over(Window.partitionBy('nconst').orderBy(desc('job_count')))
    )
    .filter(col('rank') == 1)
)

result.show()

+---------+--------------------+---------+----+
|   nconst|                 job|job_count|rank|
+---------+--------------------+---------+----+
|nm0000004|              writer|        6|   1|
|nm0000005|            producer|        6|   1|
|nm0000009|                book|        1|   1|
|nm0000016|            composer|        5|   1|
|nm0000018|            producer|       40|   1|
|nm0000019|          screenplay|        9|   1|
|nm0000024|          adaptation|        1|   1|
|nm0000025|            composer|        1|   1|
|nm0000033|      motion picture|       10|   1|
|nm0000036|       autobiography|       12|   1|
|nm0000037|            producer|        2|   1|
|nm0000040|            story by|        1|   1|
|nm0000041|film "Shichinin n...|       26|   1|
|nm0000045|based on the writ...|       30|   1|
|nm0000050|               story|        1|   1|
|nm0000051|               story|        1|   1|
|nm0000054|     personal papers|        1|   1|
|nm0000056|             creator|        

In [69]:
# Movies with Highest Votes
# What are the top three movies with the highest number of votes?

result = (
    ratings.filter((ratings.numVotes.isNotNull()) & (ratings.tconst.isNotNull()))
    .orderBy(desc('numVotes'))
    .limit(3)
)

result.show()

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0111161|          9.3| 2817695|
|tt0468569|          9.0| 2799219|
|tt1375666|          8.8| 2484507|
+---------+-------------+--------+



In [70]:
# Actor's Known Titles
# For each actor, list the titles they are known for, along with the average ratings of those titles.

result = (
    name.alias("n").join(principals.alias("p"), col("n.nconst") == col("p.nconst"))
    .join(ratings.alias("r"), col("p.tconst") == col("r.tconst"))
    .filter((col("n.knownForTitles").isNotNull()) & (col("r.averageRating").isNotNull()))
    .groupBy('n.nconst', 'n.primaryName', 'n.knownForTitles')
    .agg(mean('r.averageRating').alias('average_rating'))
)

result.show()

+---------+--------------------+--------------------+------------------+
|   nconst|         primaryName|      knownForTitles|    average_rating|
+---------+--------------------+--------------------+------------------+
|nm0481248|      Selma Lagerlöf|tt0010298,tt00164...|6.5023529445423796|
|nm0707803|      Esther Ralston|tt0015224,tt00134...| 5.802439032531366|
|nm0007216|       Nancy Carroll|tt0023028,tt00253...| 6.326470641528859|
|nm0111867|    Arthur A. Brooks|tt0030164,tt01837...| 5.305882362758412|
|nm0942639|      Maurice Wright|tt0187669,tt00327...| 5.790740728378296|
|nm0567223|     Paul McCullough|tt0019814,tt03866...| 6.039999985694886|
|nm0615907|    Benito Mussolini|tt0026168,tt00197...|6.9733333150545755|
|nm0001336|          Van Heflin|tt0043938,tt00408...| 6.875000028049245|
|nm0950958|     Victor Sen Yung|tt0032819,tt00323...| 6.824137917880354|
|nm0485943|         Walter Lang|tt0019211,tt00494...|6.3086206912994385|
|nm0555939|        Gloria Marín|tt0235498,tt01348..