# Extract necessary DataFrames

In [8]:
from pyspark.sql import Window
import pyspark.sql.functions as F
from main import process_imdb_data
from pyspark.sql import DataFrame
from pyspark.sql.functions import desc

In [9]:
# Load the IMDb tables via the project helper imported from `main`.
# The result is indexed by table name in the next cell
# ("basics", "ratings", "akas", "principals", "name").
dataframes = process_imdb_data()

In [None]:
# Split the title basics into the two title types analysed below.
videogames, tv_episodes = (
    dataframes["basics"].where(F.col("titleType") == title_type)
    for title_type in ("videoGame", "tvEpisode")
)

# The remaining tables are used exactly as loaded.
ratings, akas, principals, names = (
    dataframes[table_key] for table_key in ("ratings", "akas", "principals", "name")
)

# Export results

In [None]:
def export_result(df: DataFrame, result_path: str, show_rows: int = 20, title: str = None,
                  partition_column: str = None, num_partitions: int = None) -> None:
    """Preview a result DataFrame and write it to ``result_path`` as CSV with a header.

    The frame is shown, its row count is printed, array columns are flattened to
    comma-separated strings (the CSV writer cannot store arrays), and the data is
    written with ``mode="overwrite"``.

    Note: ``show``, ``count`` and the final write are separate Spark actions, so an
    uncached ``df`` is evaluated once per action. Cache it before calling if the
    computation is expensive.

    Args:
        df: Result to preview and export.
        result_path: Output location passed to the CSV writer.
        show_rows: Number of rows to print in the preview.
        title: Label printed above the preview; ``result_path`` is used when omitted.
        partition_column: If given and present in ``df``, write one directory per
            distinct value of this column (takes precedence over ``num_partitions``).
        num_partitions: If positive, repartition to exactly this many partitions
            before writing. Otherwise a partition count is estimated from the row count.
    """
    # Avoid printing the literal "None" when no title was supplied.
    print(f"Results for: {title if title is not None else result_path}")
    df.show(show_rows, truncate=False)
    total_records = df.count()
    print(f"Total number of records in the result: {total_records}")

    for col_name, col_type in df.dtypes:
        if col_type.startswith("array"):
            # concat_ws only accepts strings / arrays of strings, so cast the
            # elements first to keep non-string arrays exportable as well.
            flattened = F.concat_ws(", ", F.col(col_name).cast("array<string>"))
            df = df.withColumn(col_name, flattened)

    if partition_column and partition_column in df.columns:
        print(f"Exporting results with partitioning by column: {partition_column}")
        writer = df.write.partitionBy(partition_column)
    else:
        if partition_column:
            # Previously ignored silently; tell the user why no partitioning by column happens.
            print(f"Column '{partition_column}' not found in the result; ignoring partition_column")

        if num_partitions and num_partitions > 0:
            print(f"Exporting results with {num_partitions} partitions")
            partition_count = num_partitions
        else:
            # Rough size heuristic: assume ~100 bytes per cell and aim for ~128 MB per partition.
            target_size_mb = 128
            assumed_bytes_per_cell = 100
            estimated_size_bytes = total_records * len(df.columns) * assumed_bytes_per_cell
            estimated_size_mb = estimated_size_bytes / (1024 * 1024)
            partition_count = max(1, int(estimated_size_mb / target_size_mb))
            print(f"Exporting results with automatically determined {partition_count} partitions")

        writer = df.repartition(partition_count).write

    writer.option("header", "true").csv(result_path, mode="overwrite")

# Which videos or episodes have Ukrainian localization or mention 'Ukraine' in the title and have the highest rating (above 7,5)?

## Які відео або епізоди мають українську локалізацію або згадку "Ukraine" в назві і мають найвищий рейтинг (більше 7,5)?



In [14]:
def highest_rated_ukraine_titles(videogames: DataFrame, tv_episodes: DataFrame, ratings: DataFrame) -> DataFrame:
    """Return well-rated video games and TV episodes whose title mentions "Ukraine".

    A title qualifies when ``primaryTitle`` or ``originalTitle`` contains the
    substring "Ukraine", its ``averageRating`` is at least 7.5 and it has at
    least 20 votes. Only the title text is inspected; no localization data is used.

    Args:
        videogames: Video game titles (needs ``tconst``, ``primaryTitle``, ``originalTitle``).
        tv_episodes: TV episode titles, combined with ``videogames`` via ``union``.
        ratings: Ratings keyed by ``tconst``.

    Returns:
        ``originalTitle``, ``averageRating``, ``numVotes`` and ``startYear``,
        sorted by ``averageRating`` descending.
    """
    # One predicate, applied to both inputs.
    mentions_ukraine = (
        F.col("primaryTitle").contains("Ukraine") | F.col("originalTitle").contains("Ukraine")
    )
    matching_games = videogames.filter(mentions_ukraine)
    matching_episodes = tv_episodes.filter(mentions_ukraine)
    ukraine_titles = matching_games.union(matching_episodes)

    rated = ukraine_titles.join(ratings, ukraine_titles["tconst"] == ratings["tconst"])
    well_rated = rated.filter((F.col("averageRating") >= 7.5) & (F.col("numVotes") >= 20))
    report = well_rated.select("originalTitle", "averageRating", "numVotes", "startYear")

    return report.orderBy(F.desc("averageRating"))


In [15]:
# Build the report, then preview it and write it out as CSV.
highest_rated_ukraine = highest_rated_ukraine_titles(videogames, tv_episodes, ratings)
export_result(
    highest_rated_ukraine,
    "/data/results/highest_rated_ukraine_titles.csv",
    title="Highest rated titles from Ukraine",
)

Results for: Highest rated titles from Ukraine
+-------------------------------------------------+-------------+--------+---------+
|originalTitle                                    |averageRating|numVotes|startYear|
+-------------------------------------------------+-------------+--------+---------+
|Ukraine                                          |8.4          |63      |2016     |
|Trip Through Ukraine In Compact Hatchbacks       |8.3          |496     |2014     |
|Moissons sanglantes:1933, la famine en Ukraine   |8.0          |36      |2023     |
|Ukraine: Les masques de la révolution            |8.0          |60      |2016     |
|Putin's Attack on Ukraine: Documenting War Crimes|7.8          |79      |2022     |
|Group C: Netherlands vs Ukraine                  |7.7          |41      |2021     |
|Ukraine                                          |7.7          |72      |2011     |
|Ukraine: Life Under Russia's Attack              |7.6          |57      |2023     |
|The Battle for Uk

# Which video game genres are the most popular by the number of released games in the last 5 years, and what is the most representative title for each genre?

## Які жанри відеоігор є найпопулярнішими за кількістю випущених ігор за останні 5 років, і яка назва є найбільш репрезентативною для кожного жанру?

In [16]:
def most_popular_genres_by_count(videogames: DataFrame, current_year: int = 2025,
                                 years_back: int = 5, top_n: int = 20) -> DataFrame:
    """Count recently released video games per genre and pick one sample title for each.

    Args:
        videogames: Video game titles with ``startYear``, ``genres`` and ``primaryTitle``.
        current_year: Reference year for the window (default keeps the previous
            hard-coded value of 2025).
        years_back: Size of the look-back window; games with
            ``startYear >= current_year - years_back`` are kept.
        top_n: Maximum number of genres returned.

    Returns:
        ``genre``, ``count`` and ``mostFrequentTitle``, sorted by ``count``
        descending and limited to ``top_n`` rows.
    """
    # A null startYear makes the comparison null, so such rows are dropped by
    # the filter without an explicit isNotNull() check.
    recent_games = videogames.filter(F.col("startYear") >= current_year - years_back)

    # One row per (game, genre); assumes `genres` is an array column.
    games_by_genre = recent_games.withColumn("genre", F.explode(F.col("genres")))

    # Single aggregation pass instead of two groupBy() calls joined back together.
    # NOTE(review): F.first() returns an arbitrary title of the group (its result
    # depends on row order), so "mostFrequentTitle" is a sample title, not a
    # frequency-based or otherwise ranked choice.
    genre_summary = games_by_genre.groupBy("genre").agg(
        F.count("*").alias("count"),
        F.first("primaryTitle").alias("mostFrequentTitle"),
    )

    return genre_summary.orderBy(F.desc("count")).limit(top_n)


In [17]:
# Build the report, then preview it and write it out as CSV.
most_popular_video_games_genres = most_popular_genres_by_count(videogames)
export_result(
    most_popular_video_games_genres,
    "/data/results/most_popular_video_games_genres.csv",
    title="Most popular video games genres by count",
)

Results for: Most popular video games genres by count
+---------+-----+--------------------------------------+
|genre    |count|mostFrequentTitle                     |
+---------+-----+--------------------------------------+
|Adventure|2656 |Croc: Legend of the Gobbos            |
|Action   |2645 |Croc: Legend of the Gobbos            |
|Fantasy  |1086 |Vampire: The Masquerade - Bloodlines 2|
|Horror   |957  |Gamer Girl                            |
|Sci-Fi   |559  |The Complex                           |
|Comedy   |512  |Lego Star Wars: The Skywalker Saga    |
|Mystery  |351  |Twin Mirror                           |
|Animation|319  |Love, Money, Rock 'n' Roll.           |
|Drama    |283  |Vampire: The Masquerade - Bloodlines 2|
|Romance  |268  |Love, Money, Rock 'n' Roll.           |
|Sport    |231  |MLB: The Show 20                      |
|Family   |228  |Croc: Legend of the Gobbos            |
|Adult    |193  |Dezyred - Double Trouble              |
|Crime    |175  |The Legend of Blu

# Which video games with the highest ratings (above 7,5) were released after 2015 and have more than 10000 votes?

## Які відеоігри з найвищим рейтингом (більше 7,5) вийшли після 2015 року та мають понад 10000 голосів?

In [18]:
def top_recent_videogames(videogames: DataFrame, ratings: DataFrame) -> DataFrame:
    """Return highly rated, widely voted video games from 2015 onwards.

    Keeps games with ``startYear >= 2015``, ``averageRating >= 7.5`` and
    ``numVotes >= 10000`` (all bounds inclusive).

    Args:
        videogames: Video game titles with ``tconst``, ``primaryTitle`` and ``startYear``.
        ratings: Ratings with ``tconst``, ``averageRating`` and ``numVotes``.

    Returns:
        ``primaryTitle``, ``startYear``, ``averageRating`` and ``numVotes``,
        sorted by rating and then by vote count, both descending.
    """
    released_recently = videogames["startYear"] >= 2015
    highly_rated = ratings["averageRating"] >= 7.5
    widely_voted = ratings["numVotes"] >= 10000

    rated_games = videogames.join(ratings, "tconst")
    shortlisted = rated_games.filter(released_recently & highly_rated & widely_voted)
    report = shortlisted.select(
        videogames["primaryTitle"],
        videogames["startYear"],
        ratings["averageRating"],
        ratings["numVotes"],
    )

    return report.orderBy(F.desc("averageRating"), F.desc("numVotes"))


In [19]:
# Build the report, then preview it and write it out as CSV.
top_recent_games = top_recent_videogames(videogames, ratings)
export_result(
    top_recent_games,
    "/data/results/top_recent_games.csv",
    title="Top-rated video games released after 2015 with more than 10000 votes",
)

Results for: Top-rated video games released after 2015 with more than 10000 votes
+---------------------------------------+---------+-------------+--------+
|primaryTitle                           |startYear|averageRating|numVotes|
+---------------------------------------+---------+-------------+--------+
|Red Dead Redemption II                 |2018     |9.7          |65546   |
|The Last of Us: Part I                 |2022     |9.7          |10470   |
|The Witcher 3: Wild Hunt               |2015     |9.6          |36509   |
|God of War: Ragnarök                   |2022     |9.6          |21299   |
|God of War                             |2018     |9.5          |41438   |
|Uncharted 4: A Thief's End             |2016     |9.4          |39360   |
|Elden Ring                             |2022     |9.4          |12974   |
|The Legend of Zelda: Breath of the Wild|2017     |9.4          |11322   |
|Spider-Man                             |2018     |9.2          |32857   |
|Ghost of Tsushima

# Which video games have the most localizations (adaptations for different countries)?
## Які відеоігри мають найбільшу кількість локалізацій (адаптацій для різних країн)?

In [20]:
def most_localized_videogames(videogames: DataFrame, akas: DataFrame) -> DataFrame:
    """Rank video games by the number of distinct regions in their alternative titles.

    Args:
        videogames: Titles with ``tconst``, ``primaryTitle`` and ``titleType``.
        akas: Alternative titles with ``titleId``, ``language`` and ``region``.

    Returns:
        ``primaryTitle`` and ``localization_count`` (distinct non-null regions),
        one row per game, sorted by ``localization_count`` descending.
    """
    localized_games = videogames.join(
        akas, videogames["tconst"] == akas["titleId"], "inner"
    ).filter(
        # NOTE(review): rows whose language is null also fail this comparison and
        # are dropped, not only those marked "und" -- confirm this is intended.
        (F.col("language") != "und") &
        (F.col("titleType") == "videoGame")
    )

    # Group per game (tconst), not per title text: different games can share a
    # primaryTitle, and grouping by the title alone would merge their regions
    # into a single inflated count.
    per_game = localized_games.groupBy(
        videogames["tconst"], videogames["primaryTitle"]
    ).agg(
        F.countDistinct("region").alias("localization_count")
    )

    # Drop the grouping key so the output keeps its original two columns.
    return per_game.select("primaryTitle", "localization_count") \
        .orderBy(F.desc("localization_count"))


In [21]:
# Build the report, then preview it and write it out as CSV.
localized_games = most_localized_videogames(videogames, akas)
export_result(
    localized_games,
    "/data/results/localized_games.csv",
    title="Most localized video games",
)

Results for: Most localized video games
+--------------------------------------------------------------+------------------+
|primaryTitle                                                  |localization_count|
+--------------------------------------------------------------+------------------+
|Atelier Yumia: The Alchemist of Memories & the Envisioned Land|17                |
|Monkey King: Hero Is Back                                     |16                |
|SilverFin                                                     |11                |
|Paradigm Paradox                                              |11                |
|Pokémon Legends Z-A                                           |11                |
|Max Payne                                                     |9                 |
|James Bond 007: Goldfinger                                    |9                 |
|James Bond 007: A View to a Kill                              |8                 |
|Sword Art Online: Hollow Fragment  

# Which video games have the highest rating growth in the last year?
## Які відеоігри з найбільшим приростом рейтингу в останньому році?

In [22]:
def top_video_games_by_rating_growth(videogames: DataFrame, ratings: DataFrame) -> DataFrame:
    """Rank video games by how much their rating exceeds that of the previous same-titled entry.

    Rows are grouped by ``primaryTitle`` and ordered by ``startYear``; each row's
    ``averageRating`` is compared with the rating of the preceding row in that
    group. "Growth" is therefore the difference between two entries sharing a
    title, not a change of one entry's rating over time. Rows without a
    predecessor have a null growth and are filtered out.

    Args:
        videogames: Video game titles with ``tconst``, ``primaryTitle`` and ``startYear``.
        ratings: Ratings with ``tconst``, ``averageRating`` and ``numVotes``.

    Returns:
        ``primaryTitle``, ``startYear``, ``averageRating``, ``numVotes`` and
        ``rating_growth`` for rows with positive growth and at least 500 votes,
        sorted by ``rating_growth`` descending.
    """
    same_title_by_year = Window.partitionBy("primaryTitle").orderBy("startYear")
    earlier_rating = F.lag("averageRating").over(same_title_by_year)

    rated_games = videogames.join(ratings, videogames["tconst"] == ratings["tconst"])
    with_growth = (
        rated_games
        .withColumn("previous_rating", earlier_rating)
        .withColumn("rating_growth", F.round(F.col("averageRating") - F.col("previous_rating"), 2))
    )

    improved = with_growth.filter((F.col("rating_growth") > 0) & (F.col("numVotes") >= 500))
    report = improved.select("primaryTitle", "startYear", "averageRating", "numVotes", "rating_growth")

    return report.orderBy(F.desc("rating_growth"))


In [23]:
# Build the report, then preview it and write it out as CSV.
top_rating_growth_games = top_video_games_by_rating_growth(videogames, ratings)
export_result(
    top_rating_growth_games,
    "/data/results/top_rating_growth_games.csv",
    title="Top video games by rating growth",
)

Results for: Top video games by rating growth
+-----------------------------------+---------+-------------+--------+-------------+
|primaryTitle                       |startYear|averageRating|numVotes|rating_growth|
+-----------------------------------+---------+-------------+--------+-------------+
|God of War                         |2018     |9.5          |41438   |3.2          |
|Journey                            |2012     |8.5          |4443    |2.7          |
|Perfect Dark                       |2000     |8.6          |1705    |2.7          |
|Neverwinter Nights                 |2002     |7.8          |1175    |2.6          |
|Mortal Kombat 4                    |1997     |6.9          |1646    |2.1          |
|Mad Max                            |2015     |7.9          |4416    |2.0          |
|Spider-Man                         |2018     |9.2          |32857   |2.0          |
|Batman                             |1989     |7.4          |531     |1.8          |
|Spider-Man 2      

# Which actors received the most roles in the highest-rated TV episodes in a year?
## Які актори отримали найбільше ролей за рік у найбільш рейтингових ТВ-епізодах?

In [10]:
def top_actors_in_highest_rated_tv_episodes(tv_episodes: DataFrame, principals: DataFrame, ratings: DataFrame,
                                            names: DataFrame, categories: tuple = ("actor",)) -> DataFrame:
    """Count, per person and year, appearances in that year's top-rated TV episodes.

    For every ``startYear`` the episodes tied for the highest ``averageRating``
    (dense rank 1) are selected; credits in the requested categories are then
    counted per person and year.

    Args:
        tv_episodes: Episode titles with ``tconst`` and ``startYear``.
        principals: Credits with ``tconst``, ``nconst`` and ``category``.
        ratings: Ratings with ``tconst`` and ``averageRating``.
        names: People with ``nconst`` and ``primaryName``.
        categories: Credit categories to count. Defaults to ``("actor",)``, the
            previously hard-coded value. NOTE(review): actresses are presumably
            stored under a separate ``"actress"`` category -- verify against the
            data and pass ``("actor", "actress")`` to include them.

    Returns:
        ``primaryName``, ``startYear``, ``roleCount`` and ``avgRating``, sorted by
        ``roleCount`` and then ``avgRating``, both descending.
    """
    # Accept a single category passed as a plain string.
    if isinstance(categories, str):
        categories = (categories,)

    tv_with_ratings = tv_episodes.join(ratings, tv_episodes["tconst"] == ratings["tconst"], "inner") \
        .select(tv_episodes["tconst"].alias("tv_tconst"), tv_episodes["startYear"], ratings["averageRating"])

    # dense_rank keeps every episode tied for the best rating of its year.
    best_first_per_year = Window.partitionBy("startYear").orderBy(F.desc("averageRating"))
    tv_with_ratings = tv_with_ratings.withColumn("rank", F.dense_rank().over(best_first_per_year))
    top_episodes = tv_with_ratings.filter(F.col("rank") == 1)

    top_actors = top_episodes.join(principals, top_episodes["tv_tconst"] == principals["tconst"], "inner") \
        .select(principals["nconst"].alias("actor_nconst"), principals["category"],
                top_episodes["averageRating"], top_episodes["startYear"])
    top_actors = top_actors.filter(F.col("category").isin(*categories))

    top_actors_with_names = top_actors.join(names, top_actors["actor_nconst"] == names["nconst"], "inner") \
        .select(names["primaryName"], top_actors["actor_nconst"], "averageRating", top_actors["startYear"])

    # Group by the person id as well as the name: two different people can share
    # a primaryName, and grouping by name alone would add their roles together.
    actor_role_count = top_actors_with_names.groupBy("actor_nconst", "primaryName", "startYear") \
        .agg(F.count("actor_nconst").alias("roleCount"), F.avg("averageRating").alias("avgRating")) \
        .orderBy(F.desc("roleCount"), F.desc("avgRating")) \
        .select("primaryName", "startYear", "roleCount", "avgRating")

    return actor_role_count


In [13]:
# Build the report, then preview it and write it out as CSV.
top_actors_highest_rated = top_actors_in_highest_rated_tv_episodes(tv_episodes, principals, ratings, names)
export_result(
    top_actors_highest_rated,
    "/data/results/top_actors_highest_rated_tv_episodes.csv",
    title="Top actors in highest rated TV episodes with names and ratings",
)

Results for: Top actors in highest rated TV episodes with names and ratings
+-------------------+---------+---------+---------+
|primaryName        |startYear|roleCount|avgRating|
+-------------------+---------+---------+---------+
|Maurice Carpede    |2001     |254      |10.0     |
|Maurice Carpede    |2000     |210      |10.0     |
|Tyrone Keogh       |2012     |160      |10.0     |
|Tyrone Keogh       |2011     |148      |10.0     |
|Adem Bal           |2024     |119      |10.0     |
|Eduardo Yáñez      |2013     |85       |10.0     |
|Sebastián Rulli    |2013     |84       |10.0     |
|Guillermo Capetillo|2013     |83       |10.0     |
|Francisco Gattorno |2013     |79       |10.0     |
|Enrique Rocha      |2013     |79       |10.0     |
|Tarik Tiryakioglu  |2024     |79       |10.0     |
|Ayaz Ahmed         |2014     |75       |10.0     |
|Dishank Arora      |2014     |75       |10.0     |
|Pulkit Bangia      |2014     |75       |10.0     |
|Utkarsh Gupta      |2014     |75       