# Extract necessary DataFrames

In [None]:
from pyspark.sql import Window
import pyspark.sql.functions as F
from main import process_imdb_data
from pyspark.sql import DataFrame
from pyspark.sql.functions import desc

In [None]:
dataframes = process_imdb_data()

In [11]:
videogames = dataframes["basics"].filter(F.col("titleType") == "videoGame")
tv_episodes = dataframes["basics"].filter(F.col("titleType") == "tvEpisode")
ratings = dataframes["ratings"]
akas = dataframes["akas"]
principals = dataframes["principals"]
names = dataframes["name"]

# Which videos or episodes have Ukrainian localization or mention 'Ukraine' in the title and have the highest rating (above 7,5)?

## Які відео або епізоди мають українську локалізацію або згадку "Ukraine" в назві і мають найвищий рейтинг(білше 1,5)?



In [16]:
def highest_rated_ukraine_titles(videogames: DataFrame, tv_episodes: DataFrame) -> DataFrame:
    ukraine_titles = videogames.filter(
        F.col("primaryTitle").contains("Ukraine") | F.col("originalTitle").contains("Ukraine")
    ).union(
        tv_episodes.filter(
            F.col("primaryTitle").contains("Ukraine") | F.col("originalTitle").contains("Ukraine")
        )
    )

    highest_rated_ukraine = ukraine_titles.join(
        ratings, ukraine_titles["tconst"] == ratings["tconst"]
    ).filter(
        (F.col("averageRating") >= 7.5) & (F.col("numVotes") >= 20)
    ).select(
        "originalTitle", "averageRating", "numVotes", "startYear"
    ).orderBy(F.desc("averageRating"))

    return highest_rated_ukraine

In [17]:
print("Highest rated titles from Ukraine")

highest_rated_ukraine = highest_rated_ukraine_titles(videogames, tv_episodes)
highest_rated_ukraine.show(truncate=False)

total_records = highest_rated_ukraine.count()
print(f"Загальна кількість записів у відповіді: {total_records}")

Highest rated titles from Ukraine
+------------------------------------------------------------------------+-------------+---------+
|originalTitle                                                           |averageRating|startYear|
+------------------------------------------------------------------------+-------------+---------+
|Hero City Mariupol 10: Ukraine's Gloria Stefania of AzovStal            |9.7          |2022     |
|Kadyrovtsy Last Trip to Ukraine                                         |9.7          |2022     |
|Ukraine Victorious 27: Kharkiv Champion Iryna Dekha                     |9.7          |2023     |
|Ukraine Victorious 24: Bakhmut Star Mariia Yefremova                    |9.7          |2023     |
|Ukraine Victorious 20: Top of the World Wrestlers                       |9.7          |2022     |
|Ukraine The Victorious 29: Football Ladies EURO Finalists               |9.7          |2023     |
|Bombs as Russia's Killing Culture 2: Deep Wounds on Ukraine's Body      |9

# Which video game genres are the most popular by the number of released games in the last 5 years, and what is the most representative title for each genre?

## Які жанри відеоігор є найпопулярнішими за кількістю випущених ігор за останні 5 років, і яка назва є найбільш репрезентативною для кожного жанру?

In [18]:
def most_popular_genres_by_count(videogames: DataFrame) -> DataFrame:
    current_year = 2025
    videogames_last_5_years = videogames.filter(
        (F.col("startYear") >= current_year - 5) & (F.col("startYear").isNotNull())
    )

    videogames_last_5_years = videogames_last_5_years.withColumn(
        "genre", F.explode(F.col("genres"))
    )

    genre_counts = videogames_last_5_years.groupBy("genre").count()

    most_frequent_title_in_genre = videogames_last_5_years.groupBy("genre").agg(
        F.first("primaryTitle").alias("mostFrequentTitle"))

    genre_summary = genre_counts.join(
        most_frequent_title_in_genre, "genre", "left"
    )

    top_genres_by_count = genre_summary.orderBy(desc("count")).limit(20)

    return top_genres_by_count


In [19]:
print("Most popular video games genres by count")

most_popular_video_games_genres = most_popular_genres_by_count(videogames)
most_popular_video_games_genres.show(truncate=False)

total_records = most_popular_video_games_genres.count()
print(f"Загальна кількість записів у відповіді: {total_records}")

Most popular video games genres by count
+---------+-----+--------------------------------------+
|genre    |count|mostFrequentTitle                     |
+---------+-----+--------------------------------------+
|Adventure|2651 |Croc: Legend of the Gobbos            |
|Action   |2641 |Croc: Legend of the Gobbos            |
|Fantasy  |1084 |Vampire: The Masquerade - Bloodlines 2|
|Horror   |957  |Gamer Girl                            |
|Sci-Fi   |555  |The Complex                           |
|Comedy   |509  |Lego Star Wars: The Skywalker Saga    |
|Mystery  |349  |Twin Mirror                           |
|Animation|318  |Love, Money, Rock 'n' Roll.           |
|Drama    |284  |Vampire: The Masquerade - Bloodlines 2|
|Romance  |267  |Love, Money, Rock 'n' Roll.           |
|Sport    |230  |MLB: The Show 20                      |
|Family   |228  |Croc: Legend of the Gobbos            |
|Adult    |192  |Dezyred - Double Trouble              |
|Crime    |175  |The Legend of Blue Jacket & Re

# Which video games with the highest ratings (above 7,5) were released after 2015 and have more than 10000 votes?

## Які відеоігри з найвищим рейтингом (більше 7,5) вийшли після 2015 року та мають понад 10000 голосів?

In [20]:
def top_recent_videogames(videogames: DataFrame, ratings: DataFrame) -> DataFrame:
    recent_videogames = videogames.join(ratings, "tconst") \
        .filter(
            (videogames["startYear"] >= 2015) &
            (ratings["averageRating"] >= 7.5) &
            (ratings["numVotes"] >= 10000)
        ) \
        .select(
            videogames["primaryTitle"],
            videogames["startYear"],
            ratings["averageRating"],
            ratings["numVotes"]
        )

    top_recent = recent_videogames.orderBy(F.desc("averageRating"), F.desc("numVotes"))

    return top_recent


In [21]:
print("Top-rated video games released after 2015 with more than 10000 votes")

top_recent_games = top_recent_videogames(videogames, ratings)
top_recent_games.show(truncate=False)

total_recent = top_recent_games.count()
print(f"Загальна кількість записів у відповіді: {total_recent}")


Top-rated video games released after 2015 with more than 5000 votes
+---------------------------------------+---------+-------------+--------+
|primaryTitle                           |startYear|averageRating|numVotes|
+---------------------------------------+---------+-------------+--------+
|Red Dead Redemption II                 |2018     |9.7          |65236   |
|The Last of Us: Part I                 |2022     |9.7          |10384   |
|The Witcher 3: Wild Hunt               |2015     |9.6          |36464   |
|God of War: Ragnarök                   |2022     |9.6          |21222   |
|Baldur's Gate III                      |2023     |9.6          |7413    |
|God of War                             |2018     |9.5          |41364   |
|Uncharted 4: A Thief's End             |2016     |9.4          |39295   |
|Elden Ring                             |2022     |9.4          |12920   |
|The Legend of Zelda: Breath of the Wild|2017     |9.4          |11295   |
|Bloodborne                     

# Which video games have the most localizations (adaptations for different countries)?
## Які відеоігри мають найбільшу кількість локалізацій (адаптацій для різних країн)?

In [22]:
def most_localized_videogames(videogames: DataFrame, akas: DataFrame) -> DataFrame:
    localized_games = videogames.join(
        akas, videogames["tconst"] == akas["titleId"], "inner"
    )

    localized_games = localized_games.filter(
        (F.col("language") != "und") &
        (F.col("titleType") == "videoGame")
    )

    localized_games_count = localized_games.groupBy("primaryTitle").agg(
        F.countDistinct("region").alias("localization_count")
    )

    result = localized_games_count.orderBy(desc("localization_count"))

    return result


In [23]:
print("Most localized video games")

localized_games = most_localized_videogames(videogames, akas)
localized_games.show(truncate=False)

total_records = localized_games.count()
print(f"Total number of records in the result: {total_records}")

Most localized video games
+--------------------------------------------------------------+------------------+
|primaryTitle                                                  |localization_count|
+--------------------------------------------------------------+------------------+
|Atelier Yumia: The Alchemist of Memories & the Envisioned Land|17                |
|Monkey King: Hero Is Back                                     |16                |
|SilverFin                                                     |11                |
|Paradigm Paradox                                              |11                |
|Pokémon Legends Z-A                                           |11                |
|Max Payne                                                     |9                 |
|James Bond 007: Goldfinger                                    |9                 |
|James Bond 007: A View to a Kill                              |8                 |
|Sword Art Online: Hollow Fragment               

# Which video games have the highest rating growth in the last year?
## Які відеоігри з найбільшим приростом рейтингу в останньому році?

In [26]:
def top_video_games_by_rating_growth(videogames: DataFrame, ratings: DataFrame) -> DataFrame:
    joined_data = videogames.join(ratings, videogames["tconst"] == ratings["tconst"])

    window_spec = Window.partitionBy("primaryTitle").orderBy("startYear")

    joined_data = joined_data.withColumn(
        "previous_rating",
        F.lag("averageRating").over(window_spec)
    )

    joined_data = joined_data.withColumn(
        "rating_growth", F.round(F.col("averageRating") - F.col("previous_rating"), 2)
    )

    top_growth_games = joined_data.filter(
        (F.col("rating_growth") > 0) & (F.col("numVotes") >= 500)
    ).select(
        "primaryTitle", "startYear", "averageRating", "numVotes", "rating_growth"
    ).orderBy(
        F.desc("rating_growth")
    )

    return top_growth_games


In [27]:
print("Top video games by rating growth")

top_rating_growth_games = top_video_games_by_rating_growth(videogames, ratings)
top_rating_growth_games.show(truncate=False)

total_records = top_rating_growth_games.count()
print(f"Total number of records in the result: {total_records}")

Top video games by rating growth
+------------------------------+---------+-------------+------------------+
|primaryTitle                  |startYear|averageRating|rating_growth     |
+------------------------------+---------+-------------+------------------+
|Wipeout                       |1995     |7.5          |4.7               |
|Peggle                        |2007     |7.5          |3.9               |
|Basketball                    |1978     |6.4          |3.6000000000000005|
|God of War                    |2018     |9.5          |3.2               |
|The Pagemaster                |1994     |7.2          |3.1000000000000005|
|Fist of the North Star        |1995     |6.2          |2.9000000000000004|
|Blasto                        |1998     |8.1          |2.8999999999999995|
|Stronghold                    |2001     |8.1          |2.8999999999999995|
|Freedom Force                 |2002     |7.6          |2.8               |
|T2: Terminator 2: Judgment Day|1992     |6.0          

# Which actors received the most roles in the highest-rated TV episodes in a year?
## Які актори отримали найбільше ролей за рік у найбільш рейтингових ТВ-епізодах?

In [28]:
def top_actors_in_highest_rated_tv_episodes(tv_episodes: DataFrame, principals: DataFrame, ratings: DataFrame,
                                            names: DataFrame) -> DataFrame:
    tv_with_ratings = tv_episodes.join(ratings, tv_episodes["tconst"] == ratings["tconst"], "inner").select(
        tv_episodes["tconst"].alias("tv_tconst"), tv_episodes["startYear"], ratings["averageRating"])

    window_spec = Window.partitionBy("startYear").orderBy(F.desc("averageRating"))

    tv_with_ratings = tv_with_ratings.withColumn("rank", F.dense_rank().over(window_spec))

    top_episodes = tv_with_ratings.filter(F.col("rank") == 1)

    top_actors = top_episodes.join(principals, top_episodes["tv_tconst"] == principals["tconst"], "inner").select(
        principals["nconst"].alias("actor_nconst"), principals["category"], top_episodes["averageRating"],
        top_episodes["startYear"])

    top_actors = top_actors.filter(F.col("category") == "actor")

    top_actors_with_names = top_actors.join(names, top_actors["actor_nconst"] == names["nconst"], "inner").select(
        names["primaryName"], top_actors["actor_nconst"], "averageRating", top_actors["startYear"])

    actor_role_count = top_actors_with_names.groupBy("primaryName", "startYear").agg(
        F.count("actor_nconst").alias("roleCount"), F.avg("averageRating").alias("avgRating")).orderBy(
        F.desc("roleCount"), F.desc("avgRating"))

    return actor_role_count


In [29]:
print("Top actors in highest rated TV episodes with names and ratings")

top_actors_highest_rated = top_actors_in_highest_rated_tv_episodes(tv_episodes, principals, ratings, names)
top_actors_highest_rated.show(truncate=False)

total_records = top_actors_highest_rated.count()
print(f"Total number of records in the result: {total_records}")

Top actors in highest rated TV episodes with names and ratings
+-------------------+---------+---------+---------+
|primaryName        |startYear|roleCount|avgRating|
+-------------------+---------+---------+---------+
|Maurice Carpede    |2001     |254      |10.0     |
|Maurice Carpede    |2000     |210      |10.0     |
|Tyrone Keogh       |2012     |160      |10.0     |
|Tyrone Keogh       |2011     |148      |10.0     |
|Adem Bal           |2024     |119      |10.0     |
|Eduardo Yáñez      |2013     |85       |10.0     |
|Sebastián Rulli    |2013     |84       |10.0     |
|Guillermo Capetillo|2013     |83       |10.0     |
|Enrique Rocha      |2013     |79       |10.0     |
|Francisco Gattorno |2013     |79       |10.0     |
|Tarik Tiryakioglu  |2024     |79       |10.0     |
|Ayaz Ahmed         |2014     |75       |10.0     |
|Dishank Arora      |2014     |75       |10.0     |
|Pulkit Bangia      |2014     |75       |10.0     |
|Utkarsh Gupta      |2014     |75       |10.0     |
|

# Saving to csv

In [32]:
highest_rated_ukraine_titles(videogames, tv_episodes).coalesce(10).write.option("header", "true").csv(
    "/data/results/highest_rated_ukraine_titles.csv")

most_popular_genres_by_count(videogames).coalesce(1).write.option("header", "true").csv(
    "/data/results/most_popular_video_games_genres.csv")

top_recent_videogames(videogames, ratings).coalesce(1).write.option("header", "true").csv(
    "/data/results/top_recent_video_games.csv")

most_localized_videogames(videogames, akas).coalesce(1).write.option("header", "true").csv(
    "/data/results/most_localized_video_games.csv")

top_video_games_by_rating_growth(videogames, ratings).coalesce(1).write.option("header", "true").csv(
    "/data/results/top_video_games_by_rating_growth.csv")

top_actors_in_highest_rated_tv_episodes(tv_episodes, principals, ratings, names).coalesce(1).write.option("header",
                                                                                                          "true").csv(
    "/data/results/top_actors_highest_rated_tv_episodes.csv")
