In [1]:
import sys
import os


from pyspark import SparkConf
from pyspark.sql import SparkSession, Window
import pyspark.sql.types as t
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, rand, split, explode, regexp_replace, corr, desc, row_number, percent_rank, array_contains, rank

from schemas.dataframes import get_episode_df, get_ratings_df, get_name_df, get_principals_df, get_crew_df, get_akas_df, get_basics_df

D:\big-data-project


In [2]:
spark_session = (SparkSession.builder
                             .master('local')
                             .appName('IMDB')
                             .config(conf=SparkConf())
                             .getOrCreate())

In [3]:
episode_df = get_episode_df(spark_session)
ratings_df = get_ratings_df(spark_session)
name_df = get_name_df(spark_session)
principals_df = get_principals_df(spark_session)
crew_df = get_crew_df(spark_session)
akas_df = get_akas_df(spark_session)
basics_df = get_basics_df(spark_session)

## 1) Топ 20 найпопулярніших анімаційних серіалів 2000+ року

In [11]:
most_popular_animes = (basics_df
                  .filter(basics_df.startYear >= 2000)
                  .filter(array_contains(basics_df.genres, 'Animation'))
                  .filter(basics_df.titleType == 'tvSeries')
                  .join(ratings_df, ratings_df.tconst == basics_df.tconst)
                  .filter(ratings_df.numVotes > 70000)
                  .filter(basics_df.runtimeMinutes < 30)
                  .sort(col('averageRating').desc())
                  .limit(20))

most_popular_animes.show()

+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+----------+-------------+--------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|    tconst|averageRating|numVotes|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+----------+-------------+--------+
| tt0417299| tvSeries|Avatar: The Last ...|Avatar: The Last ...|   NULL|     2005|   2008|            23|[Action, Adventur...| tt0417299|          9.3|  348406|
| tt2560140| tvSeries|     Attack on Titan|  Shingeki no Kyojin|   NULL|     2013|   2023|            24|[Action, Adventur...| tt2560140|          9.1|  473468|
| tt2861424| tvSeries|      Rick and Morty|      Rick and Morty|   NULL|     2013|   NULL|            23|[Adventure, Anima...| tt2861424|          9.1|  579498|
| tt1355642| tvSeries|Fullmetal Al

## 2) Найпопулярніші анімаційні серіали по рокам

In [15]:
filtered_basics_df = (basics_df
                  .filter(basics_df.startYear >= 2000)
                  .filter(array_contains(basics_df.genres, 'Animation'))
                  .filter(basics_df.titleType == 'tvSeries')
                  .join(ratings_df, ratings_df.tconst == basics_df.tconst)
                  .filter(ratings_df.numVotes > 30000)
                  .filter(basics_df.runtimeMinutes < 30)
                  .select('primaryTitle', 'startYear', 'averageRating', 'numVotes'))


window_spec = Window.partitionBy("startYear").orderBy(col("averageRating").desc())
ranked_df = filtered_basics_df.withColumn("rank", rank().over(window_spec))

most_popular_animeseries_yearly_df = ranked_df.filter(col("rank") == 1)
most_popular_animeseries_yearly_df.show()

+--------------------+---------+-------------+--------+----+
|        primaryTitle|startYear|averageRating|numVotes|rank|
+--------------------+---------+-------------+--------+----+
|      Justice League|     2001|          8.6|   50429|   1|
|              Naruto|     2002|          8.4|  120987|   1|
| Fullmetal Alchemist|     2003|          8.5|   75820|   1|
|             Monster|     2004|          8.7|   40816|   1|
|Avatar: The Last ...|     2005|          9.3|  348406|   1|
|          Code Geass|     2006|          8.7|   77162|   1|
|   Naruto: Shippuden|     2007|          8.7|  151556|   1|
|Star Wars: The Cl...|     2008|          8.4|  115725|   1|
|Fullmetal Alchemi...|     2009|          9.1|  190417|   1|
|      Adventure Time|     2010|          8.6|  108097|   1|
|       Young Justice|     2010|          8.6|   44493|   1|
|     Hunter x Hunter|     2011|          9.0|  125032|   1|
|       Gravity Falls|     2012|          8.9|  128330|   1|
|     Attack on Titan|  

## 3) Фільми, що дубльовані українською, але не дубльовані російською

In [9]:
filtered_movies = (
    akas_df
    .join(basics_df, akas_df.titleId == basics_df.tconst, 'inner')
    .filter(basics_df.titleType == 'movie')
    .filter(akas_df.language == 'uk')
    .filter(~((col("language") == 'ru') & (col("region") == 'RU')))
    .limit(20)
)

# Show the result
filtered_movies.show()

+---------+--------+-------------------+------+--------+-------------+--------------------+---------------+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|  titleId|ordering|              title|region|language|        types|          attributes|isOriginalTitle|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+--------+-------------------+------+--------+-------------+--------------------+---------------+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0021035|       5|Квартали передмістя|  SUHH|      uk|[imdbDisplay]|                [\N]|          false|tt0021035|    movie|  Suburban Districts|Kvartaly peredmistia|   NULL|     1930|   NULL|            60|                [\N]|
|tt0130223|       5|         Ridna krov|  SUHH|      uk|         [\N]|[t

## 4) 25 Найдовших аніме

In [10]:
window = Window.orderBy(desc('totalMinutes'))

anime_with_ratings_and_episodes = (
    basics_df
    .filter(basics_df.startYear >= 2000)
    .filter(array_contains(basics_df.genres, 'Animation'))
    .filter(basics_df.titleType == 'tvSeries')
    .filter(basics_df.runtimeMinutes < 30)
    .join(
        ratings_df,
        ratings_df.tconst == basics_df.tconst,
        'inner'
    )
    .filter(ratings_df.numVotes > 30000)
    .join(
        episode_df,
        episode_df.parentTconst == basics_df.tconst,
        'left_outer'
    )
    .groupBy(basics_df.tconst, basics_df.primaryTitle, episode_df.seasonNumber, basics_df.runtimeMinutes)
        .agg({'episodeNumber': 'max'})
        .withColumnRenamed('max(episodeNumber)', 'maxEpisodeNumber')
    .groupBy(basics_df.tconst, basics_df.primaryTitle, basics_df.runtimeMinutes)
        .agg({'maxEpisodeNumber': 'sum'})
        .withColumnRenamed('sum(maxEpisodeNumber)', 'totalEpisodes')
    .withColumn('totalMinutes', col('totalEpisodes') * col('runtimeMinutes'))
    .orderBy(col('totalMinutes').desc())
    .limit(17)
    .withColumn('rank', row_number().over(window))
)

anime_with_ratings_and_episodes.show()

+---------+--------------------+--------------+-------------+------------+----+
|   tconst|        primaryTitle|runtimeMinutes|totalEpisodes|totalMinutes|rank|
+---------+--------------------+--------------+-------------+------------+----+
|tt0988824|   Naruto: Shippuden|            24|          500|       12000|   1|
|tt0434665|              Bleach|            24|          383|        9192|   2|
|tt0397306|       American Dad!|            22|          366|        8052|   3|
|tt1561755|       Bob's Burgers|            22|          271|        5962|   4|
|tt1710308|        Regular Show|            23|          246|        5658|   5|
|tt0409591|              Naruto|            24|          220|        5280|   6|
|tt2359704|JoJo's Bizarre Ad...|            24|          190|        4560|   7|
|tt7441658|        Black Clover|            24|          170|        4080|   8|
|tt2098220|     Hunter x Hunter|            24|          148|        3552|   9|
|tt5626028|    My Hero Academia|        

## 5) Аніме, top-5 продубльовані українською

In [56]:
animes_uk_dub = (basics_df
                  .filter(array_contains(basics_df.genres, 'Animation'))
                  .filter(basics_df.titleType == 'tvSeries')
                  .join(ratings_df, ratings_df.tconst == basics_df.tconst)
                  .filter(ratings_df.numVotes > 10)
                  .join(akas_df, akas_df.titleId == basics_df.tconst, 'inner')
                  .filter(akas_df.language == 'uk')
                  .orderBy(col('averageRating').desc())
                  .select('titleType', 'primaryTitle', 'originalTitle', 'startYear', 'title')
                  .limit(5))

animes_uk_dub.show()

+---------+--------------------+--------------------+---------+--------------------+
|titleType|        primaryTitle|       originalTitle|startYear|               title|
+---------+--------------------+--------------------+---------+--------------------+
| tvSeries|        The Simpsons|        The Simpsons|     1989|            Сімпсони|
| tvSeries|            Doraemon|            Doraemon|     1979|            Дораемон|
| tvSeries|            Doraemon|            Doraemon|     1973|            Дораемон|
| tvSeries|The Tom and Jerry...|The New Adventure...|     1980|Том і Джеррі. Ком...|
| tvSeries|The Adventures of...|Priklyucheniya Ne...|     1971|Пригоди Незнайка ...|
+---------+--------------------+--------------------+---------+--------------------+


## 6) Топ-4 фільми з участю Біллі Херрінгтона

In [9]:
billy_df = (name_df
                   .filter(name_df.primaryName == 'Billy Herrington')
                   .select('nconst', 'primaryName', 'birthYear', explode('knownForTitles').alias('filmId'))
                   .join(basics_df, basics_df.tconst == col('filmId'))
                   .join(ratings_df, ratings_df.tconst == basics_df.tconst)
                   .sort(desc(col('averageRating')))
                   .select('primaryName', 'primaryTitle', 'startYear', 'genres', 'averageRating')
                   .orderBy(desc('averageRating'))
                   .limit(4))

billy_df.show()

+----------------+--------------------+---------+--------------------+-------------+
|     primaryName|        primaryTitle|startYear|              genres|averageRating|
+----------------+--------------------+---------+--------------------+-------------+
|Billy Herrington|Playing with Fire...|     2000|[Adult, Crime, Dr...|          9.9|
|Billy Herrington|           Body Shop|     1999|             [Adult]|          8.9|
|Billy Herrington|           Conquered|     2001|             [Adult]|          8.4|
|Billy Herrington|     HotMen CoolBoyz|     2000|             [Adult]|          6.9|
+----------------+--------------------+---------+--------------------+-------------+
