In [7]:
import sys
import os


from pyspark import SparkConf
from pyspark.sql import SparkSession, Window
import pyspark.sql.types as t
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, rand, split, explode, regexp_replace, corr, desc, row_number, percent_rank

from schemas.dataframes import get_episode_df, get_ratings_df, get_name_df, get_principals_df, get_crew_df, get_akas_df, get_basics_df

In [2]:
spark_session = (SparkSession.builder
                             .master('local')
                             .appName('IMDB')
                             .config(conf=SparkConf())
                             .getOrCreate())

In [3]:
episode_df = get_episode_df(spark_session)
ratings_df = get_ratings_df(spark_session)
name_df = get_name_df(spark_session)
principals_df = get_principals_df(spark_session)
crew_df = get_crew_df(spark_session)
akas_df = get_akas_df(spark_session)
basics_df = get_basics_df(spark_session)

## 1) 10 фільмів українською мовою з найвищим рейтингом

In [None]:
ukraine_akas_df = (akas_df
                   .filter(akas_df.language == 'uk')
                   .join(ratings_df, ratings_df.tconst == akas_df.titleId)
                   .filter(ratings_df.numVotes > 100)
                   .sort(col('averageRating').desc())
                   .limit(20)
                   )

ukraine_akas_df.show()


## 2) Топ 20 фільмів із найбільшою кількістю локальних назв

In [None]:
top_20_akas_df = (akas_df
                  .dropDuplicates(['title'])
                  .groupBy('titleId').count()
                  .sort(desc(col('count')))
                  #.dropDuplicates(['titleId'])
                  .limit(20)
                  .join(basics_df, basics_df.tconst == col('titleId'))
                  .select('titleId', 'primaryTitle', 'startYear', 'count')
                  )

window = Window.orderBy(desc('count'))
res_df = top_20_akas_df.withColumn('rank', row_number().over(window))

res_df.show(20, False)

## 3) Всі унікальні локальні назви для фільму Home Alone 2: Lost in New York (1992)

In [None]:
pokemon_df = (basics_df
              .filter((basics_df.primaryTitle == 'Home Alone 2: Lost in New York') & (basics_df.startYear == 1992))
              .join(akas_df, akas_df.titleId == basics_df.tconst)
              .dropDuplicates(['title'])
              .select('titleId', 'title', 'region')
              )
pokemon_df.show(100, False)

## 4) Топ 5 найпопулярніших фільмів із раяном гослінгом за весь час, сортування по рейтингу

In [None]:
window = Window.orderBy(desc('averageRating'))

ryan_gosling_df = (name_df
                   .filter(name_df.primaryName == 'Ryan Gosling')
                   .select('nconst', 'primaryName', 'birthYear', explode('knownForTitles').alias('filmId'))
                   .join(basics_df, basics_df.tconst == col('filmId'))
                   .join(ratings_df, ratings_df.tconst == basics_df.tconst)
                   .sort(desc(col('averageRating')))
                   .select('primaryName', 'primaryTitle', 'startYear', 'runtimeMinutes', 'genres', 'averageRating', 'numVotes')
                   .orderBy(desc('averageRating'))
                   .limit(5)
                   )

ryan_gosling_df = ryan_gosling_df.withColumn('rank', row_number().over(window))
ryan_gosling_df.show()

## 5) Процентне співвідношення кількості фільмів по роках починаючи із 2000 року

In [17]:
window1 = Window.orderBy('startYear')
window2 = Window.orderBy(desc('percent'))

films_by_year_df = (basics_df
                    .filter((basics_df.startYear >= 2000) & (basics_df.startYear <= 2024) 
                            & ((basics_df.titleType == 'movie') | (basics_df.titleType == 'tvMovie')))
                    .groupBy('startYear').count()
                    )
                
res = films_by_year_df.withColumn('percent', percent_rank().over(window1))
res.show(200, False)

+---------+-----+--------------------+
|startYear|count|percent             |
+---------+-----+--------------------+
|2000     |7662 |0.0                 |
|2001     |8231 |0.041666666666666664|
|2002     |8537 |0.08333333333333333 |
|2003     |9045 |0.125               |
|2004     |9747 |0.16666666666666666 |
|2005     |11009|0.20833333333333334 |
|2006     |11603|0.25                |
|2007     |12585|0.2916666666666667  |
|2008     |13630|0.3333333333333333  |
|2009     |15147|0.375               |
|2010     |15754|0.4166666666666667  |
|2011     |16998|0.4583333333333333  |
|2012     |18104|0.5                 |
|2013     |18657|0.5416666666666666  |
|2014     |19860|0.5833333333333334  |
|2015     |20814|0.625               |
|2016     |22253|0.6666666666666666  |
|2017     |22429|0.7083333333333334  |
|2018     |22363|0.75                |
|2019     |21986|0.7916666666666666  |
|2020     |18697|0.8333333333333334  |
|2021     |20936|0.875               |
|2022     |21846|0.916666

## 6) Всі серії футурами по сезонах

In [47]:
futurama_df = (episode_df
                .filter(episode_df.parentTconst == 
                        basics_df
                           .filter((basics_df.primaryTitle == 'Futurama') & (basics_df.titleType == 'tvSeries') & (basics_df.startYear == 1999))
                           .select('tconst').collect()[0][0]
                        )
                .join(ratings_df.withColumnRenamed('tconst', 'rtconst'), 
                      col('rtconst') == episode_df.tconst)
                .join(basics_df.withColumnRenamed('tconst', 'btconst'),
                      col('btconst') == episode_df.tconst)
                .orderBy('seasonNumber', 'episodeNumber')
                .select('tconst', 'seasonNumber', 'episodeNumber', 'primaryTitle', 'startYear', 'averageRating', 'numVotes')
                )

futurama_df.show(1000, False)

+----------+------------+-------------+----------------------------------------+---------+-------------+--------+
|tconst    |seasonNumber|episodeNumber|primaryTitle                            |startYear|averageRating|numVotes|
+----------+------------+-------------+----------------------------------------+---------+-------------+--------+
|tt0584449 |1           |1            |Space Pilot 3000                        |1999     |8.6          |4816    |
|tt0756891 |1           |2            |The Series Has Landed                   |1999     |8.0          |3871    |
|tt0756882 |1           |3            |I, Roommate                             |1999     |8.2          |3765    |
|tt0756885 |1           |4            |Love's Labours Lost in Space            |1999     |8.1          |3593    |
|tt0584438 |1           |5            |Fear of a Bot Planet                    |1999     |7.8          |3385    |
|tt0584425 |1           |6            |A Fishful of Dollars                    |1999    