### Import

In [1]:
import sys
import os
import pyspark.sql.types as t
sys.path.insert(0, os.path.dirname(os.getcwd()))

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, rand, split, explode, regexp_replace, corr, desc, countDistinct, row_number, mean, count, array_contains, size

from schemas.dataframes import get_episode_df, get_basics_df, get_akas_df, get_crew_df, get_principals_df, get_ratings_df, get_name_df, project_dir

/app/big-data-project/pukhta
/app/big-data-project
/app


In [2]:
spark_session = (SparkSession.builder
                             .master('local')
                             .appName('test app')
                             .config(conf=SparkConf())
                             .getOrCreate())

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/03 14:59:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
title_episode = get_episode_df(spark_session)

title_basic = get_basics_df(spark_session)

title_akas = get_akas_df(spark_session)

rating = get_ratings_df(spark_session)

crew = get_crew_df(spark_session)

name = get_name_df(spark_session)

principals = get_principals_df(spark_session)

### Query 1 Фільми де рІжесер був сценаристом

In [None]:
films = (
        crew.alias('c').filter(col("c.directors") == col("c.writers"))
        .na.drop()
        .limit(100)
        .join(title_basic, crew['tconst'] == title_basic['tconst'])
        .join(name, crew['directors'][0] == name['nconst'])
        .select('originalTitle', 'primaryName')
        .limit(10)
)


films.show()



+--------------------+--------------------+
|       originalTitle|         primaryName|
+--------------------+--------------------+
|          Miss Jerry|     Alexander Black|
|     The Magic Sword|     Walter R. Booth|
|The Story of the ...|        Charles Tait|
|Attack on a China...|    James Williamson|
| Histoire d'un crime|     Ferdinand Zecca|
|   L'enfant prodigue|        Michel Carré|
|An Awful Skate; o...|Gilbert M. 'Bronc...|
|Los guapos del pa...|   Segundo de Chomón|
|      Se da de comer|   Segundo de Chomón|
| Los sitios de Chile|   Segundo de Chomón|
|La muerte del tirano|       Narciso Cuyàs|
|     An Awful Moment|       D.W. Griffith|
| Balked at the Altar|       D.W. Griffith|
|The Bandit's Wate...|       D.W. Griffith|
|   Behind the Scenes|       D.W. Griffith|
|Betrayed by a Han...|       D.W. Griffith|
|A Calamitous Elop...|       D.W. Griffith|
|The Call of the Wild|       D.W. Griffith|
|     Riña en un café|   Fructuós Gelabert|
|             Dorotea|   Fructuó

                                                                                

### Query 2 Найдовший Гамериканський серіал

In [15]:
merged_data = (
    title_akas.alias('a').filter((col("a.region") == 'US') & (col("a.isOriginalTitle") == 1))
    .join(title_episode.alias('e'), title_akas["titleId"] == title_episode["parentTconst"])
    .select(col("a.title"), col("e.seasonNumber"))
    .orderBy(col("e.seasonNumber").desc())
    .limit(1)
)
                           
merged_data.show()

[Stage 49:>                                                         (0 + 1) / 1]

+--------------------+------------+
|               title|seasonNumber|
+--------------------+------------+
|The New Price Is ...|          52|
+--------------------+------------+



                                                                                

### Query 3 Фільм з найбільшою кількістю акторів

In [4]:
movie_actors_count = (
    title_basic.filter((title_basic.startYear.isNotNull()) & (title_basic.startYear >= 2000) & ((title_basic.titleType == 'movie') | (title_basic.titleType == 'tvMovie')))\
    .join(principals, (title_basic.tconst == principals.tconst))
    .filter(principals.category == 'actor')
    .groupBy(principals.tconst, title_basic.originalTitle)
    .agg(count(principals.tconst).alias('actor_count'))
    .orderBy(col("actor_count").desc())
    .limit(10)
)

movie_actors_count.show()

23/12/03 15:03:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/12/03 15:03:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/12/03 15:03:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/12/03 15:03:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

+----------+--------------------+-----------+
|    tconst|       originalTitle|actor_count|
+----------+--------------------+-----------+
| tt0933766|        The Roommate|         10|
|tt12310790|   Cholo Meat Demons|         10|
| tt0969720|M.O.N.A.Y (Mistey...|         10|
|tt10719600|Tomorrowland: 10 ...|         10|
|tt12206566|     500 Great Goals|         10|
|tt10985028|             The End|         10|
|tt10012854|P.O.W. Open Air a...|         10|
|tt11981948|Liverpool FC vs L...|         10|
|tt10453224|     Year of the Dog|         10|
|tt10443410|       Ninja Drones?|         10|
+----------+--------------------+-----------+



                                                                                

### Query 4 Українське фестивальне кіно

In [9]:
ukrainian_movies = (
    title_akas.filter(title_akas.region == 'UA')
    .withColumn("is_festival",   array_contains(title_akas.types, 'festival'))
    .filter( 'is_festival')
    .select(title_akas.title, title_akas.region, "is_festival")
    .limit(10)
)

ukrainian_movies.show()



+--------------------+------+-----------+
|               title|region|is_festival|
+--------------------+------+-----------+
|    Океанська знайда|    UA|       true|
|Голем: як він при...|    UA|       true|
|Бунтівник без при...|    UA|       true|
|Зловмисники невідомі|    UA|       true|
|Дехто любить гаря...|    UA|       true|
|       Крізь шпарину|    UA|       true|
|  Торік у Марієнбаді|    UA|       true|
|2001 рік: Космічн...|    UA|       true|
|                 Гра|    UA|       true|
|             Якщо...|    UA|       true|
+--------------------+------+-----------+



                                                                                

### Query 5 Середня оцінка фільму по режисеру(сортування за зменшеням рейтингу режисера)

In [45]:
directors_mean_rating = (
    crew.filter(size(crew.directors) == 1)
    .join(rating, crew.tconst == rating.tconst)
    .filter(rating.numVotes > 25000)
    .groupBy(crew.directors)
    .agg(mean(rating.averageRating).alias('mean_rating'))
    .join(name, crew.directors[0] == name.nconst)
    .select(name.primaryName, 'mean_rating')
    .orderBy(col("mean_rating").desc())
    .limit(10)
)
directors_mean_rating.show()



+--------------------+-----------------+
|         primaryName|      mean_rating|
+--------------------+-----------------+
|          Sam Esmail|9.899999618530273|
|         Peter Gould|             9.75|
|        Scott Winant|9.699999809265137|
|      Jonathan Nolan|9.699999809265137|
|       Tetsurô Araki|9.699999809265137|
|      George Mastras|9.699999809265137|
|Konrad Tomaszkiewicz|9.699999809265137|
|        Gordon Smith|9.699999809265137|
|        Nelson Cragg|9.600000381469727|
|      Thomas Schnauz|9.549999952316284|
+--------------------+-----------------+



                                                                                

### Query 6 Фільми/ з Дуэйн Джонсоном які мають рейтинг вище ніж середній по балівуду

In [6]:
bollywood_mean_rating = (
    title_akas
    .filter(title_akas.region == "IN")
    .join(rating, title_akas.titleId == rating.tconst)
    .groupBy(title_akas.region)
    .agg(mean(rating.averageRating).alias('mean_rating'))
    .limit(10)
)


In [None]:
bollywood_mean_rating.show()

In [9]:
bollywood_rating = bollywood_mean_rating.first().mean_rating
bollywood_rating

6.951470009919894

In [8]:

dwayne_best_films =(
          name.filter((name.primaryName == 'Dwayne Johnson') & (name.birthYear == 1972))
              .join(principals, principals.nconst == name.nconst)
              .join(title_basic, title_basic.tconst == principals.tconst)
              .filter((title_basic.titleType == 'movie') | (title_basic.titleType == 'tvMovie'))
              .join(rating, title_basic.tconst == rating.tconst)
              .filter((rating.averageRating > bollywood_rating))
              .select(name.primaryName, title_basic.originalTitle, rating.averageRating)
              .limit(50)
) 

dwayne_best_films.show()

[Stage 19:>                                                         (0 + 1) / 1]

+--------------+--------------------+-------------+
|   primaryName|       originalTitle|averageRating|
+--------------+--------------------+-------------+
|Dwayne Johnson|       Gridiron Gang|          7.1|
|Dwayne Johnson|    Polynesian Power|          8.0|
|Dwayne Johnson|           Fast Five|          7.3|
|Dwayne Johnson|           Furious 6|          7.0|
|Dwayne Johnson|WWE The Rock: The...|          7.0|
|Dwayne Johnson|    Fast & Furious 7|          7.1|
|Dwayne Johnson|Fast & Furious 6 ...|          7.9|
|Dwayne Johnson|               Moana|          7.6|
|Dwayne Johnson|Rock and a Hard P...|          7.3|
|Dwayne Johnson|Fighting with My ...|          7.1|
|Dwayne Johnson|DC League of Supe...|          7.1|
+--------------+--------------------+-------------+



                                                                                