In [1]:
import sys
import os

sys.path.insert(0, os.path.dirname(os.getcwd()))

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, rand, split, explode, regexp_replace, corr

from schemas.dataframes import get_episode_df, get_basics_df, get_akas_df, get_crew_df, get_principals_df, get_ratings_df, get_name_df

C:\Users\Dima\Desktop\Навчання\7 семестр\Біг дата\big-data-project


In [2]:
spark_session = (SparkSession.builder
                             .master('local')
                             .appName('test app')
                             .config(conf=SparkConf())
                             .getOrCreate())

In [3]:
title_episode = get_episode_df(spark_session)

title_crew = get_crew_df (spark_session)
 
title_principals = get_principals_df (spark_session)

title_name = get_name_df(spark_session)

title_basic = get_basics_df(spark_session)

title_akas = get_akas_df(spark_session)

rating = get_ratings_df(spark_session)

№1: Актори з кількістю ролей в фільмах для дорослих

In [4]:
from pyspark.sql.functions import count, array_contains, desc

# Приєднання title_principals із title_basic
adult_actors = (
    title_principals
    .join(title_basic, title_principals['tconst'] == title_basic['tconst'], 'inner')
    .filter((array_contains(title_basic['genres'], 'Adult')) & (title_principals["category"] == "actor"))
    .groupBy(title_principals['nconst'])
    .agg(count(title_principals['tconst']).alias("numAdultFilms"))
    .orderBy(desc("numAdultFilms"))
)

# Отримання імен акторів з title_name за допомогою nconst
result = (
    adult_actors
    .join(title_name, adult_actors['nconst'] == title_name['nconst'], 'inner')
    .select(adult_actors['numAdultFilms'], title_name['primaryName'])
)

result.show()


+-------------+-----------------+
|numAdultFilms|      primaryName|
+-------------+-----------------+
|           33|   David Ashfield|
|           13|     David Cannon|
|            1|Michael Carpenter|
|           10|       Dan Cooper|
|            3|       Nic Cramer|
|            1|  Jackson Dedeaux|
|            1|      Arem Fisher|
|            6|      Jeremy Iron|
|            8|    Jeremy Joshua|
|            1|   Stephen Lester|
|           16|      David Luger|
|           10|      Nick Marino|
|           25|   Johnny Mercury|
|            1|     Brad Philips|
|            6| Christopher Rage|
|            2|     Steven Ryder|
|          421|   Julian St. Jox|
|           12|      Damian Zeus|
|            3|      Charlie Boy|
|            2|     Robert Horne|
+-------------+-----------------+


№2: Фільми для дорослих німецькою мовою

In [5]:
from pyspark.sql.functions import col, rand, array_contains

# Об'єднання даних за допомогою titleId як ключа
joined_df = title_akas.join(title_basic, title_akas['titleId'] == title_basic['tconst'], 'inner')

# Фільтрація для фільмів для дорослих німецькою мовою
adult_german_movies = (
    joined_df
    .filter((array_contains(col('genres'), 'Adult')) & (col('language') == 'de'))
    .orderBy(rand())
    .select("primaryTitle", "originalTitle", "language", "startYear", "genres")
    .limit(10)
)

# Виведення результатів
adult_german_movies.show()

+--------------------+--------------------+--------+---------+-------+
|        primaryTitle|       originalTitle|language|startYear| genres|
+--------------------+--------------------+--------+---------+-------+
|Episode dated 30 ...|Episode dated 30 ...|      de|     2013|[Adult]|
|Episode dated 15 ...|Episode dated 15 ...|      de|     2019|[Adult]|
|Episode dated 23 ...|Episode dated 23 ...|      de|     2013|[Adult]|
|Episode dated 18 ...|Episode dated 18 ...|      de|     2013|[Adult]|
|Episode dated 3 J...|Episode dated 3 J...|      de|     2014|[Adult]|
|Episode dated 20 ...|Episode dated 20 ...|      de|     2015|[Adult]|
|Episode dated 27 ...|Episode dated 27 ...|      de|     2020|[Adult]|
|Episode dated 26 ...|Episode dated 26 ...|      de|     2014|[Adult]|
|Episode dated 5 D...|Episode dated 5 D...|      de|     2014|[Adult]|
|Episode dated 10 ...|Episode dated 10 ...|      de|     2017|[Adult]|
+--------------------+--------------------+--------+---------+-------+


№3: Продубльовані фільми українською в період з 1945 по 1991 рік

In [6]:
ukr_movies = (
    title_akas
    .join(title_basic, title_akas['titleId'] == title_basic['tconst'], 'inner')
    .filter((col("language") == "uk") & (col("startYear") < 1991) & (col("startYear") > 1945))
    .orderBy(rand())
    .select("primaryTitle", "originalTitle", "language", "startYear", "genres")
    .limit(10)
)

ukr_movies.show()

+--------------------+--------------------+--------+---------+--------------------+
|        primaryTitle|       originalTitle|language|startYear|              genres|
+--------------------+--------------------+--------+---------+--------------------+
|Seym vykhodit iz ...|Seym vykhodit iz ...|      uk|     1962|             [Drama]|
| Vremya: moskovskoye| Vremya: moskovskoye|      uk|     1977|             [Drama]|
| U prizrakov v plenu| U prizrakov v plenu|      uk|     1984|             [Drama]|
|Curse of Snakes V...|  Klatwa doliny wezy|      uk|     1988|[Adventure, Sci-F...|
|             Alyonka|             Alyonka|      uk|     1962|            [Comedy]|
|      Love and Doves|     Lyubov i golubi|      uk|     1985|[Comedy, Drama, R...|
|Posilka dlja Marg...|Posilka dlja Marg...|      uk|     1990|             [Short]|
|          Beztalanna|          Beztalanna|      uk|     1966|    [Drama, Romance]|
|          Kievlyanka|          Kievlyanka|      uk|     1958|             [

№4: Які телесеріали з німецькою озвучкою мають найбільшу кількість сезонів?

In [7]:
de_tv_series = (
    title_akas
    .join(title_basic, title_akas['titleId'] == title_basic['tconst'], 'inner')
    .join(rating, title_basic['tconst'] == rating['tconst'])
    .filter((col("language") == "de") & (col("titleType") == "tvSeries"))
    .join(title_episode, title_basic['tconst'] == title_episode['parentTconst'], 'left_outer')
    .groupBy("primaryTitle", "language")
    .agg({"seasonNumber": "max"})
    .withColumnRenamed("max(seasonNumber)", "maxSeasonNumber")
    .orderBy(col("maxSeasonNumber").desc())
)

de_tv_series.show()


+--------------------+--------+---------------+
|        primaryTitle|language|maxSeasonNumber|
+--------------------+--------+---------------+
|Jeux sans frontières|      de|             30|
|The Jimmy Star Sh...|      de|             28|
|Perry Como's Kraf...|      de|             19|
|CSI: Crime Scene ...|      de|             15|
|New York Philharm...|      de|             15|
|Married... with C...|      de|             11|
| Hubert ohne Staller|      de|             11|
|               Druck|      de|              8|
|               Elite|      de|              8|
|      The Loud House|      de|              7|
|Alfred Hitchcock ...|      de|              7|
|Little Mosque on ...|      de|              6|
|Alvinnn!!! And th...|      de|              6|
|      Peaky Blinders|      de|              6|
|           The Nanny|      de|              6|
|   The Dragon Prince|      de|              6|
|             Dynasty|      de|              5|
|Miraculous: Tales...|      de|         

№5: Які режисери мають найбільшу кількість короткометражок?

In [8]:
from pyspark.sql.functions import count, desc

# Приєднання title_crew і title_basic
short_film_directors = (
    title_crew
    .join(title_basic, title_crew['tconst'] == title_basic['tconst'], 'inner')
    .filter((col("titleType") == "short"))
    .groupBy(title_crew['directors'])
    .agg(count(title_crew['tconst']).alias("numShortFilms"))
    .orderBy(desc("numShortFilms"))
)

# Отримання імен режисерів з title_name за допомогою nconst
result = (
    short_film_directors
    .join(title_name, array_contains(short_film_directors['directors'], title_name['nconst']), 'inner')
    .select(short_film_directors['numShortFilms'], title_name['primaryName'])
)

result.show()


+-------------+-------------------+
|numShortFilms|        primaryName|
+-------------+-------------------+
|          105|        Otis Turner|
|           32|        Ray McCarey|
|           13|       Stuart Paton|
|            1|      Eduardo Serra|
|            7|        Stephen Low|
|            4|     Myriam Braniff|
|            1|    Rees A. Savidis|
|            1|      Margaret Dodd|
|            1|  Juan Miguel Lamet|
|            1|Manuel López Yubero|
|            1|          Dror Sabo|
|            3|   Carsten Fiebeler|
|            5|          Jan Nemec|
|            4|          José Bohr|
|           26|  Stjepan Zaninovic|
|            1|   Nicolai Albrecht|
|            3|        Jonas Elmer|
|            1|       Jack Couffer|
|            3|     Alan Cullimore|
|           11|     Grant Crabtree|
+-------------+-------------------+


№6: Топ-10 Актори з найкращою оцінкою фільмів, де вони знімались

In [11]:
# Об'єднайте файли
short_films_ratings = (
    title_akas
    .join(title_basic, title_akas['titleId'] == title_basic['tconst'], 'inner')
    .join(rating, title_basic['tconst'] == rating['tconst'])
    .filter((col("titleType") == "short"))
    .select("primaryTitle", "averageRating")
)

# Знайдіть фільм із найвищим рейтингом серед короткометражок
top_short_film = (
    short_films_ratings
    .orderBy(col("averageRating").desc())
    .limit(10)
)

# Виведіть результат
top_short_film.show()

+-------------------+-------------+
|       primaryTitle|averageRating|
+-------------------+-------------+
|      Edna balgarka|         10.0|
|Closed for Business|         10.0|
|The Last Days of Ki|         10.0|
| A Boy and His Hand|         10.0|
|      Edna balgarka|         10.0|
|Closed for Business|         10.0|
|             Heroes|         10.0|
|         Hot Doggie|         10.0|
|             Heroes|         10.0|
|         Hot Doggie|         10.0|
+-------------------+-------------+
