## Bibliotecas

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql.window import Window

In [2]:
import findspark

# Point this Python process at the local Spark installation.
# NOTE(review): the pyspark imports in the first cell execute before this call,
# so findspark cannot be what makes them importable. Consider running this
# before the pyspark imports, or dropping it if pyspark is already installed
# in the environment -- confirm which applies.
findspark.init()

In [3]:
# Create the session (or reuse an already running one) with Kryo serialization.
spark = SparkSession.builder.config(
    'spark.serializer', "org.apache.spark.serializer.KryoSerializer"
).getOrCreate()

In [4]:
# Base directory of the IMDb datasets, relative to the kernel's working directory.
# The trailing slash matters: dataset folder names are appended with `+`.
imdb_path = '../data/imdb/'

In [13]:
# Load the title metadata and the ratings tables from their parquet folders.
df_titles = spark.read.parquet(imdb_path + 'title_basics')
df_ratings = spark.read.parquet(imdb_path + 'title_ratings')

## Questão 6

In [7]:
# Preview five rows of the titles table as a pandas DataFrame (rich display).
df_titles.limit(5).toPandas()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [8]:
# List the distinct values found in the titleType column.
df_titles.select('titleType').distinct().toPandas()

Unnamed: 0,titleType
0,tvSeries
1,tvMiniSeries
2,tvMovie
3,tvPilot
4,tvEpisode
5,movie
6,tvSpecial
7,video
8,videoGame
9,tvShort


In [9]:
# Count the titles of type "short" that started in 2015.
# The year is compared as the string "2015"; presumably startYear is stored as
# text -- TODO confirm against the parquet schema.
df_titles.filter('titleType = "short" and startYear = "2015"').count()

45948

In [14]:
# NOTE(review): this cell is identical to the previous count. Judging by the
# execution counters it was re-run after the data was reloaded; keep one copy.
df_titles.filter('titleType = "short" and startYear = "2015"').count()

45948

## Questão 11

### Forma Errada

In [44]:
# Preview five rows again: `genres` is a single comma-separated string per title,
# which is why the cells below split and explode it.
df_titles.limit(5).toPandas()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [43]:
# "Wrong way", kept on purpose as a counter-example: the denominator `total` is
# the sum of the per-genre counts, so a title that lists several genres is
# counted once per genre and the shares are diluted.
(
    df_titles
    .withColumn('genres', f.explode(f.split(f.col('genres'), ',')))
    .filter("startYear = 2018")
    .groupBy('startYear', 'genres')
    .agg(f.count(f.lit(1)).alias('count'))
    .withColumn(
        'total',
        f.sum(f.col('count')).over(Window.partitionBy('startYear')),
    )
    .withColumn('perc', f.col('count') / f.col('total'))
    .toPandas()
)

Unnamed: 0,startYear,genres,count,total,perc
0,2018,Action,15575,624861,0.024926
1,2018,Talk-Show,47492,624861,0.076004
2,2018,Adult,14749,624861,0.023604
3,2018,Horror,9619,624861,0.015394
4,2018,Sport,10332,624861,0.016535
5,2018,Comedy,78809,624861,0.126122
6,2018,Reality-TV,27601,624861,0.044171
7,2018,Drama,89367,624861,0.143019
8,2018,Thriller,8154,624861,0.013049
9,2018,Animation,17948,624861,0.028723


### Forma Correta

In [45]:
# "Right way": the denominator `n_titles` is the number of titles in the year,
# counted BEFORE `genres` is exploded, so each title weighs exactly once and
# `perc` is the share of titles that carry a given genre.
# Note that the "\N" placeholder is treated as a genre of its own here.
(
    df_titles
    # Restrict to the target year first: the window below then only counts that
    # year's rows instead of every year in the dataset. The result is unchanged
    # because the window is partitioned by the same column that is filtered.
    .filter("startYear = 2018")
    .withColumn('n_titles', f.count(f.lit(1)).over(Window.partitionBy('startYear')))
    .withColumn('genres', f.explode(f.split('genres', ',')))
    .groupby('startYear', 'genres', 'n_titles')
    .count()
    .withColumn('perc', f.col('count')/f.col('n_titles'))
    .toPandas()
)

Unnamed: 0,startYear,genres,n_titles,count,perc
0,2018,Drama,402244,89367,0.222171
1,2018,Comedy,402244,78809,0.195923
2,2018,\N,402244,41740,0.103768
3,2018,Documentary,402244,37840,0.094072
4,2018,Short,402244,66482,0.165278
5,2018,Action,402244,15575,0.03872
6,2018,Family,402244,16997,0.042255
7,2018,Sci-Fi,402244,5644,0.014031
8,2018,Thriller,402244,8154,0.020271
9,2018,Romance,402244,25002,0.062156


## Questão 14

In [22]:
# Squaring a Python int yields an int; presumably checked to choose the UDF
# return type used below.
type(5**2)

int

In [25]:
@f.udf(returnType = t.IntegerType())
def square_udf(num):
    """Spark UDF returning ``num`` squared, or ``None`` for a null input.

    The null check is explicit (``is None``) so that a legitimate value of 0 is
    squared to 0 instead of being mistaken for a missing value.

    NOTE(review): the declared return type is a 32-bit integer; squares that do
    not fit in it may not be representable -- confirm whether LongType is needed.
    """
    if num is None:
        return None
    return num**2

In [28]:
def square(num):
    """Return ``num`` squared, or ``None`` when ``num`` is ``None``.

    The check is ``is None`` rather than truthiness so that 0 maps to 0 instead
    of being treated as a missing value.
    """
    if num is None:
        return None
    return num**2
# Wrap the plain function as a Spark UDF. The return type is given explicitly:
# without it `f.udf` falls back to a string return type, and this assignment
# rebinds the name of the integer-typed UDF defined with the decorator.
square_udf = f.udf(square, returnType=t.IntegerType())

In [26]:
# Cast the runtime to an integer column, square it through the Python UDF and
# show a five-row sample.
(
    df_titles
    .withColumn('runtimeMinutes', f.col('runtimeMinutes').cast(t.IntegerType()))
    .withColumn('runtime_sqr', square_udf('runtimeMinutes'))
    .limit(5)
    .toPandas()
)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,runtime_sqr
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",1
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",25
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",16
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short",144
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",1


In [32]:
# Keep the split genre list next to one exploded row per (title, genre) pair.
df_int = (
    df_titles
    .withColumn('genres_array', f.split(f.col('genres'), ','))
    .withColumn('genres_explode', f.explode('genres_array'))
)

In [33]:
# Collect the distinct genre labels to the driver as a plain Python list.
possible_genres = [
    row[0]
    for row in df_int.select('genres_explode').distinct().collect()
]

In [34]:
# Display the collected genre labels (the "\N" placeholder is among them).
possible_genres

['Crime',
 'Romance',
 'Thriller',
 'Adventure',
 '\\N',
 'Drama',
 'War',
 'Documentary',
 'Reality-TV',
 'Family',
 'Fantasy',
 'Game-Show',
 'Adult',
 'History',
 'Mystery',
 'Musical',
 'Animation',
 'Music',
 'Film-Noir',
 'Short',
 'Horror',
 'Western',
 'Biography',
 'Comedy',
 'Sport',
 'Action',
 'Talk-Show',
 'Sci-Fi',
 'News']

In [40]:
# NOTE(review): this import sits mid-notebook; it belongs with the other imports
# in the first cell so that dependencies are visible up front.
import pandas as pd
# Show every column when a wide pandas DataFrame (e.g. the pivot below) is rendered.
pd.set_option('display.max_columns', None)

In [41]:
# One-hot style pivot: one row per title with a 0/1 count per genre, then the
# columns are summed to obtain the number of titles carrying each genre.
# The final aggregation has no grouping keys, so the result is a single row.
(
    df_titles
    .withColumn('genres_array', f.split(f.col('genres'), ','))
    .withColumn('genres_explode', f.explode('genres_array'))
    .groupBy('tconst', 'primaryTitle')
    .pivot('genres_explode', possible_genres)
    .count()
    .na.fill(0, subset=possible_genres)
    .agg(*(f.sum(genre).alias(genre) for genre in possible_genres))
    .limit(5)
    .toPandas()
)

Unnamed: 0,Crime,Romance,Thriller,Adventure,\N,Drama,War,Documentary,Reality-TV,Family,Fantasy,Game-Show,Adult,History,Mystery,Musical,Animation,Music,Film-Noir,Short,Horror,Western,Biography,Comedy,Sport,Action,Talk-Show,Sci-Fi,News
0,351447,724729,134054,324325,643012,2247995,29827,764885,423455,571470,174119,252533,242704,114975,162448,63122,406284,394008,763,1021850,146400,27912,87738,1653725,178594,334580,900198,96515,524662


In [41]:
# NOTE(review): this cell is an exact duplicate of the previous pivot cell (same
# code, same execution counter, same output); keep only one copy.
# It pivots the exploded genres into one column per genre for each title and
# then sums every genre column into a single totals row.
(
    df_titles
    .withColumn('genres_array', f.split('genres', ','))
    .withColumn('genres_explode', f.explode(f.col('genres_array')))
    .groupby('tconst', 'primaryTitle')
    .pivot('genres_explode', possible_genres)
    .count()
    .fillna(0, subset=possible_genres)
    .agg(*[f.sum(c).alias(c) for c in possible_genres])
    .limit(5)
    .toPandas()
)

Unnamed: 0,Crime,Romance,Thriller,Adventure,\N,Drama,War,Documentary,Reality-TV,Family,Fantasy,Game-Show,Adult,History,Mystery,Musical,Animation,Music,Film-Noir,Short,Horror,Western,Biography,Comedy,Sport,Action,Talk-Show,Sci-Fi,News
0,351447,724729,134054,324325,643012,2247995,29827,764885,423455,571470,174119,252533,242704,114975,162448,63122,406284,394008,763,1021850,146400,27912,87738,1653725,178594,334580,900198,96515,524662
