In [55]:
from pyspark.sql import functions as f
from pyspark.sql.types import *
from pyspark.sql import Window

In [56]:
df_titles = spark.read.csv('title_basics.tsv', header=True, sep='\t')
df_ratings = spark.read.csv('title_ratings.tsv', header=True, sep='\t')

In [3]:
df_titles.dtypes

[('tconst', 'string'),
 ('titleType', 'string'),
 ('primaryTitle', 'string'),
 ('originalTitle', 'string'),
 ('isAdult', 'string'),
 ('startYear', 'string'),
 ('endYear', 'string'),
 ('runtimeMinutes', 'string'),
 ('genres', 'string')]

In [17]:
(df_titles
 .select('titleType')
 .filter((f.col('titleType').isin('tvMovie', 'movie')) 
         & (f.col('startYear') == '2015'))
 .count())

19987

In [3]:
df_titles.select('genres').limit(5).toPandas()

Unnamed: 0,genres
0,"Documentary,Short"
1,"Animation,Short"
2,"Animation,Comedy,Romance"
3,"Animation,Short"
4,"Comedy,Short"


In [10]:
df_titles_genres = (df_titles
 .withColumn('genre', f.explode(f.split(f.col('genres'), ',')))
 .groupBy(f.col('genre')).agg(f.count('genre').alias('qtd'))
 .select('genre', 'qtd')
 .orderBy(f.col('qtd').desc())
 .toPandas())

Unnamed: 0,genre,qtd
0,Drama,2247995
1,Comedy,1653725
2,Short,1021850
3,Talk-Show,900198
4,Documentary,764885
5,Romance,724729
6,\N,643012
7,Family,571470
8,News,524662
9,Reality-TV,423455


In [12]:
df_ratings.dtypes

[('tconst', 'string'), ('averageRating', 'string'), ('numVotes', 'string')]

In [19]:
df_join = (df_titles.join(df_ratings, df_titles['tconst'] == df_ratings['tconst'])).drop(df_ratings['tconst'])

In [65]:
w = Window.partitionBy('genre').orderBy('PrimaryTitle')

In [66]:
(df_titles
 .filter(f.col('startYear') == '2018')
 .withColumn('genre', f.explode(f.split(f.col('genres'), ',')))
 .withColumn('perc', f.percent_rank().over(w))
 .select('genre', 'perc')
 .filter(f.col('genre') == 'Comedy')
 .toPandas()
)

Unnamed: 0,genre,perc
0,Comedy,0.000000
1,Comedy,0.000013
2,Comedy,0.000025
3,Comedy,0.000038
4,Comedy,0.000051
...,...,...
78804,Comedy,0.999949
78805,Comedy,0.999962
78806,Comedy,0.999975
78807,Comedy,0.999987


In [67]:
(df_titles
 .filter(f.col('startYear') == '2018')
 .withColumn('genre', f.explode(f.split(f.col('genres'), ',')))
 .count()
)

624861

In [60]:
df_titles.withColumn('genre', f.explode(f.split(f.col('genres'), ','))).select('genre').distinct().toPandas()

Unnamed: 0,genre
0,Crime
1,Romance
2,Thriller
3,Adventure
4,\N
5,Drama
6,War
7,Documentary
8,Reality-TV
9,Family


In [39]:
def sqr_divide(value): 

    return (value**2)/2

sqr_divide_udf = f.udf(sqr_divide, IntegerType())

In [40]:
(
   df_ratings
    .withColumn('averageRating', f.col('averageRating').cast('double'))
    .select(sqr_divide_udf('averageRating').alias('averageRating'))
    .agg(f.round(f.mean('averageRating'), 3).alias('averageRating'))
    .toPandas()
)

Unnamed: 0,averageRating
0,
