In [None]:
from collections import Counter

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as ps
from plotly.subplots import make_subplots


# Load the Netflix titles CSV, using each row's show_id as the index.
df = pd.read_csv(
    'data/netflix_titles.csv',
    index_col='show_id',
)

In [None]:
# Display the frame loaded from netflix_titles.csv as the cell's rich output.
df

In [None]:
# Collect every distinct genre label attached to a movie.
# "listed_in" holds a ', '-separated list of genres per title; non-string
# entries (e.g. missing values) are skipped instead of raising on .split().
# Sorting gives the same order on every kernel restart, which a bare set
# of strings does not guarantee.
movie_genres = sorted({
    genre
    for listing in df.loc[df["type"] == 'Movie', "listed_in"]
    if isinstance(listing, str)
    for genre in listing.split(', ')
})
movie_genres

In [None]:
import math

def getDurationInMinutes(duration_string):
    """Return the leading integer of a duration label.

    The text before the first space is converted with ``int`` (for example
    ``"90 min"`` gives ``90``). The unit that follows is not inspected.

    Args:
        duration_string: The duration label; any non-string value
            (such as a missing entry) is accepted.

    Returns:
        The leading integer, or 0 when the input is not a string.

    Raises:
        ValueError: If the text before the first space is not an integer.
    """
    if not isinstance(duration_string, str):
        return 0
    leading_token, _, _ = duration_string.partition(' ')
    return int(leading_token)

# Average duration of the movies listed under each movie genre.
#
# Only rows whose type is 'Movie' are considered, and a title counts for a
# genre only when that genre is one of its ', '-separated "listed_in" labels.
# A substring/regex search over the whole frame would also pick up titles of
# other types whose label merely contains the genre name, and their duration
# numbers would then be averaged in as if they were minutes.
movie_rows = df.loc[df["type"] == 'Movie', ["listed_in", "duration"]]
# Keep only rows where both fields are usable strings.
movie_rows = movie_rows[
    movie_rows["listed_in"].map(lambda value: isinstance(value, str))
    & movie_rows["duration"].map(lambda value: isinstance(value, str))
]
movie_minutes = movie_rows["duration"].map(getDurationInMinutes)
movie_genre_lists = movie_rows["listed_in"].str.split(', ')

average_movie_duration_per_genre = {
    # .mean() of an empty selection is NaN, so a genre without any usable
    # movie still gets an entry.
    genre: movie_minutes[
        movie_genre_lists.map(lambda labels, genre=genre: genre in labels)
    ].mean()
    for genre in movie_genres
}
average_movie_duration_per_genre

In [None]:
# Two-column table (genre, average_duration) built from the per-genre averages.
# The positional index keeps the name 'i'.
genres_average_duration_df = pd.DataFrame(
    list(average_movie_duration_per_genre.items()),
    columns=['genre', 'average_duration'],
).rename_axis('i')
# Exclude the catch-all 'Movies' label. A boolean filter is a no-op when the
# label is absent, whereas looking up its position and dropping it would
# raise IndexError in that case.
genres_average_duration_df = genres_average_duration_df[
    genres_average_duration_df["genre"] != 'Movies'
]
genres_average_duration_df

In [None]:
# Horizontal chart of average duration per genre; genres are ordered by their
# bar total, ascending.
(
    px.histogram(genres_average_duration_df, x="average_duration", y="genre")
    .update_yaxes(categoryorder='total ascending')
    .update_layout(
        xaxis_title="Average duration (min)",
        yaxis_title="Movie genres",
    )
)

In [None]:
# Count how many titles each director is credited on.
# "director" holds a ', '-separated list of names; non-string entries
# (e.g. missing values) are skipped.
directors = [
    name
    for credit in df["director"]
    if isinstance(credit, str)
    for name in credit.split(', ')
]
# A single Counter pass replaces calling list.count() once per unique name,
# which rescanned the full list every time.
director_counts = Counter(directors)
unique_directors = list(director_counts)
# most_common() orders by descending count, so the dict is sorted from the
# most to the least credited director.
number_of_occurrences_per_director = dict(director_counts.most_common())
number_of_occurrences_per_director

In [None]:
# Tabulate the per-director counts and keep the first ten rows.
directors_df = pd.DataFrame(
    list(number_of_occurrences_per_director.items()),
    columns=["Name", "Number of movies/TV series"],
)
top_10_directors_df = directors_df.head(10)
top_10_directors_df

In [None]:
# Render the top-10 directors frame as a plotly table.
# go.Table takes one list per column, hence the transpose of the frame.
fig = go.Figure(
    data=[
        go.Table(
            header={
                "values": top_10_directors_df.columns.tolist(),
                "fill_color": 'paleturquoise',
                "align": 'center',
            },
            cells={
                "values": top_10_directors_df.T.to_numpy().tolist(),
                "fill_color": 'lavender',
                "align": 'center',
            },
        )
    ]
)
fig.show()

In [None]:
# Release year and numeric duration for movies only.
# .loc[...].copy() yields an independent frame, so overwriting "duration"
# below does not assign into a slice of df (the chained-indexing pattern
# that triggers pandas' SettingWithCopyWarning).
movie_duration_by_release_year_df = df.loc[
    df["type"] == 'Movie', ["release_year", "duration"]
].copy()
# getDurationInMinutes returns 0 for non-string (e.g. missing) durations.
movie_duration_by_release_year_df["duration"] = (
    movie_duration_by_release_year_df["duration"].map(getDurationInMinutes)
)
movie_duration_by_release_year_df

In [None]:
# Distinct country names appearing in the comma-separated "country" column.
# Splitting on ',' and stripping each piece tolerates irregular spacing and
# leading/trailing separators. Empty pieces are dropped: an empty name would
# otherwise match every row in any later substring search.
list_of_countries = list({
    name.strip()
    for entry in df["country"]
    if isinstance(entry, str)  # skip missing values
    for name in entry.split(',')
    if name.strip()
})

In [None]:
# Number of titles that list each country.
# Countries are matched as whole comma-separated tokens. A str.contains()
# search treats the name as a regular expression and matches substrings, so
# a name that is contained in a longer one would be over-counted.
productions_per_country = {}
for entry in df["country"]:
    if not isinstance(entry, str):  # skip missing values
        continue
    # A set per row, so a title counts at most once per country.
    for name in {part.strip() for part in entry.split(',')} - {''}:
        productions_per_country[name] = productions_per_country.get(name, 0) + 1

countries_by_number_of_productions = {
    # Normalise the lookup key the same way as the row tokens, so names that
    # carry stray spaces or commas still resolve.
    country: productions_per_country.get(country.strip(' ,'), 0)
    for country in list_of_countries
}
len(df)

In [None]:
import geopandas as gp
import geodatasets as gd

# Read the ne_110m_admin_0_countries archive straight from its URL into a
# GeoDataFrame.
url = (
    "https://naciscdn.org/naturalearth/110m/cultural/"
    "ne_110m_admin_0_countries.zip"
)
world = gp.read_file(url)

In [None]:
# Productions per country, keyed by the column name used in the world shapes
# ("SOVEREIGNT") so the two tables can be merged on it.
countries_by_number_of_productions_df = pd.DataFrame(
    list(countries_by_number_of_productions.items()),
    columns=["SOVEREIGNT", "productions_number"],
)
# Rename by value, not by row position: the row order comes from iterating a
# set of strings, which is not stable across kernel restarts, so a fixed row
# number could rename a different country on another run.
countries_by_number_of_productions_df["SOVEREIGNT"] = (
    countries_by_number_of_productions_df["SOVEREIGNT"].replace(
        {"United States": "United States of America"}
    )
)
# Left merge keeps every world shape; shapes without a matching country get
# NaN in productions_number.
gdf = world.merge(countries_by_number_of_productions_df, how="left", on="SOVEREIGNT")

In [None]:
# World map coloured by productions_number; the colour range is fixed to
# 0-1000.
fig = px.choropleth(
    gdf,
    geojson=gdf.geometry,
    locations=gdf.index,
    color="productions_number",
    range_color=(0, 1000),
    width=1500,
    height=500,
)
fig.update_geos(visible=True, fitbounds="locations")
fig.update_layout(
    title_text='Number of movies/TV series produced by country',
    title_x=0.5,
)
fig.show()

In [None]:
# Inspect the merged rows whose SOVEREIGNT is 'United States of America'.
gdf.loc[gdf["SOVEREIGNT"].eq('United States of America')]

In [None]:
# Rows of the productions table that are labelled 'United States'.
countries_by_number_of_productions_df.loc[
    countries_by_number_of_productions_df["SOVEREIGNT"].eq('United States')
]