In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline

In [None]:
# Load the titles CSV into a DataFrame. The relative path assumes the notebook runs
# from a directory that sits next to `input/`.
netflix = pd.read_csv('../input/netflix-shows/netflix_titles.csv')

In [None]:
# Preview the first rows to check that the columns loaded as expected.
netflix.head()

In [None]:
# Column dtypes and non-null counts for the raw table.
netflix.info()

In [None]:
# Share of missing values in each column, expressed as a percentage of all rows.
netflix.isna().mean() * 100

In [None]:
# Drop four columns before the analysis; none of them is referenced by the cells below.
unused_columns = ['show_id', 'director', 'cast', 'description']
netflix = netflix.drop(columns=unused_columns)

In [None]:
# Drop every row that still has a missing value in any remaining column, so the
# string parsing in the next cell (which calls .lstrip() on each value) never sees NaN.
netflix = netflix.dropna()

In [None]:
def _leading_token(text):
    """Return the text before the first space, ignoring leading whitespace."""
    return text.lstrip().split(' ')[0]


def _trailing_token(text):
    """Return the text after the last space, ignoring leading whitespace."""
    return text.lstrip().split(' ')[-1]


# date_added: the first token is kept as the month label, the last token as the year (left as a string).
netflix['month_added'] = netflix['date_added'].map(_leading_token)
netflix['year_added'] = netflix['date_added'].map(_trailing_token)

# duration: the first token is the numeric amount (cast to int), the last token is its unit.
netflix['time'] = netflix['duration'].map(lambda text: int(_leading_token(text)))
netflix['unit'] = netflix['duration'].map(_trailing_token)

# EDA

In [None]:
# Count titles per content type and show the split as a pie chart.
data = netflix.groupby('type', as_index=False)['title'].count()

fig = px.pie(data, names='type', values='title')
fig.update_traces(textposition='inside', textinfo='label+percent')
fig.update_layout(title='Composition of Data', showlegend=False)
fig.show()

In [None]:
# Module-level accumulators filled by title_genre(): one entry per (title, genre) pair.
title = []
genre = []
types = []


def title_genre(df):
    """Record one (title, genre, type) entry for every genre listed on a single row.

    `df` is one row of the titles table (anything indexable by 'listed_in',
    'title' and 'type'). Its 'listed_in' value is split on ', ' and each piece
    is appended to the module-level lists above. Nothing is returned.
    """
    for genre_name in df['listed_in'].split(', '):
        genre.append(genre_name)
        title.append(df['title'])
        types.append(df['type'])

# Walk the rows with an explicit loop instead of DataFrame.apply: title_genre works purely
# by side effect, and apply() builds a throw-away Series of None and (on pandas < 1.1)
# evaluates the function twice for the first row, which would double-count that title.
for _, row in netflix.iterrows():
    title_genre(row)

# Long-form table: one row per (title, genre) pair.
genre_table = pd.DataFrame(data = {'Titles':title, 'Genre':genre, 'Type':types})
genre_table

In [None]:
# Number of titles per genre, movies only.
is_movie = genre_table['Type'] == 'Movie'
movie_data = genre_table[is_movie].groupby('Genre', as_index=False)['Titles'].count()

# The three highlighted genres get CornflowerBlue; every other bar falls back to grey.
highlight = dict.fromkeys(['Comedies', 'Dramas', 'International Movies'], 'CornflowerBlue')

fig = px.bar(
    movie_data,
    x='Genre',
    y='Titles',
    text='Titles',
    color='Genre',
    color_discrete_map=highlight,
    color_discrete_sequence=['grey'],
)
fig.update_traces(textposition='outside', texttemplate='%{text}')
fig.update_layout(title='Movie Genre', showlegend=False, xaxis=dict(title=None))
fig.show()

In [None]:
# Number of titles per genre, TV shows only.
is_tv_show = genre_table['Type'] == 'TV Show'
tv_show_data = genre_table[is_tv_show].groupby('Genre', as_index=False)['Titles'].count()

# The three highlighted genres get Coral; every other bar falls back to grey.
highlight = dict.fromkeys(['TV Comedies', 'TV Dramas', 'International TV Shows'], 'Coral')

fig = px.bar(
    tv_show_data,
    x='Genre',
    y='Titles',
    text='Titles',
    color='Genre',
    color_discrete_map=highlight,
    color_discrete_sequence=['grey'],
)
fig.update_traces(textposition='outside', texttemplate='%{text}')
fig.update_layout(title='TV Show Genre', showlegend=False, xaxis=dict(title=None))
fig.show()

In [None]:
# NOTE(review): `months` and `movie_added` were not defined by any earlier cell, so this
# cell raised NameError on a fresh kernel. They are reconstructed here from how the pivot
# uses them (one title count per month/year pair, rows ordered by calendar month, assuming
# English month names in `month_added`) -- confirm this matches the original intent.
months = ['January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December']
movie_added = (
    netflix[netflix['type'] == 'Movie']
    .groupby(['month_added', 'year_added'], as_index=False)['title']
    .count()
)

# Month x year grid of counts. Keyword arguments are used because pandas 2.0 no longer
# accepts positional arguments to DataFrame.pivot.
data = movie_added.pivot(index="month_added", columns="year_added", values="title").reindex(index = months)

fig = plt.figure(figsize = (8,10))
gs = fig.add_gridspec(4, 1)
ax1 =  fig.add_subplot(gs[0:3,:])   # top three rows of the grid: heatmap
ax2 =  fig.add_subplot(gs[3,:])     # bottom row: per-year totals

sns.heatmap(data, cmap =sns.light_palette("red"), annot=True, fmt ='.0f', ax = ax1, cbar=False)
# Widen the y-limits by half a cell at each end.
# NOTE(review): presumably a workaround for matplotlib 3.1.1 clipping the first and last
# heatmap rows -- confirm it is still needed with the installed version.
bottom, top = ax1.get_ylim()
ax1.set_ylim(bottom + 0.5, top - 0.5)

# Colour each year's bar by the rank of its total: the double argsort turns the totals
# into ranks, and the re-reversed palette runs light to dark, so larger totals are darker.
pal = sns.light_palette("red",n_colors =len(data.columns), reverse = True)
rank = data.sum().argsort().argsort()
sns.barplot(data = data, estimator = sum, dodge = False, ax = ax2, palette = np.array(pal[::-1])[rank])

As we can see, the number of movies added increased significantly every year.
<br>
October 2018 is the month with the most movies added.

In [None]:
# NOTE(review): `months` and `tv_show_added` were not defined by any earlier cell, so this
# cell raised NameError on a fresh kernel. They are reconstructed here from how the pivot
# uses them (one title count per month/year pair, rows ordered by calendar month, assuming
# English month names in `month_added`) -- confirm this matches the original intent.
months = ['January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December']
tv_show_added = (
    netflix[netflix['type'] == 'TV Show']
    .groupby(['month_added', 'year_added'], as_index=False)['title']
    .count()
)

# Month x year grid of counts. Keyword arguments are used because pandas 2.0 no longer
# accepts positional arguments to DataFrame.pivot.
data = tv_show_added.pivot(index="month_added", columns="year_added", values="title").reindex(index = months)

fig = plt.figure(figsize = (6,10))
gs = fig.add_gridspec(4, 1)
ax1 =  fig.add_subplot(gs[0:3,:])   # top three rows of the grid: heatmap
ax2 =  fig.add_subplot(gs[3,:])     # bottom row: per-year totals

sns.heatmap(data, cmap =sns.light_palette("orange"), annot = True, fmt ='.0f', ax = ax1, cbar=False)
# Widen the y-limits by half a cell at each end.
# NOTE(review): presumably a workaround for matplotlib 3.1.1 clipping the first and last
# heatmap rows -- confirm it is still needed with the installed version.
bottom, top = ax1.get_ylim()
ax1.set_ylim(bottom + 0.5, top - 0.5)

# Colour each year's bar by the rank of its total: the double argsort turns the totals
# into ranks, and the re-reversed palette runs light to dark, so larger totals are darker.
pal = sns.light_palette("orange",n_colors =len(data.columns), reverse = True)
rank = data.sum().argsort().argsort()
sns.barplot(data = data,  estimator = sum, dodge = False, ax = ax2, palette=np.array(pal[::-1])[rank])

In [None]:
# Aggregate into a separate frame rather than overwriting `genre_table`: the original
# reassignment replaced the long-form table with its own counts, so running this cell a
# second time counted the counts, and re-running the earlier bar-chart cells gave 1s.
genre_counts = genre_table.groupby(['Type', 'Genre'], as_index=False)['Titles'].count()

# Per-type slices, most frequent genre first, for the pie charts below.
movie_genre = genre_counts[genre_counts['Type'] == 'Movie'].sort_values('Titles', ascending = False)
tv_show_genre = genre_counts[genre_counts['Type'] == 'TV Show'].sort_values('Titles', ascending = False)

In [None]:
# Genre share among movies. Labels are drawn inside the slices and hidden
# wherever they would need a font smaller than 8.
fig = px.pie(
    movie_genre,
    names='Genre',
    values='Titles',
    title="Movie Genre",
    width=750,
)
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.update_layout(uniformtext=dict(minsize=8, mode='hide'))
fig.show()

In [None]:
# Genre share among TV shows. Labels are drawn inside the slices and hidden
# wherever they would need a font smaller than 8.
fig = px.pie(
    tv_show_genre,
    names='Genre',
    values='Titles',
    title="TV Show Genre",
    width=800,
)
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.update_layout(uniformtext=dict(minsize=8, mode='hide'))
fig.show()

In [None]:
# Number of titles for every (type, rating) combination, as a flat three-column frame.
rating = netflix.groupby(['type', 'rating'], as_index=False)['title'].count()

In [None]:
# Display the per-type rating counts that feed the sunburst chart below.
rating

In [None]:
# Two-level sunburst: inner ring is the content type, outer ring its ratings,
# with each wedge sized by the title count.
fig = px.sunburst(
    rating,
    path=['type', 'rating'],
    values='title',
    title="Rating",
    width=600,
)
fig.update_layout(uniformtext=dict(minsize=12, mode='hide'))
fig.show()

This time I used a sunburst chart, so we can see which rating has the most titles.
<br>
This works even if we don't separate them into Movies and TV Shows.

Let's move on to another topic: country.

In [None]:
# Title count per raw `country` value (multi-country strings are still unsplit here).
netflix.groupby('country')['title'].count()

The country column has the same issue as the listed_in (genre) column: one cell can hold several comma-separated values.
<br>
We just need to make another table for country.

In [None]:
# Module-level accumulators filled by title_country(): one entry per (title, country) pair.
title = []
country = []
types = []


def title_country(df):
    """Record one (title, country, type) entry for every country listed on a single row.

    `df` is one row of the titles table (anything indexable by 'country',
    'title' and 'type'). Its 'country' value is split on ', ' and each piece
    is appended to the module-level lists above. Nothing is returned.
    """
    for country_name in df['country'].split(', '):
        country.append(country_name)
        title.append(df['title'])
        types.append(df['type'])

In [None]:
# Start from empty accumulators so re-running this cell cannot append every row a second time.
for accumulator in (title, country, types):
    accumulator.clear()

# Explicit loop instead of DataFrame.apply: title_country works purely by side effect, and
# apply() would display a Series of None and (on pandas < 1.1) evaluate the function twice
# for the first row, double-counting that title.
for _, row in netflix.iterrows():
    title_country(row)

In [None]:
# Long-form table built from the accumulators: one row per (title, country) pair.
country_columns = {
    'Titles': title,
    'Country': country,
    'Type': types,
}
country_table = pd.DataFrame(country_columns)
country_table

That's it: each row now pairs a title with a single country.
<br>
The question is which country contributes the most Movies and TV Shows to Netflix.

In [None]:
# Number of titles for every (type, country) combination.
country_counted = country_table.groupby(['Type', 'Country'], as_index=False)['Titles'].count()
country_counted

First we count the titles for each type and country.
<br>
Next, we create two columns, Movie and TV Show, so we know how many Movies and TV Shows each country has.

In [None]:
# Reshape to one row per country with a separate count column for each type.
# Countries missing one of the types get NaN in that column.
by_country = pd.pivot(country_counted, index='Country', columns='Type', values='Titles')
country_counted = by_country.reset_index()
country_counted

In [None]:
# Treat a missing type as zero titles, then add the combined count per country.
country_counted = (
    country_counted
    .fillna(0)
    .assign(Total=lambda counts: counts['Movie'] + counts['TV Show'])
)
country_counted

Then we create a Total column by adding the Movie and TV Show columns.

In [None]:
# Rank countries by total title count and keep the first 22 rows.
ranked_by_total = country_counted.sort_values(by='Total', ascending=False)
Top_22_country = ranked_by_total.iloc[:22]
Top_22_country

And then we take the top 22.
<br>
Why 22? Because my country is at that rank. Yep, that's INDONESIA.

In [None]:
f, ax = plt.subplots(figsize=(14, 8))

# Two overlaid horizontal bars per country: the full-length bar is the total, and the
# Movie bar is drawn on top of it, so the part left showing represents the TV Show count.
bar_layers = [
    ("Total", "TV Show", "g"),
    ("Movie", "Movie", "Turquoise"),
]
for value_column, legend_label, bar_color in bar_layers:
    sns.barplot(
        x=value_column,
        y='Country',
        data=Top_22_country,
        label=legend_label,
        color=bar_color,
        ax=ax,
    )

ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(ylabel="", xlabel="", title="Top 22 Country")
sns.despine(left=True, bottom=True)

Finally, we make a fancy visualization.
<br>
As we can see, the USA has a large number of TV Shows and Movies.

Last, we will look at the distribution of duration.
<br>
This one is also easy: we just separate the unit from the time.
<br>
Movies are measured in minutes, and TV Shows are measured in seasons.

In [None]:
# Split each duration string on spaces: the first token is the numeric amount, the last
# token its unit. (The same two columns are also derived in the earlier cell that adds
# month_added / year_added.)
duration_tokens = [text.lstrip().split(' ') for text in netflix['duration']]
netflix['time'] = [int(tokens[0]) for tokens in duration_tokens]
netflix['unit'] = [tokens[-1] for tokens in duration_tokens]

In [None]:
# Distribution of movie running times, with a box plot in the margin to expose outliers.
movies = netflix[netflix['type'] == 'Movie']

fig = px.histogram(
    movies,
    x="time",
    nbins=40,
    marginal="box",
    title="Movie's Duration",
    color_discrete_sequence=['SlateGrey'],
)
fig.show()

That's the distribution: durations of around 90-99 minutes are the most common.
<br>
There is also one movie that runs 312 minutes. That's so loooonnnng.

In [None]:
# Look up the title behind the 312-minute outlier seen in the histogram.
is_312_min = netflix['duration'] == '312 min'
netflix[is_312_min]

Does anyone know that movie?
<br>
I have never watched it. Interesting...

In [None]:
# Number of TV shows at each season count (for TV shows, `time` holds the season number).
tv_shows = netflix[netflix['type'] == 'TV Show']

f, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=tv_shows, x='time', palette='pastel', ax=ax)
ax.set(xlabel='Seasons', title="TV Show's Seasons")

And for TV Shows, OMG, why did they make 16 seasons?!
<br>
Let me check those TV Shows.

In [None]:
# Select the show(s) with the highest season count straight from the data instead of
# matching a hard-coded '16 Seasons' string, which goes stale (and silently returns an
# empty frame) whenever the dataset is refreshed.
tv_shows = netflix[netflix['type'] == 'TV Show']
tv_shows[tv_shows['time'] == tv_shows['time'].max()]

OK, those 2 TV Shows look good.
<br>
I will add them to my watchlist.

Alright, that's all for now.
<br>
Thank you for taking the time to look at my notebook.
<br>
I'm sorry for the terrible descriptions. I hope you can still understand my visualizations.
<br>
Feel free to comment on anything in this notebook.
<br>
Thank you very much.