**This whole notebook is heavily inspired by [a notebook created by Joshua Swords](https://www.kaggle.com/joshuaswords/netflix-data-visualization). However, I wrote the plotting code somewhat differently; I only took inspiration from his notebook on what to show from the dataset.**
**I created this notebook as a kind of cheat sheet and for learning purposes.**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the Netflix titles CSV from the relative input directory into `df`.
df = pd.read_csv('../input/netflix-shows/netflix_titles.csv')

# Print (rows, columns) so the effect of the later cleaning step can be compared.
print('df.shape', df.shape)

# Preview the first three rows.
df.head(3)

In [None]:
print('\nnull_percent')

# Share of missing values per column, in percent, rounded to two decimals.
# The mean of a boolean mask is exactly (number of nulls) / (number of rows).
null_percent = df.isna().mean().mul(100).round(2)
for column_name, column_percent in null_percent.items():
    print(column_name, column_percent)

In [None]:
# Missing director/cast become the placeholder 'No Data', missing country
# becomes the most frequent country, and any row that still has a null in
# some other column is dropped.
df = (
    df.assign(
        director=df['director'].fillna('No Data'),
        cast=df['cast'].fillna('No Data'),
        country=df['country'].fillna(df['country'].mode()[0]),
    )
    .dropna()
)

print('df.shape', df.shape)

# Per-column count of remaining nulls.
df.isna().sum()

In [None]:
# Parse 'date_added' into datetimes.
# pandas >= 2.0 infers a single format from the first value and raises on rows
# that do not match it, so surrounding whitespace is stripped first.
# NOTE(review): assumes some rows carry stray leading/trailing spaces in the
# date text - confirm against the CSV; stripping is harmless if none do.
# The dtype guard keeps the cell safe to re-run once the column is parsed
# (the .str accessor is not available on datetime columns).
if not pd.api.types.is_datetime64_any_dtype(df['date_added']):
    df['date_added'] = pd.to_datetime(df['date_added'].str.strip())

# Content ratings distribution

In [None]:
# Ratings ordered from most to least frequent across the whole catalogue.
rating_order = df['rating'].value_counts().index

# Rows: content type; columns: rating (in the order above); cells: title counts.
count_rating = (
    df.groupby('type')['rating']
    .value_counts()
    .unstack()
    .fillna(0)
    .astype('int')
)[rating_order]
count_rating

In [None]:
def remove_border(ax=None):
    """Hide all four spines (the surrounding frame) of a Matplotlib axes.

    Parameters
    ----------
    ax : matplotlib.axes.Axes, optional
        Axes to modify. When omitted, the current axes (``plt.gca()``) is
        used, so a plain ``remove_border()`` right after ``plt.subplots()``
        keeps working without relying on a notebook-global ``ax``.
    """
    if ax is None:
        ax = plt.gca()
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
        
def annotate_bar(series, flip, yoff=0, ya='center', xa='center', ax=None):
    """Write each value of ``series`` as a text label next to its bar.

    Parameters
    ----------
    series : pandas.Series
        Bar heights; the index supplies the x position of every label.
    flip : int or float
        Factor applied to the value before it is rendered as text. Pass ``-1``
        for a series that was negated for plotting so the label shows the
        original positive number.
    yoff : int or float, default 0
        Offset added to the value to obtain the label's y position.
    ya, xa : str, default 'center'
        Vertical and horizontal alignment forwarded to ``Axes.annotate`` as
        ``va`` and ``ha``.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted, the current axes (``plt.gca()``) is
        used instead of relying on a notebook-global ``ax``.
    """
    if ax is None:
        ax = plt.gca()
    # items() pairs each label with its own value, so duplicate index labels
    # do not turn the lookup into a sub-series.
    for position, value in series.items():
        ax.annotate(f'{value * flip}', xy=(position, value + yoff), va=ya, ha=xa)

In [None]:
# Movie counts are drawn upward; TV-show counts are negated so their bars
# grow downward from the same baseline.
count_rat_mo = count_rating.loc['Movie']
count_rat_tv = -count_rating.loc['TV Show']

fig, ax = plt.subplots(figsize=(12, 6))
for series_label, series_counts in (('Movie', count_rat_mo), ('TV Show', count_rat_tv)):
    ax.bar(series_counts.index, series_counts, label=series_label)

ax.set_yticks([])
ax.legend(frameon=False)

remove_border()
# flip=-1 turns the negated TV-show values back into positive labels.
annotate_bar(count_rat_mo, flip=1, yoff=20, ya='bottom', xa='center')
annotate_bar(count_rat_tv, flip=-1, yoff=-20, ya='top', xa='center')

# How content has been added over time

In [None]:
# Calendar year in which each title was added.
df['year_added'] = df['date_added'].dt.year

# Rows: content type; columns: year added; cells: number of titles.
count_yradd = (
    df.groupby('type')['year_added']
    .value_counts()
    .unstack()
    .fillna(0)
    .astype('int')
)
count_yradd

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# One line per content type, titles added per year.
for content_type in ('Movie', 'TV Show'):
    ax.plot(count_yradd.loc[content_type], label=content_type)

ax.set_xticks(count_yradd.columns)
ax.yaxis.tick_right()
# 'center left' is the named form of location code 6.
ax.legend(loc='center left', frameon=False)

remove_border()

# Overview of types of content from different countries

As we can see, a title can have more than one country of origin. Therefore, if a title lists more than one country in the 'country' feature, it is counted once for each of those countries.

In [None]:
# Disabled draft: the code below sits inside a string literal, so none of it
# executes (the cell only evaluates the string). It collected the distinct,
# whitespace-stripped country names from df['country'], skipping tokens of
# length <= 1.
'''country_list = list()

for i in df['country']:
    i = i.split(',')
    for country in i:
        country = country.strip()
        if country not in country_list and len(country) > 1:
            country_list.append(country)'''

In [None]:
# Disabled draft (string literal, nothing here executes): for every name in
# country_list it counted Movie and TV Show rows whose 'country' text matches
# via Series.str.contains, then pivoted to a type x country table sorted by
# total count.
# NOTE(review): str.contains is a substring/regex match, so a name contained
# in a longer country name would presumably be counted for both - verify
# before reviving this version.
'''country_mo = df['country'][df['type'] == 'Movie']
count_country_mo = list()

for i in country_list:
    count = country_mo.str.contains(i).sum()
    count_country_mo.append([i, count])   

country_tv = df['country'][df['type'] == 'TV Show']
count_country_tv = list()

for i in country_list:
    count = country_tv.str.contains(i).sum()
    count_country_tv.append([i, count])

count_country_mo_df = pd.DataFrame(count_country_mo).rename(columns={0: 'country', 1: 'count'})
count_country_mo_df['type'] = 'Movie'

count_country_tv_df = pd.DataFrame(count_country_tv).rename(columns={0: 'country', 1: 'count'})
count_country_tv_df['type'] = 'TV Show'

count_country = pd.concat([count_country_mo_df,count_country_tv_df])
count_country = count_country.pivot(index='type', columns='country', values='count')

country_order = count_country.sum().sort_values(ascending=False).index

count_country = count_country[country_order]
count_country'''

In [None]:
# Disabled draft (string literal, nothing here executes): tallied countries in
# two separate dicts, one for Movie rows and one for TV Show rows, by splitting
# each 'country' value on commas and stripping the tokens, then combined both
# into a type x country table sorted by total count. The active cell that
# follows does the same with a single loop over df['type'].unique().
'''country_mo = df['country'][df['type'] == 'Movie']
count_country_mo = dict()

for i in country_mo:
    i = i.split(',')
    for country in i:
        country = country.strip()
        count_country_mo[country] = count_country_mo.get(country, 0) + 1

country_tv = df['country'][df['type'] == 'TV Show']
count_country_tv = dict()
        
for i in country_tv:
    i = i.split(',')
    for country in i:
        country = country.strip()
        count_country_tv[country] = count_country_tv.get(country, 0) + 1
        
count_country_mo_df = pd.DataFrame(count_country_mo, index=['Movie'])
count_country_tv_df = pd.DataFrame(count_country_tv, index=['TV Show'])
count_country = pd.concat([count_country_mo_df, count_country_tv_df])

country_order = count_country.sum().sort_values(ascending=False).index
count_country = count_country.fillna(0).astype(int)[country_order]
count_country'''

In [None]:
# Tally titles per country for each content type. A title that lists several
# comma-separated countries is counted once for every one of them.
count_country_dict = dict()

for content_type in df['type'].unique():
    type_dict = dict()
    for country_field in df.loc[df['type'] == content_type, 'country']:
        for country in map(str.strip, country_field.split(',')):
            # A stray leading/trailing comma in the field yields an empty
            # token; skip it so '' is not tallied as a country.
            if country:
                type_dict[country] = type_dict.get(country, 0) + 1
    count_country_dict[content_type] = type_dict

# Rows: content type; columns: country. Countries absent for a type become 0.
count_country = (
    pd.DataFrame(count_country_dict)
    .transpose()
    .fillna(0)
    .astype(int)
    .rename_axis(index='type', columns='country')
)

# Countries ordered by total number of titles, largest first.
country_order = count_country.sum().sort_values(ascending=False).index

count_country = count_country[country_order]
count_country

In [None]:
# Despite the name, this is not an axis limit: it holds the first ten entries
# of country_order (countries sorted by total title count, largest first), and
# later cells use it to select those country columns.
xlim = country_order[:10]

In [None]:
# Restrict to the selected countries; TV-show counts are negated so their
# bars grow downward from the same baseline as the movie bars.
count_country_mo = count_country.loc['Movie'][xlim]
count_country_tv = -count_country.loc['TV Show'][xlim]

fig, ax = plt.subplots(figsize=(12, 6))
for series_label, series_counts in (('Movie', count_country_mo), ('TV Show', count_country_tv)):
    ax.bar(series_counts.index, series_counts, label=series_label)

ax.set_yticks([])
ax.legend(frameon=False)

remove_border()
# flip=-1 turns the negated TV-show values back into positive labels.
annotate_bar(count_country_mo, flip=1, yoff=20, ya='bottom', xa='center')
annotate_bar(count_country_tv, flip=-1, yoff=-20, ya='top', xa='center')

In [None]:
# Per-country split between content types, in percent (each column of the
# unrounded table sums to 100), limited to the selected countries.
count_country_percent = (
    count_country
    .div(count_country.sum())
    .mul(100)
    .round(2)
    .rename_axis(index='type', columns='country')
)[xlim]

# Order the countries by ascending movie share.
percent_order = count_country_percent.loc['Movie'].sort_values().index

count_country_percent = count_country_percent[percent_order]
count_country_percent

In [None]:
percent_mo = count_country_percent.loc['Movie']
percent_tv = count_country_percent.loc['TV Show']

fig, ax = plt.subplots(figsize=(12, 6))
# Stacked horizontal bars: the TV-show segment starts where the movie segment
# ends. Both are labelled so the legend has entries to show (an unlabelled
# plot makes ax.legend() warn and draw nothing).
ax.barh(percent_mo.index, percent_mo, label='Movie')
ax.barh(percent_tv.index, percent_tv, left=percent_mo, label='TV Show')

ax.set_xticks([])
# The bars fill the full width of the axes, so the legend goes above them.
ax.legend(frameon=False, loc='lower center', bbox_to_anchor=(0.5, 1.0), ncol=2)

remove_border()

# Movie share, centred in the movie segment.
for country, share in percent_mo.items():
    ax.annotate(f'{share}%', xy=(share / 2, country), va='center', ha='center', color='#ffffff')

# TV-show share, centred in the segment as drawn (left edge + half its width)
# rather than assuming the two rounded shares add up to exactly 100.
for country, share in percent_tv.items():
    ax.annotate(f'{share}%', xy=(percent_mo[country] + share / 2, country), va='center', ha='center', color='#ffffff')

In [None]:
# Turn the comma-separated 'listed_in' text into a list of genre tokens.
df['genre'] = df['listed_in'].apply(lambda listed: listed.split(','))

# Number of titles per genre; tokens are stripped before counting.
genre_dict = dict()

for genres_of_title in df['genre']:
    for genre in map(str.strip, genres_of_title):
        genre_dict[genre] = genre_dict.get(genre, 0) + 1

In [None]:
# Rating code -> target age group. Ratings missing from this mapping are left
# unchanged by replace().
age_groups = {
    'TV-PG': 'Older Kids',
    'TV-MA': 'Adults',
    'TV-Y7-FV': 'Older Kids',
    'TV-Y7': 'Older Kids',
    'TV-14': 'Teens',
    'R': 'Adults',
    'TV-Y': 'Kids',
    'NR': 'Adults',
    'PG-13': 'Teens',
    'TV-G': 'Kids',
    'PG': 'Older Kids',
    'G': 'Kids',
    'UR': 'Adults',
    'NC-17': 'Adults',
}
df['target_age_group'] = df['rating'].replace(age_groups)

# Tally titles per country within each age group; a title listing several
# comma-separated countries is counted once for each of them.
age_country_dict = dict()
for age_group in df['target_age_group'].unique():
    age_dict = dict()
    countries_in_group = df.loc[df['target_age_group'] == age_group, 'country']
    for country_field in countries_in_group:
        for country in map(str.strip, country_field.split(',')):
            age_dict[country] = age_dict.get(country, 0) + 1
    age_country_dict[age_group] = age_dict

# Rows: target age group; columns: the countries selected in xlim.
age_country = (
    pd.DataFrame(age_country_dict)
    .transpose()
    .fillna(0)
    .astype(int)
    .rename_axis(index='target_age_group', columns='country')
)[xlim]
age_country

In [None]:
# Column-wise shares: each country's titles split across the target age
# groups, as fractions rounded to two decimals.
age_country_percent = age_country.div(age_country.sum(), axis='columns').round(2)
age_country_percent

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))
# The shares are already rounded to two decimals as fractions (whole
# percents), so '.0%' shows them without a spurious '.00'. Passing ax=ax
# draws on the axes created above instead of relying on the implicit
# current axes.
sns.heatmap(age_country_percent, square=True, annot=True, fmt='.0%', ax=ax)

# Keep the age-group labels horizontal.
plt.yticks(rotation=0)

# The axis names add nothing next to the tick labels; the trailing semicolon
# keeps the notebook from echoing the returned Text object.
ax.set_ylabel('')
ax.set_xlabel('');