In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.text import Text
from matplotlib.lines import Line2D
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.patches import Rectangle, Polygon
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from wordcloud import WordCloud, STOPWORDS

import geopandas as gpd
from shapely.geometry import Point, LineString

import folium
from folium import Marker, GeoJson
from folium.plugins import MarkerCluster, HeatMap

# Get & Prepare Data

In [None]:
# Load the Netflix titles catalogue from the Kaggle input mount and preview
# the first rows.
df=pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')
df.head()

In [None]:
# Column dtypes and non-null counts: shows which columns contain missing values.
df.info()

In [None]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (count / unique / top / freq) to the numeric summary.
df.describe(include='all')

In [None]:
# Check for duplicates in the show_id column before it is used as the index.
# True means at least one ID occurs more than once.
show_id_dupl = not df['show_id'].is_unique
show_id_dupl

In [None]:
# Use show_id as the row index. The guard keeps the cell safe to re-run: once
# show_id has been moved into the index it is no longer a column, and calling
# set_index again would raise KeyError.
if 'show_id' in df.columns:
    df = df.set_index('show_id')
df.head()

# Top Actors/Actresses

In [None]:
# Top Actors/Actresses
# Keep only the cast string and the content type, dropping titles with no cast
# listed. dropna() is chained on the selection instead of being applied in
# place to a slice of df, which avoids pandas' chained-assignment warning.
actor_df = df[['cast', 'type']].dropna(subset=['cast'])
actor_df

In [None]:
# Split the comma-separated cast string into one column per position
# (0, 1, 2, ...); titles with fewer names are padded with missing values.
# Surrounding spaces are not removed here; they are stripped after the melt.
new_actor_df=actor_df['cast'].str.split(',', expand=True)
new_actor_df

In [None]:
# Reshape to long format: one row per (title, cast position) with the content
# type carried along. 'variable' holds the position, 'value' the stripped name.
actor_wide = pd.concat([actor_df, new_actor_df], axis=1).drop(columns=['cast'])
value = list(actor_wide.columns[1:])  # every split column; column 0 is 'type'
splited_actor_df = pd.melt(actor_wide, id_vars=['type'], value_vars=value).assign(
    value=lambda long_df: long_df['value'].str.strip()
)
splited_actor_df

In [None]:
# Missing-value report before cleaning. isnull() and isna() are aliases, so
# both sections print the same per-column counts.
actor_isnull_counts = splited_actor_df.isnull().sum(axis=0)
actor_isna_counts = splited_actor_df.isna().sum(axis=0)
print('null value\n', actor_isnull_counts, '\n')
print('NaN value\n', actor_isna_counts, '\n')
print(splited_actor_df.shape, '\n')

In [None]:
# Discard the padding rows produced by the split (positions with no name).
splited_actor_df = splited_actor_df.dropna()
splited_actor_df

In [None]:
# Same missing-value report after dropna(): every count should now be zero.
actor_isnull_counts = splited_actor_df.isnull().sum(axis=0)
actor_isna_counts = splited_actor_df.isna().sum(axis=0)
print('null value\n', actor_isnull_counts, '\n')
print('NaN value\n', actor_isna_counts, '\n')
print(splited_actor_df.shape, '\n')

In [None]:
# Top 5 movie performers by number of credits, ordered ascending so the
# largest bar ends up last in the horizontal bar chart.
movie_credits = splited_actor_df.loc[splited_actor_df['type'] == 'Movie', ['variable', 'value']]
top_m = (
    movie_credits
    .groupby(by='value')
    .count()
    .sort_values(by='variable', ascending=False)
    .head(5)
    .sort_values(by='variable', ascending=True)
)

# Colour the leader (and anyone tied with the leader) dark red, the rest grey.
top_m_max = top_m['variable'].max()
top_m = top_m.assign(
    color=np.where(top_m['variable'] == top_m_max, 'darkred', 'grey')
).reset_index()

top_m

In [None]:
# Top 5 TV-show performers by number of credits, ordered ascending so the
# largest bar ends up last in the horizontal bar chart.
show_credits = splited_actor_df.loc[splited_actor_df['type'] == 'TV Show', ['variable', 'value']]
top_s = (
    show_credits
    .groupby(by='value')
    .count()
    .sort_values(by='variable', ascending=False)
    .head(5)
    .sort_values(by='variable', ascending=True)
)

# Colour the leader (and anyone tied with the leader) dark red, the rest grey.
top_s_max = top_s['variable'].max()
top_s = top_s.assign(
    color=np.where(top_s['variable'] == top_s_max, 'darkred', 'grey')
).reset_index()

top_s

In [None]:
# Actor dashboard: a circular word cloud of the 100 most-credited performers
# (left) beside top-5 bar charts for movies and TV shows (right).
# Uses splited_actor_df, top_m and top_s built in the cells above.

# Credits per performer across both content types, 100 most frequent first.
# Counts are raised to the 4th power, presumably to exaggerate size differences
# in the cloud. assign() is used because head() returns a slice and writing a
# column into it triggers pandas' chained-assignment warning.
text = (
    splited_actor_df[['variable', 'value']]
    .groupby(by='value')
    .count()
    .sort_values(by='variable', ascending=False)
    .head(100)
    .assign(variable=lambda counts: counts['variable'] ** 4)
    .reset_index()
)

# Circular mask on a 1000x1000 grid: 255 outside a radius-400 circle centred
# at (500, 500), 0 inside, so words are only drawn inside the circle.
x, y = np.ogrid[:1000, :1000]
mask = (x - 500) ** 2 + (y - 500) ** 2 > 400 ** 2
mask = 255 * mask.astype(int)

# Word cloud sized by the weighted counts ({name: weight} mapping).
wc = WordCloud(
    background_color='black',
    random_state=1,
    contour_color='Red',
    colormap='Reds_r',
    mask=mask,
    collocations=False,
).generate_from_frequencies(dict(text.values.tolist()))

fig = plt.figure(figsize=(20, 10), facecolor='black')
fig.suptitle('Netflix Actors Analysis', color='white', fontweight='bold', fontsize=20)
gs = fig.add_gridspec(2, 2)
# NOTE(review): tight_layout() runs before any axes exist, so it most likely
# has no effect here; consider calling it after the subplots are created.
plt.tight_layout()
ax = [None for _ in range(3)]

# Left column (both rows): the word cloud.
ax[0] = fig.add_subplot(gs[:, 0])
ax[0].imshow(wc, interpolation="bilinear")
ax[0].axis("off")

# Top right: top 5 movie performers.
ax[1] = fig.add_subplot(gs[0, 1])
ax[1].barh(y=top_m['value'], width=top_m['variable'], align='center', color=top_m['color'])

# The frame index was reset to 0..4, which matches the bar positions, so the
# row index is used as the y coordinate of each label.
for i, row in top_m.iterrows():
    ax[1].text(x=0.1, y=i, s=row['value']+': '+str(row['variable'])+' Movies', fontweight='bold', color='white')

ax[1].set_title('\nTop 5 Movie Actors', color='darkred', fontweight='bold', fontsize=15)
ax[1].axis("off")

# Bottom right: top 5 TV-show performers.
ax[2] = fig.add_subplot(gs[1, 1])
ax[2].barh(y=top_s['value'], width=top_s['variable'], align='center', color=top_s['color'])

for i, row in top_s.iterrows():
    ax[2].text(x=0.1, y=i, s=row['value']+': '+str(row['variable'])+' TV-Shows', fontweight='bold', color='white')

ax[2].set_title('\nTop 5 TV-Show Actors', color='darkred', fontweight='bold', fontsize=15)
ax[2].axis("off")

plt.show()

# Top Directors

In [None]:
# Top Director
# Keep only the director string and the content type, dropping titles with no
# director listed. dropna() is chained on the selection instead of being
# applied in place to a slice of df, which avoids pandas' chained-assignment
# warning.
director_df = df[['director', 'type']].dropna(subset=['director'])
director_df

In [None]:
# Split the comma-separated director string into one column per position
# (0, 1, 2, ...); titles with fewer names are padded with missing values.
# Surrounding spaces are not removed here; they are stripped after the melt.
new_director_df=director_df['director'].str.split(',', expand=True)
new_director_df

In [None]:
# Reshape to long format: one row per (title, director position) with the
# content type carried along. 'variable' holds the position, 'value' the
# stripped name.
director_wide = pd.concat([director_df, new_director_df], axis=1).drop(columns=['director'])
value = list(director_wide.columns[1:])  # every split column; column 0 is 'type'
splited_director_df = pd.melt(director_wide, id_vars=['type'], value_vars=value).assign(
    value=lambda long_df: long_df['value'].str.strip()
)
splited_director_df

In [None]:
# Missing-value report before cleaning. isnull() and isna() are aliases, so
# both sections print the same per-column counts.
director_isnull_counts = splited_director_df.isnull().sum(axis=0)
director_isna_counts = splited_director_df.isna().sum(axis=0)
print('null value\n', director_isnull_counts, '\n')
print('NaN value\n', director_isna_counts, '\n')
print(splited_director_df.shape, '\n')

In [None]:
# Discard the padding rows produced by the split (positions with no name).
splited_director_df = splited_director_df.dropna()
splited_director_df

In [None]:
# Same missing-value report after dropna(): every count should now be zero.
director_isnull_counts = splited_director_df.isnull().sum(axis=0)
director_isna_counts = splited_director_df.isna().sum(axis=0)
print('null value\n', director_isnull_counts, '\n')
print('NaN value\n', director_isna_counts, '\n')
print(splited_director_df.shape, '\n')

In [None]:
# Top 5 movie directors by number of titles, ordered ascending so the largest
# bar ends up last in the horizontal bar chart.
movie_directors = splited_director_df.loc[splited_director_df['type'] == 'Movie', ['variable', 'value']]
top_d_m = (
    movie_directors
    .groupby(by='value')
    .count()
    .sort_values(by='variable', ascending=False)
    .head(5)
    .sort_values(by='variable', ascending=True)
)

# Colour the leader (and anyone tied with the leader) dark red, the rest grey.
top_d_m_max = top_d_m['variable'].max()
top_d_m = top_d_m.assign(
    color=np.where(top_d_m['variable'] == top_d_m_max, 'darkred', 'grey')
).reset_index()

top_d_m

In [None]:
# Top 5 TV-show directors by number of titles, ordered ascending so the
# largest bar ends up last in the horizontal bar chart.
show_directors = splited_director_df.loc[splited_director_df['type'] == 'TV Show', ['variable', 'value']]
top_d_s = (
    show_directors
    .groupby(by='value')
    .count()
    .sort_values(by='variable', ascending=False)
    .head(5)
    .sort_values(by='variable', ascending=True)
)

# Colour the leader (and anyone tied with the leader) dark red, the rest grey.
top_d_s_max = top_d_s['variable'].max()
top_d_s = top_d_s.assign(
    color=np.where(top_d_s['variable'] == top_d_s_max, 'darkred', 'grey')
).reset_index()

top_d_s

In [None]:
# Director dashboard: a circular word cloud of the 100 most frequent directors
# (left) beside top-5 bar charts for movies and TV shows (right).
# Uses splited_director_df, top_d_m and top_d_s built in the cells above.

# Titles per director across both content types, 100 most frequent first.
# Counts are raised to the 4th power, presumably to exaggerate size differences
# in the cloud. assign() is used because head() returns a slice and writing a
# column into it triggers pandas' chained-assignment warning.
text_d = (
    splited_director_df[['variable', 'value']]
    .groupby(by='value')
    .count()
    .sort_values(by='variable', ascending=False)
    .head(100)
    .assign(variable=lambda counts: counts['variable'] ** 4)
    .reset_index()
)

# Circular mask on a 1000x1000 grid: 255 outside a radius-400 circle centred
# at (500, 500), 0 inside, so words are only drawn inside the circle.
x, y = np.ogrid[:1000, :1000]
mask = (x - 500) ** 2 + (y - 500) ** 2 > 400 ** 2
mask = 255 * mask.astype(int)

# Word cloud sized by the weighted counts ({name: weight} mapping).
wc_d = WordCloud(
    background_color='black',
    random_state=1,
    contour_color='Red',
    colormap='Reds_r',
    mask=mask,
    collocations=False,
).generate_from_frequencies(dict(text_d.values.tolist()))

fig = plt.figure(figsize=(20, 10), facecolor='black')
fig.suptitle('Netflix Directors Analysis', color='white', fontweight='bold', fontsize=20)
gs = fig.add_gridspec(2, 2)
# NOTE(review): tight_layout() runs before any axes exist, so it most likely
# has no effect here; consider calling it after the subplots are created.
plt.tight_layout()
ax = [None for _ in range(3)]

# Left column (both rows): the word cloud.
ax[0] = fig.add_subplot(gs[:, 0])
ax[0].imshow(wc_d, interpolation="bilinear")
ax[0].axis("off")

# Top right: top 5 movie directors.
ax[1] = fig.add_subplot(gs[0, 1])
ax[1].barh(y=top_d_m['value'], width=top_d_m['variable'], align='center', color=top_d_m['color'])

# The frame index was reset to 0..4, which matches the bar positions, so the
# row index is used as the y coordinate of each label.
for i, row in top_d_m.iterrows():
    ax[1].text(x=0.1, y=i, s=row['value']+': '+str(row['variable'])+' Movies', fontweight='bold', color='white')

ax[1].set_title('\nTop 5 Movie Directors', color='darkred', fontweight='bold', fontsize=15)
ax[1].axis("off")

# Bottom right: top 5 TV-show directors.
ax[2] = fig.add_subplot(gs[1, 1])
ax[2].barh(y=top_d_s['value'], width=top_d_s['variable'], align='center', color=top_d_s['color'])

for i, row in top_d_s.iterrows():
    ax[2].text(x=0.1, y=i, s=row['value']+': '+str(row['variable'])+' TV-Shows', fontweight='bold', color='white')

ax[2].set_title('\nTop 5 TV-Show Directors', color='darkred', fontweight='bold', fontsize=15)
ax[2].axis("off")

plt.show()

# Top Genres

In [None]:
# Top Genres
# Keep only the genre string ('listed_in') and the content type, dropping
# titles with no genre listed. dropna() is chained on the selection instead of
# being applied in place to a slice of df, which avoids pandas'
# chained-assignment warning.
genres_df = df[['listed_in', 'type']].dropna(subset=['listed_in'])
genres_df

In [None]:
# Split the comma-separated genre string into one column per position
# (0, 1, 2, ...); titles with fewer genres are padded with missing values.
# Surrounding spaces are not removed here; they are stripped after the melt.
new_genres_df=genres_df['listed_in'].str.split(',', expand=True)
new_genres_df

In [None]:
# Reshape to long format: one row per (title, genre position) with the content
# type carried along. 'variable' holds the position, 'value' the stripped genre.
genres_wide = pd.concat([genres_df, new_genres_df], axis=1).drop(columns=['listed_in'])
# After dropping 'listed_in' the columns are ['type', 0, 1, ...], so the split
# columns start at position 1. Starting at 2 would leave out column 0, i.e.
# the first-listed genre of every title, and skew all genre counts.
value = list(genres_wide.columns[1:])
splited_genres_df = pd.melt(genres_wide, id_vars=['type'], value_vars=value)
splited_genres_df['value'] = splited_genres_df['value'].str.strip()
splited_genres_df

In [None]:
# Missing-value report before cleaning. isnull() and isna() are aliases, so
# both sections print the same per-column counts.
genres_isnull_counts = splited_genres_df.isnull().sum(axis=0)
genres_isna_counts = splited_genres_df.isna().sum(axis=0)
print('null value\n', genres_isnull_counts, '\n')
print('NaN value\n', genres_isna_counts, '\n')
print(splited_genres_df.shape, '\n')

In [None]:
# Discard the padding rows produced by the split (positions with no genre).
splited_genres_df = splited_genres_df.dropna()
splited_genres_df

In [None]:
# Same missing-value report after dropna(): every count should now be zero.
genres_isnull_counts = splited_genres_df.isnull().sum(axis=0)
genres_isna_counts = splited_genres_df.isna().sum(axis=0)
print('null value\n', genres_isnull_counts, '\n')
print('NaN value\n', genres_isna_counts, '\n')
print(splited_genres_df.shape, '\n')

In [None]:
# Top 5 movie genres by number of titles, ordered ascending so the largest
# bar ends up last in the horizontal bar chart.
movie_genres = splited_genres_df.loc[splited_genres_df['type'] == 'Movie', ['variable', 'value']]
top_g_m = (
    movie_genres
    .groupby(by='value')
    .count()
    .sort_values(by='variable', ascending=False)
    .head(5)
    .sort_values(by='variable', ascending=True)
)

# Colour the leader (and anything tied with the leader) dark red, the rest grey.
top_g_m_max = top_g_m['variable'].max()
top_g_m = top_g_m.assign(
    color=np.where(top_g_m['variable'] == top_g_m_max, 'darkred', 'grey')
).reset_index()

top_g_m

In [None]:
# Top 5 TV-show genres by number of titles, ordered ascending so the largest
# bar ends up last in the horizontal bar chart.
show_genres = splited_genres_df.loc[splited_genres_df['type'] == 'TV Show', ['variable', 'value']]
top_g_s = (
    show_genres
    .groupby(by='value')
    .count()
    .sort_values(by='variable', ascending=False)
    .head(5)
    .sort_values(by='variable', ascending=True)
)

# Colour the leader (and anything tied with the leader) dark red, the rest grey.
top_g_s_max = top_g_s['variable'].max()
top_g_s = top_g_s.assign(
    color=np.where(top_g_s['variable'] == top_g_s_max, 'darkred', 'grey')
).reset_index()

top_g_s

In [None]:
# Genre dashboard: a circular word cloud of the most frequent genres (left)
# beside top-5 bar charts for movies and TV shows (right).
# Uses splited_genres_df, top_g_m and top_g_s built in the cells above.

# Titles per genre across both content types, at most the 100 most frequent.
# Unlike the actor and director clouds, the raw counts are used as weights.
text_g = (
    splited_genres_df[['variable', 'value']]
    .groupby(by='value')
    .count()
    .sort_values(by='variable', ascending=False)
    .head(100)
    .reset_index()
)

# Circular mask on a 1000x1000 grid: 255 outside a radius-400 circle centred
# at (500, 500), 0 inside, so words are only drawn inside the circle.
x, y = np.ogrid[:1000, :1000]
mask = (x - 500) ** 2 + (y - 500) ** 2 > 400 ** 2
mask = 255 * mask.astype(int)

# Word cloud sized by the counts ({genre: count} mapping).
wc_g = WordCloud(
    background_color='black',
    random_state=1,
    contour_color='Red',
    colormap='Reds_r',
    mask=mask,
    collocations=False,
).generate_from_frequencies(dict(text_g.values.tolist()))

fig = plt.figure(figsize=(20, 10), facecolor='black')
fig.suptitle('Netflix Genres Analysis', color='white', fontweight='bold', fontsize=20)
gs = fig.add_gridspec(2, 2)
# NOTE(review): tight_layout() runs before any axes exist, so it most likely
# has no effect here; consider calling it after the subplots are created.
plt.tight_layout()
ax = [None for _ in range(3)]

# Left column (both rows): the word cloud.
ax[0] = fig.add_subplot(gs[:, 0])
ax[0].imshow(wc_g, interpolation="bilinear")
ax[0].axis("off")

# Top right: top 5 movie genres.
ax[1] = fig.add_subplot(gs[0, 1])
ax[1].barh(y=top_g_m['value'], width=top_g_m['variable'], align='center', color=top_g_m['color'])

# The frame index was reset to 0..4, which matches the bar positions, so the
# row index is used as the y coordinate of each label.
for i, row in top_g_m.iterrows():
    ax[1].text(x=0.1, y=i, s=row['value']+': '+str(row['variable'])+' Movies', fontweight='bold', color='white')

ax[1].set_title('\nTop 5 Movie Genres', color='darkred', fontweight='bold', fontsize=15)
ax[1].axis("off")

# Bottom right: top 5 TV-show genres.
ax[2] = fig.add_subplot(gs[1, 1])
ax[2].barh(y=top_g_s['value'], width=top_g_s['variable'], align='center', color=top_g_s['color'])

for i, row in top_g_s.iterrows():
    ax[2].text(x=0.1, y=i, s=row['value']+': '+str(row['variable'])+' TV-Shows', fontweight='bold', color='white')

ax[2].set_title('\nTop 5 TV-Show Genres', color='darkred', fontweight='bold', fontsize=15)
ax[2].axis("off")

plt.show()

# Top Countries

In [None]:
# Let's create the dataframe for countries: keep only each title's country
# string and its content type. Titles with a missing country are not dropped
# here; the resulting empty rows are removed after the melt.
country_df=df[['country','type']]
country_df.head()

In [None]:
# Split the comma-separated country string into one column per position
# (0, 1, 2, ...); titles with fewer countries are padded with missing values.
# Surrounding spaces are not removed here; they are stripped after the melt.
new_country_df=country_df['country'].str.split(',', expand=True)
new_country_df

In [None]:
# Reshape to long format: one row per (title, country position) with the
# content type carried along. 'variable' holds the position, 'value' the
# stripped country name.
country_wide = pd.concat([country_df, new_country_df], axis=1).drop(columns=['country'])
value = list(country_wide.columns[1:])  # every split column; column 0 is 'type'
splited_country_df = pd.melt(country_wide, id_vars=['type'], value_vars=value).assign(
    value=lambda long_df: long_df['value'].str.strip()
)
splited_country_df.head()

In [None]:
# Missing-value report before cleaning. isnull() and isna() are aliases, so
# both sections print the same per-column counts.
country_isnull_counts = splited_country_df.isnull().sum(axis=0)
country_isna_counts = splited_country_df.isna().sum(axis=0)
print('null value\n', country_isnull_counts, '\n')
print('NaN value\n', country_isna_counts, '\n')
print(splited_country_df.shape, '\n')

In [None]:
# Discard rows without a country (split padding and titles with no country
# listed), then repeat the missing-value report to confirm nothing is left.
splited_country_df = splited_country_df.dropna()
country_isnull_counts = splited_country_df.isnull().sum(axis=0)
country_isna_counts = splited_country_df.isna().sum(axis=0)
print('null value\n', country_isnull_counts, '\n')
print('NaN value\n', country_isna_counts, '\n')
print(splited_country_df.shape, '\n')

In [None]:
# Number of titles per country (rows) and content type (columns). A country
# that has no title of one type gets NaN in that column.
country_df = splited_country_df.pivot_table(values='variable', index='value', columns='type', aggfunc=len)

# Missing-value report before the fill.
pivot_flat = country_df.reset_index()
print('null value\n', pivot_flat.isnull().sum(axis=0), '\n')
print('NaN value\n', pivot_flat.isna().sum(axis=0), '\n')
print(country_df.shape, '\n')

# A missing combination means zero titles, so replace NaN with 0.
country_df = country_df.replace(to_replace=np.nan, value=0)

# Same report after the fill.
pivot_flat = country_df.reset_index()
print('null value\n', pivot_flat.isnull().sum(axis=0), '\n')
print('NaN value\n', pivot_flat.isna().sum(axis=0), '\n')
print(country_df.shape, '\n')
country_df

In [None]:
# Add the combined title count per country and rank countries by it.
country_df = country_df.assign(
    Total=country_df['Movie'] + country_df['TV Show']
).sort_values(by='Total', ascending=False)
country_df

In [None]:
# generate country code  based on country name 
import pycountry 
def alpha3code(column):
    """Map country names to ISO 3166-1 alpha-3 codes using pycountry.

    Parameters
    ----------
    column : iterable
        Country names. Each one is matched against pycountry's ``name`` field.

    Returns
    -------
    list of str
        One entry per input, in input order: the 3-letter code, or the string
        ``'None'`` when no country with that name is found.
    """
    # NOTE(review): only exact matches on the `name` field succeed, so labels
    # that differ from pycountry's naming end up as 'None' and drop out of the
    # later merge; check the unmatched labels if map coverage matters.
    CODE = []
    for country in column:
        try:
            match = pycountry.countries.get(name=country)
        except (LookupError, AttributeError, TypeError):
            # Unknown name or a non-string label: treat as "no match". Other
            # errors (e.g. pycountry not imported) are allowed to surface
            # instead of silently turning every code into 'None'.
            match = None
        # .alpha_3 is the 3-letter country code (.alpha_2 is the 2-letter one).
        CODE.append(match.alpha_3 if match is not None else 'None')
    return CODE
# Add the alpha-3 code for each country (the index of country_df holds the
# country names) and preview the 50 countries with the most titles.
country_df['CODE']=alpha3code(country_df.index)
country_df.head(50)

In [None]:
# Missing-value report after adding CODE. Unmatched countries carry the string
# 'None', which is not counted as missing here.
coded_flat = country_df.reset_index()
coded_isnull_counts = coded_flat.isnull().sum(axis=0)
coded_isna_counts = coded_flat.isna().sum(axis=0)
print('null value\n', coded_isnull_counts, '\n')
print('NaN value\n', coded_isna_counts, '\n')
print(country_df.shape, '\n')

In [None]:
# Attach the per-country title counts to world geometries.
# 'naturalearth_lowres' is a dataset bundled with geopandas.
# NOTE(review): gpd.datasets is deprecated in recent geopandas releases and
# removed in 1.0; confirm the installed version or switch to another source
# for the Natural Earth data.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Rename only the ISO code column so it matches the merge key. A targeted
# rename (instead of overwriting all column names by position) fails loudly if
# the dataset layout ever changes rather than silently mislabelling columns.
world = world.rename(columns={'iso_a3': 'CODE'}, errors='raise')

# Left merge keeps every world country. Countries without Netflix titles get
# NaN counts, which are left in place so the map can draw them as missing.
# NOTE(review): this cell rebinds country_df, so re-running it without
# re-running the cells above merges the already-merged frame.
country_df = pd.merge(world, country_df, how='left', on='CODE').sort_values(by='Total', ascending=False)
country_df

In [None]:
# Missing-value report after the merge: NaN counts belong to world countries
# that have no matching row in the Netflix data.
merged_flat = country_df.reset_index()
merged_isnull_counts = merged_flat.isnull().sum(axis=0)
merged_isna_counts = merged_flat.isna().sum(axis=0)
print('null value\n', merged_isnull_counts, '\n')
print('NaN value\n', merged_isna_counts, '\n')
print(country_df.shape, '\n')

In [None]:
# Leave Antarctica off the map, then show how many countries remain per continent.
outside_antarctica = country_df['continent'].ne('Antarctica')
country_df = country_df.loc[outside_antarctica]
country_df['continent'].value_counts()

In [None]:
# World choropleth of total titles per country, binned by quantiles and
# shaded with the 'Reds' colormap on a black figure background.
# NOTE(review): the `scheme` argument relies on the optional mapclassify
# package - confirm it is installed in the target environment.
fig, ax = plt.subplots(figsize=(20, 10), facecolor='black')
country_df.plot(ax=ax,
                column='Total',
                scheme='quantiles',
                cmap='Reds',
                legend=True)
plt.axis('off')
plt.show()

In [None]:
# Draw only the first row of country_df, i.e. the country with the highest
# Total after the descending sort.
# NOTE(review): a quantile scheme on a single row has only one value to
# classify - confirm this renders as intended.
fig, ax = plt.subplots(figsize=(20, 10), facecolor='black')

country_df.iloc[[0]].plot(ax=ax,
                column='Total',
                scheme='quantiles',
                cmap='Reds',
                legend=False)
plt.axis('off')
plt.show()

In [None]:
# Country dashboard on a 3x3 grid: the world choropleth fills the lower-left
# 2x2 area, and the five countries with the most titles get one small panel
# each along the top row and down the right column.
fig = plt.figure(figsize=(20, 10), facecolor='black')
fig.suptitle('Netflix Countries Analysis', color='white', fontweight='bold', fontsize=20)
gs = fig.add_gridspec(3, 3)
# NOTE(review): tight_layout() runs before any axes exist, so it most likely
# has no effect here; consider calling it after the subplots are created.
plt.tight_layout()
ax = [None for _ in range(6)]

ax[0] = fig.add_subplot(gs[1:, :-1])

# World map binned by quantiles of Total. Countries without data (NaN Total)
# are drawn white with a dark-red hatch and listed as "Missing values".
# No figsize is passed: geopandas ignores it when an existing ax is supplied.
country_df.plot(ax=ax[0],
                column='Total',
                scheme='quantiles',
                cmap='Reds',
                legend_kwds={'loc': 'lower left'},
                legend=True,
                edgecolor='white',
                missing_kwds={"color": "white",
                              "edgecolor": "darkred",
                              "hatch": "///",
                              "label": "Missing values"})

ax[0].set_facecolor('black')

ax[1] = fig.add_subplot(gs[0, 0])
ax[2] = fig.add_subplot(gs[0, 1])
ax[3] = fig.add_subplot(gs[0, 2])
ax[4] = fig.add_subplot(gs[1, 2])
ax[5] = fig.add_subplot(gs[2, 2])

# country_df was sorted by Total in descending order when it was merged, so
# rows 0..4 are the five countries with the most titles; panel i+1 shows rank i+1.
for i in range(5):
    row = country_df.iloc[[i]]
    row.plot(ax=ax[i+1],
             column='Total',
             categorical=True,
             cmap='Reds_r',
             edgecolor='white',
             legend=True)
    title = country_df.iloc[i]['name']
    ax[i+1].set_title('\n'+str(i+1)+' - '+title, color='darkred', fontweight='bold', fontsize=15)
    ax[i+1].set_facecolor('black')

plt.show()