In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.preprocessing import MultiLabelBinarizer

from datetime import datetime as dt

from itertools import compress

In [None]:
# Load the Netflix titles CSV into the working DataFrame `nf`.
# The path is relative to the notebook's working directory, so the file must
# exist under ../input/netflix-shows/ for this cell to run.
nf=pd.read_csv("../input/netflix-shows/netflix_titles.csv")

In [None]:
# Preview the first rows to check that the file loaded as expected.
nf.head()

# Preparation

In [None]:
# Summarise columns, dtypes and non-null counts to see where values are missing.
nf.info()

In [None]:
# Clean the loaded frame:
#   * drop the `show_id` identifier column,
#   * label titles with no listed director as "Unknown" so they survive the
#     dropna() that follows, then discard rows that are still incomplete,
#   * rename `listed_in` to `genre`,
#   * keep only the first entry of the comma-separated `country` value.
# The director column is assigned back explicitly instead of calling
# fillna(inplace=True) on `nf.director`: that chained in-place call operates on
# a temporary Series under pandas Copy-on-Write, which would leave the NaNs in
# place and let dropna() remove every title without a director.
nf=(
    nf
    .drop(columns="show_id")
    .assign(director=lambda frame:frame["director"].fillna("Unknown"))
    .dropna()
    .rename(columns={"listed_in":"genre"})
    .assign(country=lambda frame:frame["country"].str.split(", ",expand=True)[0])
)

# Split `duration` on the space into its numeric part (kept in `duration` as
# int64) and its textual part (stored in the new `duration_type` column).
duration_parts=nf["duration"].str.split(" ",expand=True)
nf=nf.assign(
    duration=duration_parts[0].astype("int64"),
    duration_type=duration_parts[1])


# One-hot encode genres: the comma-separated labels in `genre` are split into
# lists, binarized into one indicator column per label (named "genre_<label>"),
# and those columns replace the original `genre` column in `nf`.
nf["genre"]=nf["genre"].str.split(", ")

mlb=MultiLabelBinarizer()
genre_mlb=pd.DataFrame(
    data=mlb.fit_transform(nf["genre"]),
    index=nf.index,
    columns=["genre_"+label for label in mlb.classes_])

nf=pd.concat([nf.drop(columns="genre"),genre_mlb],axis="columns")


# Parse `date_added` into timestamps and derive calendar features from it.
# The strings follow the "%B %d, %Y" layout; they are stripped first because
# surrounding whitespace would otherwise break the strict format match.
nf["date_added"]=pd.to_datetime(nf["date_added"].str.strip(),format="%B %d, %Y")
nf["year_added"]=nf["date_added"].dt.year
nf["month_added"]=nf["date_added"].dt.month
# Day of the month (this column was previously filled with the month by mistake).
nf["day_added"]=nf["date_added"].dt.day
# Absolute gap, in years, between `release_year` and the year the title was added.
nf["release_added_diff"]=(nf["release_year"]-nf["year_added"]).abs()

# Names of all columns starting with "genre" (the one-hot genre indicators).
genre_list=[column for column in nf.columns if column.startswith("genre")]

# Boolean masks over `genre_list`: True where the genre occurs at least once
# among movies / among TV shows.
movie_bool=nf.loc[nf["type"]=="Movie",genre_list].sum().gt(0).values
tv_bool=nf.loc[nf["type"]=="TV Show",genre_list].sum().gt(0).values

# Genre column names restricted to those present for each content type.
genre_list_tv=[genre for genre,present in zip(genre_list,tv_bool) if present]
genre_list_movie=[genre for genre,present in zip(genre_list,movie_bool) if present]

def _stack_cast_members(cast_column):
    """Return one entry per cast member from comma-separated cast strings.

    Each entry keeps the index label of the row it came from.
    """
    per_member_columns=cast_column.str.split(", ",expand=True)
    return per_member_columns.stack().reset_index(level=1,drop=True)


# Individual cast members across all titles, and across titles whose country
# is "United States".
stacked_cast=_stack_cast_members(nf["cast"])
stacked_cast_us=_stack_cast_members(nf.loc[nf["country"]=="United States","cast"])

# The ten most frequent countries, most frequent first.
top_ten_countries=nf["country"].value_counts().index[:10].tolist()

# Visualization

# Pie Chart: Movies vs TV Shows

In [None]:
# Share of each content type among the titles, drawn as an exploded pie.
type_counts=nf["type"].value_counts()

fig,axes=plt.subplots(figsize=(12,6))
axes.pie(
    type_counts,
    labels=type_counts.index,
    explode=(0.05,0.05),
    colors=['#66c2a5','#fc8d62'],
    autopct="%1.1f%%")
axes.set_title("Movies vs TV Shows")
plt.show()

# Distribution of Duration

In [None]:
# Side-by-side histograms of `duration`, one panel per content type.
# Each panel gets its own title: without one the two panels carry the same
# axis labels and cannot be told apart.
fig,ax=plt.subplots(1,2,figsize=(16,8))

for panel,content_type in zip(ax,("Movie","TV Show")):
    subset=nf.loc[nf["type"]==content_type,:]
    sns.histplot(ax=panel,x="duration",data=subset,bins=50)
    panel.set_title(f"Distribution of {content_type} Duration")

# Render the figure explicitly, as the other figure cells do.
plt.show()

# Movies and TV Shows by Top 10 Countries

In [None]:
# Title counts for the ten most frequent countries, split by content type.
fig,ax=plt.subplots(figsize=(12,7))

leading_countries=nf["country"].value_counts().index[:10]
sns.countplot(data=nf,y="country",hue="type",order=leading_countries,palette="Set2",ax=ax)

In [None]:
# Release year against the year each title was added, coloured by content type.
fig,ax=plt.subplots(figsize=(12,7))
sns.scatterplot(data=nf,x="release_year",y="year_added",hue="type",ax=ax)
plt.show()

# Top 15 Directors Worldwide

In [None]:
# Rank directors by number of titles, leaving out the "Unknown" placeholder,
# and plot the fifteen most frequent.
known_directors=nf.loc[nf["director"]!="Unknown","director"]
top_directors=known_directors.value_counts().head(15)

fig,axes=plt.subplots(figsize=(12,6))
sns.barplot(x=top_directors.values,y=top_directors.index,palette="Set2",ax=axes)
axes.set_title("Top 15 Directors Worldwide")
axes.set_ylabel("Director")
axes.set_xlabel("Movies or TV Shows")
plt.show()

# Top 15 Cast Members Worldwide and US

In [None]:
# Fifteen most frequent cast members: all titles on the top panel,
# United States titles on the bottom panel.
fig,ax=plt.subplots(2,1,figsize=(14,12))

panels=(
    (ax[0],stacked_cast,"Top 15 Cast Members Worldwide"),
    (ax[1],stacked_cast_us,"Top 15 US Cast Members"),
)
for panel,cast_members,panel_title in panels:
    sns.countplot(
        ax=panel,
        y=cast_members,
        order=cast_members.value_counts().index[0:15],
        palette="Set2")
    panel.set_title(panel_title)
    panel.set_xlabel("Number of Titles")
    panel.set_ylabel("Cast Member")
plt.show()

# Top 15 Countries for Movies and TV Shows

In [None]:
# Title counts for the fifteen most frequent countries.
fig,ax=plt.subplots(figsize=(12,6))

leading_countries=nf["country"].value_counts().head(15).index
sns.countplot(data=nf,y="country",order=leading_countries,palette="Set2",ax=ax)

ax.set_title("Top 15 Countries for Movies and TV Shows")
ax.set_xlabel("Number of Titles")
ax.set_ylabel("Country")
plt.show()

# Content Added by Year

In [None]:
# Number of titles added in each year, split by content type.
fig,ax=plt.subplots(figsize=(12,6))
sns.countplot(data=nf,y="year_added",hue="type",palette="Set2",ax=ax)
plt.show()

# Rating by Genre

In [None]:
# Sum of each genre indicator per rating, with genres as rows and ratings as
# columns, shaded by magnitude.
genre_by_rating=nf[genre_list+["rating"]].groupby("rating").sum().T
genre_by_rating.style.background_gradient()

# Distribution of Ratings

In [None]:
# Number of titles per rating, split by content type.
fig,ax=plt.subplots(figsize=(12,6))
sns.countplot(data=nf,y="rating",hue="type",palette="Set2",ax=ax)

# Content Added by Year and Month

In [None]:
# Count of titles for every (year added, month added) pair, years as rows and
# months as columns, shaded by magnitude.
added_by_year_month=pd.crosstab(index=nf["year_added"],columns=nf["month_added"])
added_by_year_month.style.background_gradient()

# Top 10 Countries by Movie and TV Genre

In [None]:
# Sum of each genre indicator per country, restricted to the ten most frequent
# countries; genres as rows and countries as columns, shaded by magnitude.
in_top_ten=nf["country"].isin(top_ten_countries)
genre_by_country=nf.loc[in_top_ten,genre_list+["country"]].groupby("country").sum().T
genre_by_country.style.background_gradient()