In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date,datetime
import numpy as np
from collections import Counter
# Use seaborn's dark grid background for every figure in this notebook.
sns.set_style(style="darkgrid")

In [None]:
import warnings
warnings.filterwarnings('ignore')

***Show the raw data***


In [None]:
# Read the titles catalogue from the notebook's input directory.
netflix = pd.read_csv(filepath_or_buffer='../input/netflix-shows/netflix_titles.csv')

# Preview the first five rows.
netflix.head(5)

**Check attribute types and missing data**

In [None]:
# Print each column's dtype and non-null count to see where values are missing.
netflix.info()

**Count missing values per column**

In [None]:
# Number of missing values in each column.
netflix.isnull().sum()


**Clean the data**
*  Drop the `director` and `cast` columns
*  Fill missing values in the `country` column with 'United States'
*  Drop the remaining rows that contain missing values




In [None]:
# Clean the catalogue in one chain:
#   * drop the `director` and `cast` columns (errors='ignore' keeps this cell
#     re-runnable after the columns are already gone),
#   * replace a missing `country` with 'United States',
#   * drop every row that still contains a missing value.
netflix = (
    netflix
    .drop(columns=['director', 'cast'], errors='ignore')
    .assign(country=lambda frame: frame['country'].fillna('United States'))
    .dropna()
)
netflix.head()

**Number of Movies and TV Shows to date**

In [None]:
# Number of titles per (release_year, type) pair.
type_netflix = netflix.groupby('release_year')['type'].value_counts()

# Keep only the pairs whose release year is after 2000.
released_after_2000 = type_netflix.index.get_level_values(0) > 2000
type_netflix_20 = type_netflix[released_after_2000]

In [None]:
# Split the (release_year, type) index into its two levels:
# level 0 holds the years, level 1 the title types.
type_netflix_year, type_netflix_type = (
    type_netflix_20.index.get_level_values(level) for level in (0, 1)
)

In [None]:
fig1, a1 = plt.subplots(1, 3, figsize=(20, 10))

# Count the titles per type once, so each panel takes its values and its
# labels from the same Series and the two cannot get out of step.
type_counts = netflix['type'].value_counts()

# Panel 1: one bar per type. A histogram over a column holding a single
# repeated string only draws a sliver-wide bar, so plot the counts directly.
for title_type in ('Movie', 'TV Show'):
    a1[0].bar(title_type, type_counts.get(title_type, 0))
a1[0].set_title('Amount Movie and TV Show')

# Panel 2: share of each type. Labels come from the counts' own index;
# `unique()` returns order of appearance, which need not match the
# count-sorted order of `value_counts()`.
a1[1].pie(type_counts, labels=type_counts.index, autopct='%1.1f%%')
a1[1].set_title('Movie and TV Show by percentile')
a1[1].legend()

# Panel 3: titles per release year. Each type is plotted against the years
# it actually has data for, so a year missing one type cannot cause an
# x/y length mismatch.
for title_type in ('Movie', 'TV Show'):
    of_type = type_netflix_type == title_type
    a1[2].plot(
        type_netflix_year[of_type],
        type_netflix_20[of_type],
        label=title_type,
        marker='o',
    )
a1[2].set(
    xlabel='Year',
    ylabel='Amount',
    xticks=list(range(2000, 2025, 5)),
    title='Amount Movie and TV Show by each year',
)
a1[2].legend()

**Top 20 countries by number of titles**

In [None]:
# Tally how many titles each country contributed to. A title may list several
# comma-separated countries and is counted once for each of them.
# Names are stripped and empty pieces skipped, so entries with a stray leading
# or trailing comma do not produce '' or 'Name,' as separate countries.
country_counts = Counter(
    name.strip()
    for entry in netflix['country']
    for name in entry.split(',')
    if name.strip()
)

# Most frequent countries first; keep the top twenty for plotting.
country = pd.Series(country_counts).sort_values(ascending=False)
top20country = country.head(20)

In [None]:
fig2, a2 = plt.subplots(1, 2, figsize=(20, 10))

# Left panel: horizontal bars with the title count of each top-20 country.
sns.barplot(y=top20country.index, x=top20country, ax=a2[0])
a2[0].set(xlabel='Count', title='Number movie and TV show of top20 country')

# Right panel: each country's share of the top-20 total.
a2[1].pie(top20country, labels=top20country.index, autopct='%1.1f%%')
a2[1].set(title='Percent movie and TV show of top20 country')
# 'left' is not a valid legend location (current matplotlib raises ValueError).
# Anchor the legend's upper-left corner at the axes' top-right corner so it
# sits beside the pie instead of on top of it.
a2[1].legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=2)

**Number of Movies and TV Shows by release year**

In [None]:
fig3, a3 = plt.subplots(1, 2, figsize=(15, 5))
all_years_ax, recent_ax = a3
release_years = netflix['release_year']

# Left: distribution of release years over the whole catalogue.
all_years_ax.hist(x=release_years, bins=100)
all_years_ax.set(title='From 1925 to 2020', xlabel='Year')

# Right: the same distribution restricted to titles released after 1990.
recent_ax.hist(x=release_years[release_years > 1990], bins=30)
recent_ax.set(title='After 1990', xlabel='Year')

**Duration information of Movies and TV Shows**


*   Average, max, and min duration of Movies



In [None]:
# Take explicit copies: assigning to a column of a filtered slice of `netflix`
# is chained assignment and is not guaranteed to modify the object kept here.
mv = netflix[netflix['type'] == 'Movie'].copy()
tv = netflix[netflix['type'] == 'TV Show'].copy()

# `duration` is text; presumably a number followed by a unit such as 'min' or
# 'Season(s)' (verify against the data). Keep only the number. Extracting the
# digits avoids relying on str.strip() treating its argument as a set of
# characters, which is the only reason strip(' Seations') worked.
mv['duration'] = mv['duration'].str.extract(r'(\d+)', expand=False).astype(int)
tv['duration'] = tv['duration'].str.extract(r'(\d+)', expand=False).astype(int)

# Average, longest and shortest movie duration.
Avg_min = mv['duration'].mean()
Avg_min, mv['duration'].max(), mv['duration'].min()



*   Duration of Movies and TV Shows


In [None]:
fig4, a4 = plt.subplots(1, 2, figsize=(15, 5))

# Left: distribution of the numeric movie durations.
a4[0].hist(x=mv.duration, bins=30)
a4[0].set(xlabel='Minutes', title='Duration of Movie')

# Right: distribution of the numeric TV show durations.
# The axis label previously read 'Seations' (typo for 'Seasons').
a4[1].hist(x=tv.duration, bins=15)
a4[1].set(xlabel='Seasons', title='Duration of TV Show')

**Rating statistics of Movies and TV Shows by count and percentage**



In [None]:
fig5, a5 = plt.subplots(2, 2, figsize=(20, 10))

# One row per title type: bar chart of counts on the left, pie chart of
# shares on the right. The "by percentile" title belongs to the pie chart in
# both rows (the two Movie titles used to be swapped).
rating_panels = [
    (mv, 'Movie Rating', 'Movie Rating by percentile'),
    (tv, 'TV Show Rating', 'TV Show Rating by percentile'),
]
for row, (titles, bar_title, pie_title) in enumerate(rating_panels):
    # Count once and reuse: taking values and labels from the same Series
    # guarantees every wedge/bar carries the label of its own rating.
    rating_counts = titles['rating'].value_counts()

    sns.barplot(x=rating_counts, y=rating_counts.index, ax=a5[row, 0])
    a5[row, 0].set(title=bar_title)

    a5[row, 1].pie(x=rating_counts, labels=rating_counts.index, autopct='%1.1f%%')
    a5[row, 1].set(title=pie_title)
    # 'left' is not a valid legend location (current matplotlib raises
    # ValueError); anchor the legend's upper-left corner at the axes' top-right.
    a5[row, 1].legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=2)

**Category statistics of Movies and TV Shows**

In [None]:
# Count how many movies fall under each category; a movie listed under
# several ', '-separated categories is counted once for each of them.
movie_category_tally = Counter(
    category
    for listing in mv.listed_in
    for category in listing.split(', ')
)
list_mv = pd.Series(movie_category_tally).sort_values(ascending=False)

In [None]:
# Count how many TV shows fall under each category; a show listed under
# several ', '-separated categories is counted once for each of them.
show_category_tally = Counter(
    category
    for listing in tv.listed_in
    for category in listing.split(', ')
)
list_tv = pd.Series(show_category_tally).sort_values(ascending=False)

In [None]:
fig6, a6 = plt.subplots(2, 2, figsize=(20, 10))

# One row per title type: bar chart of category counts on the left, pie chart
# of category shares on the right.
category_panels = [
    (list_mv, 'Amount Movie for each Category', 'Percent Movie for each Category'),
    (list_tv, 'Amount TV Show for each Category', 'Percent TV Show for each Category '),
]
for row, (category_counts, bar_title, pie_title) in enumerate(category_panels):
    sns.barplot(y=category_counts.index, x=category_counts, ax=a6[row, 0])
    a6[row, 0].set(xlabel='Count', title=bar_title)

    a6[row, 1].pie(category_counts, labels=category_counts.index, autopct='%1.1f%%')
    a6[row, 1].set(title=pie_title)
    # 'left' is not a valid legend location (current matplotlib raises
    # ValueError); anchor the legend's upper-left corner at the axes' top-right.
    a6[row, 1].legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=2)