In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from collections import Counter
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global plot styling: seaborn's 'whitegrid' first, then matplotlib's
# 'fivethirtyeight' style sheet. The second call is applied last, so it takes
# precedence for any settings both of them define.
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')

In [None]:
# Read the Netflix titles CSV from the relative input directory and preview
# the first rows.
csv_path = '../input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(csv_path)
df.head(5)

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

In [None]:
# Descriptive statistics; with default arguments pandas summarises only the
# numeric columns of a mixed-type frame.
df.describe()

In [None]:
# (number of rows, number of columns) of the freshly loaded frame.
df.shape

In [None]:
# Count the missing values in every column of the raw data.
df.isna().sum()

In [None]:
# Nullity-correlation heatmap: shows how strongly missing values in one column
# coincide with missing values in another.
msno.heatmap(df)

## Clean Data

In [None]:
# Remove the columns that the analysis cells shown in this notebook never read.
# The frame is reassigned instead of mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this cell
# no longer raises KeyError.
df = df.drop(columns=['director', 'date_added', 'description'], errors='ignore')

In [None]:
# List the titles whose rating is missing.
rating_is_missing = df['rating'].isna()
df.loc[rating_is_missing]

In [None]:
# Replacement ratings keyed by row index label.
# NOTE(review): these values look hand-curated for the rows with a missing
# rating; confirm their source and that the labels match the dataset version.
missing_nan = {
    67: 'TV-PG',
    2359: 'TV-14',
    3660: 'TV-MA',
    3736: 'TV-MA',
    3737: 'NR',
    3738: 'TV-MA',
    4323: 'TV-MA',
}

# Fill by index label and column name rather than by integer position, so the
# fix does not depend on the current column order. fillna() only writes into
# cells that are actually NaN and ignores labels that are not in the index, so
# an existing rating is never overwritten.
df['rating'] = df['rating'].fillna(missing_nan)

# Remaining missing ratings (expected to be 0).
df['rating'].isna().sum()

In [None]:
# Keep only the titles that have a cast listed.
# .copy() turns the filtered slice into an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
# NOTE(review): `cast` is not read by any later cell shown here, yet removing
# these rows changes every count that follows; confirm this is intended.
df = df[df['cast'].notna()].copy()
df['cast'].isna().sum()

In [None]:
# Tally the raw `country` values and show the single most frequent one.
country_counts = Counter(df['country'])
country_counts.most_common(1)

In [None]:
# Replace missing countries with a fixed default value, then confirm that no
# missing entries remain.
default_country = 'United States'
df['country'] = df['country'].fillna(default_country)
df['country'].isna().sum()

In [None]:
# Derive `main_country` as the first entry of the comma-separated `country`
# value, then drop the original column.
# - The `.str` accessors replace the per-row lambda and trim stray whitespace
#   around the extracted name.
# - assign()/drop() build a new frame instead of mutating in place.
# - The guard makes the cell safe to re-run: once `country` has been removed,
#   a second execution is a no-op instead of a KeyError.
if 'country' in df.columns:
    df = (
        df.assign(main_country=df['country'].str.split(',').str[0].str.strip())
          .drop(columns='country')
    )

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(5)

In [None]:
# Re-check the per-column missing-value counts after cleaning.
df.isna().sum()

In [None]:
# Non-null counts per main country; the `type` column's count is used as the
# number of titles, and the ten largest countries are kept.
count = df.groupby('main_country').count()
most_country = (
    count[['type']]
    .reset_index()
    .sort_values('type', ascending=False)
    .head(10)
)

In [None]:
# Bar chart of the ten countries with the most titles.
fig, ax = plt.subplots(figsize=(15, 15))
sns.barplot(data=most_country, x='main_country', y='type', ax=ax)
ax.set_ylabel('TV and Movies')
ax.set_xlabel('Country')
plt.show()

In [None]:
# Build the pie-chart inputs: the 9 most frequent genres plus an 'Others'
# bucket holding every remaining genre occurrence.
# Each title's `listed_in` is a comma-separated genre list; all of them are
# joined, split again and trimmed to get one entry per genre occurrence.
top_genre = [genre.strip() for genre in ','.join(df['listed_in']).split(',')]
top_list = Counter(top_genre).most_common(9)
total_genre = len(top_genre)

labels = [genre_name for genre_name, _occurrences in top_list] + ['Others']
sizes = [occurrences for _genre_name, occurrences in top_list]
# Whatever is not covered by the top 9 goes into 'Others'.
sizes.append(total_genre - sum(sizes))

In [None]:
# Pie chart of genre shares (top genres plus 'Others').
fig, ax = plt.subplots(figsize=(12, 15))
ax.set_title('Percentage of Genre', fontsize=15)
ax.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True)
plt.show()

In [None]:
# Histogram (with KDE) of titles by release year.
# The seaborn context is set before the figure and title are created, so the
# whole figure is drawn under the same settings on the first run as on a
# re-run of this cell.
sns.set_context("poster", font_scale=0.8)
plt.figure(figsize=(12,8))
plt.title('TV & Movie in each decade', fontsize=18)
# NOTE(review): bins=8 yields eight equal-width bins across the observed year
# range, which are not necessarily calendar decades; confirm the intent.
sns.histplot(df['release_year'], bins=8, kde=True)
plt.show()

In [None]:
# Movies vs TV shows: per-release-year non-null counts for each title type.
tv_mask = df['type'] == 'TV Show'
movie_mask = df['type'] == 'Movie'

df_tv = df[tv_mask].groupby('release_year').count()
df_movies = df[movie_mask].groupby('release_year').count()

In [None]:
# Line chart: number of TV shows vs. movies per release year.
sns.set_context('poster', font_scale=0.8)
plt.figure(figsize=(12,8))
# Each line carries its own label, so the legend does not depend on the order
# in which artists happen to be added to the axes.
sns.lineplot(data=df_tv['show_id'], label='TV')
sns.lineplot(data=df_movies['show_id'], label='Movie')
plt.ylabel('Count')
plt.xlabel('Release Year')
plt.legend(fontsize='large')
plt.title('TV and Movies')
plt.show()