Adapted from: https://www.kaggle.com/radmirzosimov/netflix-eda-with-plotly-seaborn

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter("ignore")

In [None]:
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old spellings, so use whichever name this install provides
# (falling back to the default style if neither is available).
_style_candidates = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
_style_name = next(
    (name for name in _style_candidates if name in plt.style.available),
    'default',
)
plt.style.use(_style_name)
# Qualitative colormap; not referenced by the cells visible in this notebook.
palette = plt.get_cmap('Set2')

# Read Dataset

In [None]:
# Load the Netflix titles CSV from the relative input path and preview the
# first rows of the resulting frame.
csv_path = '../input/netflix-shows/netflix_titles.csv'
main_df = pd.read_csv(csv_path)
main_df.head()

# Displaying a bar chart for missing values

In [None]:
# Per-column count of NON-missing entries (despite its name, `nullCounts` is
# not a count of nulls): total rows minus the number of nulls in each column.
total_rows = len(main_df)
nullCounts = total_rows - main_df.isnull().sum()

# Plot settings, kept under the same module-level names as before.
ax1 = plt.gca()
color = "royalblue"
figsize = (20, 7)
fontsize = 16
log = False
plot_args = dict(figsize=figsize, fontsize=fontsize, log=log, color=color, ax=ax1)

# Each bar is the fraction of rows that are present (non-null) in that column,
# so a shorter bar means more missing values.
present_fraction = nullCounts / total_rows
present_fraction.plot.bar(**plot_args)
plt.title('Visualize Missing Values', fontsize=30)
plt.show()

***Missing values are present in the director, cast and country columns.***

# Display descriptive statistics for the columns

In [None]:
# Summary statistics for the integer-typed columns only.
# `np.int` was just an alias of the builtin `int`; NumPy deprecated it in 1.20
# and removed it in 1.24, so the builtin is passed instead.
main_df.describe(include=[int])

In [None]:
# Summary (count / unique / top / freq) for the object-typed columns only.
# `np.object` was just an alias of the builtin `object`; NumPy deprecated it in
# 1.20 and removed it in 1.24, so the builtin is passed instead.
main_df.describe(include=[object])

In [None]:
# Print the frame summary: column names, non-null counts and dtypes.
main_df.info()

# Displaying the amount of content by country

In [None]:
# Keep only the `country` values that account for more than 1% of the counted
# rows. Absolute counts are kept; the relative share is used only as a mask.
country_counts = main_df['country'].value_counts()
country_share = main_df['country'].value_counts(normalize=True)
countries = country_counts[country_share > 0.01]
countries

In [None]:
# Labels of the retained `country` values, in the same order as `countries`.
list_countries= countries.index
# How many values passed the 1% threshold.
len(list_countries)

In [None]:
# Horizontal bar chart: one bar per retained country value, bar length equal to
# the number of titles counted for it.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=countries.values, y=list_countries, alpha=0.6, ax=ax)
ax.set_title('Amount of content by country', fontsize=18)
plt.show()

In [None]:
# Pie chart of the (at most) 20 largest country counts.
plt.figure(figsize=(14,7))
# Wedge sizes and labels are taken from the SAME sorted, sliced Series.
# Passing the unsliced `countries.index` as labels fails with a length mismatch
# once `countries` holds more than 20 entries, and can attach a label to the
# wrong wedge whenever sorting changes the order.
top_countries = countries.sort_values(ascending=False).iloc[:20]
figure = plt.pie(top_countries, labels=top_countries.index, autopct="%1.1f%%")
plt.title('Pie Chart View')
plt.show()

# Looking at the number of TV shows vs. movies

In [None]:
# Number of rows for each distinct value of the `type` column.
types = main_df['type'].value_counts()
types

In [None]:
# Vertical bar chart with one bar per content type.
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(x=types.index, y=types.values, ax=ax)
ax.set_title('Amount of Movies vs Tv shows')
plt.show()

# Content growth throughout history

In [None]:
# Split the catalogue by `type`.
# `.copy()` makes each subset an independent frame: later cells assign a parsed
# `duration` column into `movies` and `tv_shows`, and assigning into a filtered
# selection of `main_df` triggers pandas' SettingWithCopyWarning.
movies = main_df[main_df['type'] == 'Movie'].copy()
tv_shows = main_df[main_df['type'] == 'TV Show'].copy()

movies

In [None]:
# Peek at the raw `duration` column of the movie subset (a later cell parses
# the leading integer out of each value).
movies.duration

In [None]:
# Number of titles per release year for each content type, ordered by year.
MoviesProgress, TvshowProgress = (
    frame['release_year'].value_counts().sort_index()
    for frame in (movies, tv_shows)
)

In [None]:
# Line chart of titles per release year, one line per content type.
plt.figure(figsize=(25, 9))
fontsize = 20  # module-level label size; later plotting cells read this name

for yearly_counts, line_label in (
    (MoviesProgress, 'movies'),
    (TvshowProgress, 'tv shows'),
):
    plt.plot(yearly_counts.index, yearly_counts.values, label=line_label)

# Shade the 2019-2021 span and give it its own legend entry.
plt.axvspan(2019, 2021, alpha=0.2, color='r', label='Coronavirus')

# One tick every five years from 1925 up to (but not including) 2022.
plt.xticks(np.arange(1925, 2022, 5))
plt.xlabel('Time (years)', fontsize=fontsize)
plt.ylabel('Content', fontsize=fontsize)
plt.title('Growth of content v/s Time' , fontsize=fontsize)
plt.legend()
plt.show()

# Visualize rating counts

In [None]:
# Number of titles for each distinct value of the `rating` column.
ratings = main_df['rating'].value_counts()

In [None]:
# Horizontal bar chart of the number of titles per rating.
plt.figure(figsize=(14,7))

plt.title('Rating chart')

# Pass x / y by keyword, like every other barplot call in this notebook:
# seaborn deprecated positional data arguments in 0.11 and rejects them from
# 0.12 onwards.
sns.barplot(x=ratings.values, y=ratings.index, alpha=0.6)
plt.show()

<div class='alert alert-block alert-info'> <b>Tip:</b>
TV-MA is for mature audiences only.</div>

In [None]:
# Convert each movie's `duration` text into an integer, taking the token before
# the first space.
#
# Rows with a missing duration are dropped from `movies` first. The original
# built its list from `dropna()` but assigned it to the unfiltered frame, which
# raises a length-mismatch ValueError as soon as one duration is missing.
# Rebinding `movies` to a fresh copy also avoids assigning into a filtered
# selection of `main_df`, and `astype(str)` keeps the cell re-runnable once the
# column already holds integers.
movies = movies.dropna(subset=['duration']).copy()
movies['duration'] = (
    movies['duration'].astype(str).str.split(' ').str[0].astype(int)
)

In [None]:
# Distribution (60 bins) of the parsed movie durations.
plt.figure(figsize=(30,7))
# The plot is given a label so that `plt.legend()` below has an entry to show;
# without any labelled artist the legend call does nothing useful.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot(..., kde=True, stat='density')` once the minimum version allows it.
sns.distplot(movies['duration'], bins=60, label='Movie duration')
plt.title('Duration distributions of films (Mins)',fontsize=fontsize)
plt.xlabel("Duration (Time)"  ,fontsize=fontsize)
# Axis labels are set after plotting so the plotting call cannot replace them.
plt.ylabel('Number of movies normalised' ,fontsize=fontsize)
plt.legend()
plt.show()

## Listing the shortest movies

In [None]:
# Title and duration of the 20 movies with the smallest parsed duration.
title_and_duration = movies[['title', 'duration']]
short = title_and_duration.sort_values('duration').head(20)

In [None]:
# Horizontal bar chart of the 20 shortest movies (title vs. duration).
fig, ax = plt.subplots(figsize=(30, 9))
sns.barplot(x=short['duration'], y=short['title'], alpha=0.6, ax=ax)
ax.set_title('Top 20 shortest movies', fontsize=25)
ax.tick_params(labelsize=20)
ax.set_xlabel('Duration (time) ', fontsize=fontsize)
plt.show()

In [None]:
# Title and duration of the 20 movies with the largest parsed duration
# (ascending order, so the longest movie is the last row).
title_and_duration = movies[['title', 'duration']]
long = title_and_duration.sort_values('duration').tail(20)

In [None]:
# Horizontal bar chart of the 20 longest movies (title vs. duration).
fig, ax = plt.subplots(figsize=(30, 9))
sns.barplot(x=long['duration'], y=long['title'], alpha=0.6, ax=ax)
ax.set_title('Top 20 longest movies', fontsize=25)
ax.tick_params(labelsize=20)
ax.set_xlabel('Duration (time) ', fontsize=fontsize)
plt.show()

# Let's look at the shows with a single season

In [None]:
# Convert each TV show's `duration` text into an integer, taking the token
# before the first space (later cells treat this number as a season count).
#
# Rows with a missing duration are dropped first: the original called
# `.split` on every value and would fail on a missing (non-string) entry.
# Rebinding `tv_shows` to a fresh copy also avoids assigning into a filtered
# selection of `main_df`, and `astype(str)` keeps the cell re-runnable once the
# column already holds integers.
tv_shows = tv_shows.dropna(subset=['duration']).copy()
tv_shows['duration'] = (
    tv_shows['duration'].astype(str).str.split(' ').str[0].astype(int)
)

In [None]:
# TV shows with a single season: report how many there are and show the first
# 20 titles.
has_one_season = tv_shows['duration'] == 1
single_season = tv_shows.loc[has_one_season, 'title']
print(len(single_season))
print(single_season.values[:20])

In [None]:
 # Titles of the TV shows whose parsed duration equals 4.
 tv_shows[tv_shows['duration']==4]['title'].values

In [None]:
 # Full row(s) for the show titled 'La casa de papel'.
 tv_shows[tv_shows['title']=='La casa de papel']

In [None]:
# Description text of the show titled 'La casa de papel', as an array.
tv_shows[tv_shows['title']=='La casa de papel']['description'].values

In [None]:
# Bar chart of how many TV shows have each parsed duration (season count).
tvShows_seasons = tv_shows['duration'].value_counts()
plt.figure(figsize=(30, 8))
plt.title("Tv shows " , fontsize=fontsize)
plt.bar(x=tvShows_seasons.index , height=tvShows_seasons.values)
# Restrict the x axis to the 0-17 range.
plt.xlim((0,17))
plt.xlabel('No. of seasons' , fontsize=fontsize)
plt.ylabel("Counts" ,fontsize=fontsize)
# Render the figure like every other plotting cell does. The original ended
# with `plt.plot()`, which draws nothing and only echoes an empty list.
plt.show()

In [None]:
# Table of how many TV shows have each parsed duration value.
tv_shows['duration'].value_counts()

In [None]:
# Number of titles per distinct `director` value, largest first.
directors = main_df['director'].value_counts().sort_values(ascending=False)

In [None]:
# Pie chart of the first 10 entries of `directors`; percentages are relative to
# those 10 only, not to the whole catalogue.
top_directors = directors.iloc[:10]
plt.title("Count of content produced by directors" , fontsize=fontsize)
plt.pie(top_directors.values, labels=top_directors.index, autopct='%1.1f%%')
plt.show()

In [None]:
# Display the full per-director title counts.
directors

In [None]:
# List the column names of the full dataset.
main_df.columns

In [None]:
# Movies and TV shows whose `country` value is exactly 'India' (rows listing
# several countries in that field do not match this equality test).
# `.copy()` makes each subset an independent frame, because the next cell
# assigns a parsed `date_added` column into both of them.
indianMovies = movies[movies['country'] == 'India'].copy()
indianSeries = tv_shows[tv_shows['country'] == 'India'].copy()

In [None]:
# Parse `date_added` into datetimes so the subsets can be sorted
# chronologically. `assign` returns a new frame instead of writing a column
# into a filtered selection, which avoids pandas' SettingWithCopyWarning.
indianSeries = indianSeries.assign(
    date_added=pd.to_datetime(indianSeries['date_added'])
)
indianMovies = indianMovies.assign(
    date_added=pd.to_datetime(indianMovies['date_added'])
)

In [None]:
# The 20 most recently added Indian series (newest `date_added` first).
series_newest_first = indianSeries.sort_values(by='date_added', ascending=False)
series_newest_first.head(20)

In [None]:
# The 20 most recently added Indian movies (newest `date_added` first).
movies_newest_first = indianMovies.sort_values(by='date_added', ascending=False)
movies_newest_first.head(20)