In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the Netflix titles dataset (Kaggle input directory).
NETFLIX_CSV = '/kaggle/input/netflix-shows/netflix_titles.csv'

# Load the raw catalogue and preview the first rows.
df = pd.read_csv(NETFLIX_CSV)
df.head()

## Understanding the Data

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Distinct values per column, expressed as a percentage of that column's
# non-null entries (rounded to two decimals).
unique_pct = df.nunique() / df.count() * 100
unique_pct.round(2)

In [None]:
# Compare how many titles are Movies vs. TV Shows.
#
# The column is passed by keyword (`x=` plus `data=`) rather than positionally:
# seaborn 0.11 deprecated positional data arguments, and later releases treat
# the first positional argument as `data`, so `sns.countplot(df['type'])` no
# longer reliably means "count the values of this column".
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(x='type', data=df, ax=ax)
ax.set_title('Comparison between TV Shows and Movies')
plt.show()

In [None]:
# Bucket each title's release year into a decade.
# Bin edges run 1920, 1930, ..., 2020. pd.cut builds right-closed intervals,
# so a bucket such as (2010, 2020] covers the years 2011-2020.
# NOTE(review): years outside (1920, 2020] fall in no bin and become NaN -
# confirm the dataset has no such release years, or widen the edges.
decade_edges = list(range(1920, 2021, 10))
df['decade'] = pd.cut(x=df['release_year'], bins=decade_edges)

#### As the counts below show, most of the content was released in the last 10 years

In [None]:
# Group the catalogue by decade bucket and count the titles in each one.
groupByDecade = df.groupby(by=['decade'])
titles_per_decade = groupByDecade['show_id'].count()
titles_per_decade

In [None]:
# Keep only the movies. `.copy()` makes `df_movies` an independent frame, so
# columns can be modified on it later without writing through a slice of `df`
# (which is what triggers pandas' SettingWithCopyWarning).
df_movies = df[df['type'] == 'Movie'].copy()
contentMovieByYear = df_movies.groupby(['release_year'])

# Count movies per release year and keep the ten busiest years, busiest first.
movies_per_year = (
    contentMovieByYear['show_id']
    .count()
    .sort_values(ascending=False)
    .head(10)
)

# Draw on an explicit Axes so the title and labels are tied to this figure.
fig, ax = plt.subplots()
movies_per_year.plot(kind='bar', ax=ax)
ax.set_title('Top 10 years releasing the most movies')
ax.set_ylabel('Number of movies')
plt.show()

In [None]:
# Restrict the catalogue to TV shows and group them by release year.
is_tv_show = df['type'] == 'TV Show'
df_shows = df[is_tv_show]
contentShowByYear = df_shows.groupby(['release_year'])

# Ten release years with the most TV shows, busiest first.
shows_per_year = (
    contentShowByYear['show_id']
    .count()
    .sort_values(ascending=False)
    .head(10)
)

plt.title('Top 10 years releasing the most TV Shows')
shows_per_year.plot.bar()
plt.show()

In [None]:
# Five `country` values with the most movies.
# The raw `country` string is used as the grouping key as-is.
top_contri = (
    df_movies.groupby(['country'])['show_id']
    .count()
    .sort_values(ascending=False)
    .head(5)
)
print(top_contri)

# Turn the Series into a two-column frame (country, count) for plotting.
top_contri = top_contri.reset_index(name='count')
top_contri

In [None]:
# Bar chart of the movie counts held in `top_contri`.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title('Top Countries Producing the Movies')
sns.barplot(data=top_contri, x='country', y='count', ax=ax)
plt.show()

In [None]:
# Five `country` values with the most TV shows, as a (country, count) frame.
top_contri_shows = (
    df_shows.groupby(['country'])['show_id']
    .count()
    .sort_values(ascending=False)
    .head(5)
    .reset_index(name='count')
)
top_contri_shows

In [None]:
# Bar chart of the TV-show counts held in `top_contri_shows`.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title('Top Countries Producing the TV Shows')
sns.barplot(data=top_contri_shows, x='country', y='count', ax=ax)
plt.show()

1. The USA is the top country producing TV Shows.
2. Japan and South Korea are also among the top 5 countries producing TV Shows.

### Exploring the director

In [None]:
# Work on a copy so the original frame keeps its raw `director` strings.
director_df = df.copy()

# Turn each comma-separated credit into a list of individual names.
# The pattern consumes the whitespace around every comma, so "A, B" becomes
# ["A", "B"] instead of ["A", " B"]; otherwise " B" and "B" would be counted
# as two different directors once the lists are exploded. The outer
# `.str.strip()` still removes whitespace at the ends of the whole string.
# (A multi-character pattern is treated as a regular expression by default.)
director_df['director'] = (
    director_df['director']
    .str.strip()
    .str.split(r'\s*,\s*')
)

In [None]:
# Expand the list-valued `director` column into one row per list element;
# the other columns are repeated for each resulting row.
director_df = director_df.explode(column='director')

In [None]:
# Movies only: count titles per `director` value and keep the eight largest,
# returned as a (director, count) frame.
movie_credits = director_df[director_df['type'] == 'Movie']
top_dir = (
    movie_credits.groupby(['director'])['show_id']
    .count()
    .sort_values(ascending=False)
    .head(8)
    .reset_index(name='count')
)

In [None]:
# Bar chart of the movie counts held in `top_dir`.
fig, ax = plt.subplots(figsize=(14, 6))
ax.set_title('Top Movie Producing Directors')
sns.barplot(data=top_dir, x='director', y='count', ax=ax)
plt.show()

### Exploring the Duration of the Movies

In [None]:
## Removing the min part
df_movies.duration = df_movies.duration.str[0:-4]
df_movies.duration = df_movies.duration.astype('float')
df_movies.head()


In [None]:
# Per `country` value: number of non-null movie durations and their mean.
# Keep the ten values with the highest count and label the mean column
# for display.
top_movies = (
    df_movies.groupby(['country'])['duration']
    .agg(['count', 'mean'])
    .sort_values(by='count', ascending=False)
    .head(10)
    .reset_index()
    .rename(columns={'mean': 'Average Duration'})
)
top_movies

In [None]:
# Bar chart of the mean movie duration for each country in `top_movies`.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Average Duration of the Top 10 Countries producing Movies')
sns.barplot(data=top_movies, x='country', y='Average Duration', ax=ax)
plt.show()

1. Indian movies are, on average, longer than movies from the other countries.