In [None]:
# --- Standard library ---
import os
import warnings

# --- Third-party ---
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns

# Apply seaborn's default theme to every figure in the notebook.
sns.set()

# Input data files live in the read-only "../input/" directory on Kaggle.
# Walk that directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# Up to 20GB can be written to the current directory (/kaggle/working/); it is
# preserved as output when a version is created with "Save & Run All".
# Temporary files can go to /kaggle/temp/, but they are not kept after the session.

# NOTE(review): this silences *all* warnings for the rest of the notebook,
# including pandas/seaborn deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw catalogue. `date_added` is parsed explicitly instead of via
# `parse_dates=`: when read_csv cannot parse every value in a column it
# returns the column unchanged as object dtype, and the later `.dt.year` /
# `.dt.month` calls would then fail. Stripping surrounding whitespace first and
# coercing anything still unparsable to NaT guarantees a datetime64 column.
netflix_df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')
netflix_df['date_added'] = pd.to_datetime(
    netflix_df['date_added'].str.strip(), errors='coerce'
)
netflix_df.head(10)

### Basic info

In [None]:
# Column names, dtypes and non-null counts for the raw frame.
netflix_df.info()

In [None]:
# List every distinct value of the `duration` column to see how it is encoded.
netflix_df['duration'].unique()

In [None]:
# Missing-value matrix for the raw frame.
msno.matrix(netflix_df)

### Let's work with one DataFrame where the director is not null, and another where we drop the director column and the remaining rows with nulls

In [None]:
# Keep only the titles that have a director recorded.
has_director = netflix_df['director'].notna()
netflix_director_df = netflix_df[has_director]

In [None]:
# Dtypes and non-null counts after filtering to rows with a director.
netflix_director_df.info()

In [None]:
# Missing-value matrix for the director-only frame.
msno.matrix(netflix_director_df)

### Dropping the cast column and some rows where Country is na

In [None]:
# Remove the `cast` column and every row without a country.
# `errors='ignore'` keeps the cell idempotent: the frame is reassigned to the
# same name, so re-running the cell would otherwise raise KeyError because
# `cast` has already been dropped.
netflix_director_df = netflix_director_df.drop(columns='cast', errors='ignore')
netflix_director_df = netflix_director_df.dropna(subset=['country'])
msno.matrix(netflix_director_df)

### Top 10 Directors

In [None]:
# Ten most frequent values of `director`, with their title counts.
director_counts = netflix_director_df['director'].value_counts()
director_counts.head(10)

In [None]:
# Bar chart of the ten directors with the most titles.
# The counts are computed once and reused for both axes; the original `x=`
# argument was missing the Series before `.value_counts()`, which is a
# SyntaxError.
top_director_counts = netflix_director_df['director'].value_counts()[:10]

plt.figure(figsize=(8,8))
ax = sns.barplot(x = top_director_counts.index, y = top_director_counts)
ax.set_title('Top 10 Directors')
ax.set_ylabel('Participation')
ax.set_xlabel('Director')
# Director names are long; rotate them so they do not overlap.
ax.tick_params(axis='x', labelrotation = 90)

In [None]:
# Names of the ten most frequent directors, then every title directed by one of them.
top_director_names = netflix_director_df['director'].value_counts().index[:10]
directed_by_top = netflix_director_df['director'].isin(top_director_names)
top_10_directors = netflix_director_df[directed_by_top]
top_10_directors

In [None]:
# Same rows, ordered by director name (display only; nothing is reassigned).
top_10_directors.sort_values(by='director')

In [None]:
# Number of titles per `type` among the top-10 directors.
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=top_10_directors, x='type', ax=ax)
plt.show()

In [None]:
# Titles per rating among the top-10 directors, most frequent rating first.
rating_order = top_10_directors['rating'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=top_10_directors, x='rating', order=rating_order, ax=ax)
plt.show()

## Most titles by the top directors are stand-up comedies

In [None]:
# Titles per `listed_in` value among the top-10 directors, most frequent first.
genre_order = top_10_directors['listed_in'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=top_10_directors, x='listed_in', order=genre_order, ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Titles per `country` value among the top-10 directors, most frequent first.
country_order = top_10_directors['country'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=top_10_directors, x='country', order=country_order, ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Add the year each title was added to the platform.
# `top_10_directors` was produced by filtering another frame, so assigning a
# column into it directly is the chained-assignment pattern pandas warns about
# (the warning is only hidden by the notebook-wide warnings filter).
# `.assign` builds a new frame instead, and re-running the cell is harmless.
top_10_directors = top_10_directors.assign(
    year_added=top_10_directors['date_added'].dt.year
)

In [None]:
# Titles added per year for the top-10 directors.
# `discrete=True` gives each integer year its own unit-wide bar centred on the
# year. With `binwidth=1` the bin edges run from the minimum to the maximum
# year, and because the last histogram bin is closed on both sides the two most
# recent years were merged into a single bar.
plt.figure(figsize=(8,8))
sns.histplot(x='year_added', data=top_10_directors, discrete=True)
plt.show()

### Now drop the director and cast columns, and the rows where country is missing

In [None]:
# Start the second working frame from the raw data, without the two sparse columns.
sparse_columns = ['director', 'cast']
netflix_df_modified = netflix_df.drop(columns=sparse_columns)

In [None]:
# Keep only rows with a country, then re-check the missing-value pattern.
has_country = netflix_df_modified['country'].notna()
netflix_df_modified = netflix_df_modified[has_country]
msno.matrix(netflix_df_modified)

In [None]:
# Split the date a title was added into separate year and month columns.
added_on = netflix_df_modified['date_added']
netflix_df_modified = netflix_df_modified.assign(
    year_added=added_on.dt.year,
    month_added=added_on.dt.month,
)

In [None]:
# Titles per `type`, most frequent first.
type_order = netflix_df_modified['type'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=netflix_df_modified, x='type', order=type_order, ax=ax)
plt.show()

In [None]:
# Titles per rating, most frequent first.
rating_order = netflix_df_modified['rating'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=netflix_df_modified, x='rating', order=rating_order, ax=ax)
plt.show()

In [None]:
# The 20 most frequent `listed_in` values and how many titles carry each.
top_genre_counts = netflix_df_modified['listed_in'].value_counts()[:20]
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(x=top_genre_counts.index, y=top_genre_counts, ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# The 20 most frequent `country` values and how many titles carry each.
top_country_counts = netflix_df_modified['country'].value_counts()[:20]
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(x=top_country_counts.index, y=top_country_counts, ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Titles added per year.
# `discrete=True` draws one unit-wide bar centred on each integer year. With
# `binwidth=1` the edges span min..max and the closed last bin merged the two
# most recent years into one bar, overstating the final year.
plt.figure(figsize=(8,8))
sns.histplot(x='year_added', data=netflix_df_modified, discrete=True)
plt.show()

In [None]:
# Titles added per calendar month (1-12).
# `discrete=True` gives each month its own bar. With `binwidth=1` the bin edges
# were 1..12, i.e. only 11 bins, and the closed last bin [11, 12] counted
# November and December together, which distorts any seasonal reading.
plt.figure(figsize=(8,8))
sns.histplot(x='month_added', data=netflix_df_modified, discrete=True)
plt.show()

In [None]:
# Separate frames for TV shows and movies, selected on the `type` column.
is_tv_show = netflix_df_modified['type'].eq('TV Show')
is_movie = netflix_df_modified['type'].eq('Movie')
netflix_df_modified_tv_shows = netflix_df_modified.loc[is_tv_show]
netflix_df_modified_movies = netflix_df_modified.loc[is_movie]

In [None]:
# How many TV shows there are for each `duration` value.
tv_duration_counts = netflix_df_modified_tv_shows['duration'].value_counts()
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(x=tv_duration_counts.index, y=tv_duration_counts, ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Turn the movie `duration` text into a number by keeping its leading digits.
# Compared with `.str.split().str[0].astype('int')` this version:
#   * is idempotent: `.astype(str)` lets the cell be re-run after the column
#     is already numeric (the `.str` accessor would raise on an int column);
#   * tolerates missing or malformed durations: they become NaN instead of
#     making the integer cast raise;
#   * uses `.assign`, so no column is written into a filtered slice of
#     `netflix_df_modified`.
# The result is int64 when every row parses, float64 if any value is missing.
netflix_df_modified_movies = netflix_df_modified_movies.assign(
    duration=pd.to_numeric(
        netflix_df_modified_movies['duration']
        .astype(str)
        .str.extract(r'(\d+)', expand=False),
        errors='coerce',
    )
)
netflix_df_modified_movies.head()

In [None]:
# Distribution of movie durations, in bins two units wide.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(data=netflix_df_modified_movies, x='duration', binwidth=2, ax=ax)
plt.show()

# In general:
* Movies outnumber TV shows by more than two to one
* Most of them are for mature audiences
* Documentaries are the most common category overall, but among the directors with the most titles, stand-up comedy stands out
* Most titles were produced in USA
* There is a positive trend in the number of titles being added to the platform in each year
* Most of the titles are added between October and December 
* The majority of TV Shows only have 1 season
* Movie durations are concentrated between roughly 100 and 120 minutes