In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Netflix EDA
Performing Exploratory Data Analysis to understand the dataset

### Tasks
- Understand the dataset: column types and missing values.
- Clean the dataset and handle the missing values
- Perform data visualization
- Create final summary report

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the Netflix titles catalogue into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/netflix-shows/netflix_titles.csv')
df.head(n=5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Summary statistics; with default arguments pandas reports numeric columns only.
df.describe()

In [None]:
# Per-column dtype and non-null count overview (printed by pandas rather than returned).
df.info()

In [None]:
# Tally the null entries in each column.
df.isnull().sum()

## Adjust data types and fill in missing values
Verify data types make sense. All except release_year are object/string.

### The following do not require any fills:
* type
* title
* release_year
* listed_in
* description

### The following are missing data:
* duration
* rating 
* date_added
* cast
* country
* director
* Check data types, update types where needed and proceed


Update date_added to datetime and check

In [None]:
# Convert date_added from object to datetime.
# Surrounding whitespace is trimmed first so each value matches the "Month day, Year" pattern.
df['date_added'] = pd.to_datetime(
    df['date_added'].str.strip(),
    format='%B %d, %Y',
)

In [None]:
# Preview the first rows again to check the date_added column after the conversion.
df.head()

### Handling missing values
Nulls in rating, cast, country and director are filled with 'Unavailable'.

In [None]:
# Replace nulls in the descriptive text columns with a single placeholder label.
# The label is spelled 'Unavailable' so it matches the value written to `rating` for the
# Louis C.K. rows further down; the previous 'Unavaiable' misspelling produced two
# distinct "missing" categories in value counts and plots.
placeholder_fills = {
    column: 'Unavailable'
    for column in ('rating', 'cast', 'country', 'director')
}
df.fillna(placeholder_fills, inplace=True)

# Re-count nulls to confirm which columns still need attention.
df.isna().sum()

Nulls in date_added are replaced with the most recent date found in date_added. <br>
This is because Netflix tends to add more content over time. Other viable options would be to look up the actual dates and enter them manually, or to drop these rows, since the amount of missing data is rather small.

In [None]:
# Show the rows that still have no date_added value.
df.loc[df['date_added'].isna()]

In [None]:
# Use the latest date present in date_added as the stand-in for missing entries.
most_recent_entry_date = df['date_added'].max()
df['date_added'] = df['date_added'].fillna(most_recent_entry_date)

Spot check: show_id s6067 was previously missing date_added, so it should now show the filled-in date.

In [None]:
# Look up a single title by its show_id.
df.loc[df['show_id'] == 's6067']

## Additional data cleansing

### Durations data input error
The rows with a missing duration are all movies by Louis C.K.

Normally, we would likely fill duration with the mean duration of movies from the table. In this case it appears that the actual duration was entered into the rating column, so one solution is to move the rating data into duration and set the rating to 'Unavailable', like the other nulls.

In [None]:
# Show the rows whose duration is null.
df.loc[df['duration'].isna()]

Check that there is no other content by the same director, to avoid accidentally overwriting valid data.

In [None]:
# List (up to five) titles whose director is Louis C.K.
df.loc[df['director'] == 'Louis C.K.'].head()

Overwrite and check

In [None]:
# Move the misplaced runtime from `rating` into `duration` for the Louis C.K. titles.
# Only rows whose duration is still null are touched: this protects any Louis C.K. title
# that already has a valid duration, and makes the cell safe to re-run after `rating`
# has been reset to the placeholder (a re-run would otherwise copy the placeholder
# over the repaired durations).
needs_duration = (df['director'] == 'Louis C.K.') & df['duration'].isna()
df.loc[needs_duration, 'duration'] = df.loc[needs_duration, 'rating']

# Inspect the affected director's rows to verify the move.
df[df.director == 'Louis C.K.'].head()

Second overwrite and check

In [None]:
# Replace the rating of the Louis C.K. titles with the 'Unavailable' placeholder, then inspect them.
louis_ck_titles = df['director'].eq('Louis C.K.')
df.loc[louis_ck_titles, 'rating'] = 'Unavailable'
df.loc[louis_ck_titles].head()

# Visualizations
Let's take a look at the types of content available on Netflix.

In [None]:
# Number of titles per content type.
df['type'].value_counts()

In [None]:
# Bar chart of title counts per content type.
type_ax = sns.countplot(data=df, x='type')
type_ax.set_title('Count Vs Type of Shows')

Netflix has more movies than TV shows.

## Country Analysis

In [None]:
# Ten most frequent values in the country column.
df['country'].value_counts().iloc[:10]

In [None]:
# Horizontal bar chart of the ten most frequent country values across all titles.
top_countries = df['country'].value_counts().index[:10]
plt.figure(figsize=(12, 6))
sns.countplot(data=df, y='country', order=top_countries)
plt.title('Country Wise Content on Netflix')

In [None]:
# Split the catalogue by content type for the per-type country charts.
movie_countries = df.loc[df['type'].eq('Movie')]
tv_show_countries = df.loc[df['type'].eq('TV Show')]

In [None]:
def plot_top_countries(data, title, top_n=10):
    """Draw a horizontal count plot of the most frequent ``country`` values in ``data``.

    The ranking is computed from ``data`` itself rather than from the full catalogue,
    so a chart titled "top countries producing movies" really shows the movie ranking
    (ordering by the overall ranking can hide countries that are strong in one content
    type only).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``country`` column.
    title : str
        Title placed on the chart.
    top_n : int, default 10
        Number of countries to display.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the chart was drawn on.
    """
    ranked_countries = data['country'].value_counts().index[:top_n]
    # A fresh figure per call keeps the two charts from drawing over each other.
    plt.figure(figsize=(12, 6))
    ax = sns.countplot(y='country', order=ranked_countries, data=data)
    ax.set_title(title)
    return ax


plot_top_countries(movie_countries, 'Top 10 countries producing movies in Netflix')
plot_top_countries(tv_show_countries, 'Top 10 countries producing TV shows in Netflix');

Let's check which ratings are most common among Netflix titles.

In [None]:
# Number of titles per rating label.
df['rating'].value_counts()

In [None]:
# Bar chart of the ten most frequent rating labels.
top_ratings = df['rating'].value_counts().index[:10]
plt.figure(figsize=(9, 6))
sns.countplot(data=df, x='rating', order=top_ratings)
plt.title('Ratings of shows on Netflix vs Count')

Most of the shows have TV-MA and TV-14 ratings.

In [None]:
# Twenty release years with the most titles, highest count first.
df['release_year'].value_counts().head(20)

In [None]:
# Bar chart of the twenty release years with the most titles, ordered by count.
busiest_years = df['release_year'].value_counts().index[:20]
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='release_year', order=busiest_years)
plt.title('Content Release in Years on Netflix vs Count')

# Popular Genres Analysis

In [None]:
# Count individual genres rather than whole `listed_in` strings.
# NOTE(review): assumes `listed_in` stores a comma-separated list of genre labels per
# title — confirm against df.head(). If a value contains no comma, the split/explode
# below leaves it unchanged, so the chart then matches a plain count of `listed_in`.
genre_per_title = (
    df['listed_in']
    .str.split(',')
    .explode()
    .str.strip()               # drop the space that follows each comma
    .reset_index(drop=True)    # explode repeats index labels; give each genre row its own
    .to_frame(name='listed_in')
)
top_genres = genre_per_title['listed_in'].value_counts().index[:20]

plt.figure(figsize=(12, 8))
sns.countplot(y='listed_in', order=top_genres, data=genre_per_title)
plt.title('Top 20 Genres on Netflix')

# Summary
So far we have performed many operations on the dataset to dig out some useful information. If we have to summarise the dataset in a few lines, we can say that:
* More films than TV shows are available on Netflix.
* The United States produces the majority of the films and television shows, followed by India, which has produced the second-largest number of Netflix films.
* In comparison to previous years, Netflix released a lot of content in 2018.
* The most popular genres on Netflix are international films and dramas.