# We import the relevant libraries, load the data, inspect the data types and look for any missing values.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as mlb
import missingno as msno

In [None]:
# Load the Netflix titles CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../input/netflix-shows/netflix_titles.csv")

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Show the column labels of the DataFrame.
df.columns

In [None]:
# Summary statistics; with default arguments pandas describes only the
# numeric columns of a mixed-dtype frame.
df.describe()

In [None]:
# Heatmap of the boolean null mask: highlighted cells are missing values.
mlb.figure(figsize=(16, 8))
sns.heatmap(data=df.isnull(), cmap='magma', cbar=True)

In [None]:
# missingno bar chart of the frame, drawn before any cleaning.
msno.bar(df=df, color='red')

# The director, cast and country columns have missing values. We will handle each column in turn to clean the data.

In [None]:
# Keep existing director values; put the placeholder 'Unknown' where the value is missing.
df['director'] = df['director'].where(df['director'].notna(), 'Unknown')

In [None]:
# Keep existing cast values; put the placeholder 'Unknown' where the value is missing.
df['cast'] = df['cast'].where(df['cast'].notna(), 'Unknown')

In [None]:
# Keep existing country values; put the placeholder 'Unknown' where the value is missing.
df['country'] = df['country'].where(df['country'].notna(), 'Unknown')

In [None]:
import datetime

In [None]:
# Fill missing 'date_added' entries with the same 'Unknown' placeholder used for
# the other text columns. Substituting today's date would record a fabricated
# "added on" date that changes on every run, and would mix date objects into a
# column that read_csv loaded as plain strings.
df['date_added'] = df['date_added'].fillna('Unknown')

# No sensible imputation logic was available, so the missing values were filled with placeholders instead of dropping the rows.

In [None]:
# Same missingno bar chart, redrawn after the fills to confirm nothing is missing.
msno.bar(df=df, color='blue')

# Now we will analyze data 

# In this first visual, we group the data by release year to see how many of the shows on Netflix were released in each year.

* To keep the chart readable, only the top 15 release years are taken into account.

In [None]:
# Count rows per release year and keep the 15 years with the highest counts.
RL = (
    df.groupby('release_year')
    .count()
    .reset_index()
    .sort_values(by='show_id', ascending=False)
    .iloc[:15]
)

In [None]:
# Display the per-year counts table built above.
RL

In [None]:
# Bar chart of title counts for the top release years, kept in ranked order.
mlb.figure(figsize=(13, 8))
sns.barplot(
    data=RL,
    x='release_year',
    y='show_id',
    order=RL['release_year'],
    palette='BuGn_r',
)
sns.despine(left=True)

# Next, we analyze what types of content are available on Netflix and in what quantity

In [None]:
# Number of titles per content type.
mlb.figure(figsize=(10, 6))
sns.countplot(data=df, x='type')
sns.despine(left=True)
mlb.xlabel(xlabel='Type of Shows')
mlb.ylabel(ylabel='Count of Shows')
mlb.title(label='Total Show count according to type')

# Here we analyze which ratings were released in which years. The box plots tell us more than just the average release year.

In [None]:
# Distribution of release years for each rating.
mlb.figure(figsize=(13, 8))
sns.boxplot(data=df, x='rating', y='release_year', palette='BuGn')
sns.despine(left=True)
# Restrict the y-axis to 1990-2020.
mlb.ylim(1990, 2020)

# We analyze on which dates the highest number of shows were added to Netflix

In [None]:
# Count rows per 'date_added' value and keep the 25 dates with the highest counts.
DA = (
    df.groupby('date_added')
    .count()
    .reset_index()
    .sort_values(by='type', ascending=False)
    .iloc[:25]
)

In [None]:
# Bar chart of how many titles were added on each of the busiest dates.
mlb.figure(figsize=(14, 10))
g = sns.barplot(data=DA, x='date_added', y='type', palette='BuGn_r')
sns.despine(left=True)
mlb.ylabel('Count of Shows')
mlb.title('Count of Shows by Date Added')
# Rotate the date labels on the x-axis by 60 degrees.
mlb.xticks(rotation=60)

# Box plot of movies vs. TV shows, zoomed in on the years 2000–2020. TV shows tend to have a more recent release_year, which means the TV shows on Netflix were mostly released in recent years.

In [None]:
# Release-year distribution for each content type.
mlb.figure(figsize=(12, 8))
sns.boxplot(data=df, x='type', y='release_year')
sns.despine(left=True)
mlb.title('Type of Show by Release Date')
# Restrict the y-axis to 2000-2020.
mlb.ylim(2000, 2020)

# Now we look at the most common movie durations

In [None]:
# Keep only the text before the first space of each duration (presumably the
# leading number of values such as "90 min" or "2 Seasons" - verify against the data).
# The vectorised .str accessor propagates missing values as NaN, whereas calling
# x.split on a NaN (a float) inside apply() raises AttributeError.
df['duration'] = df['duration'].str.split(' ').str[0]

In [None]:
# Convert the extracted duration text to floating-point numbers.
df['duration'] = df['duration'].astype('float64')

In [None]:
# Split the frame by content type.
df_movie = df.loc[df['type'] == 'Movie']
df_tvshows = df.loc[df['type'] == 'TV Show']

In [None]:
# Count movies per duration value and keep the 25 most frequent durations.
df_md = (
    df_movie.groupby('duration')
    .count()
    .reset_index()
    .sort_values(by='type', ascending=False)
    .iloc[:25]
)

# Bar chart of those counts, ordered from most to least frequent duration.
mlb.figure(figsize=(16, 8))
sns.barplot(
    x='duration',
    y='show_id',
    data=df_md,
    order=df_md['duration'],
    palette='RdBu',
)
sns.despine(left=True)
mlb.ylabel('Count of Movies')
mlb.xlabel('Duration of Movies (In Mins)')
mlb.title('Count of Movies by Duration')

# The same for TV shows: count of TV shows by number of seasons

In [None]:
# Count TV shows per duration value and keep the 15 most frequent values.
df_td = (
    df_tvshows.groupby('duration')
    .count()
    .reset_index()
    .sort_values(by='type', ascending=False)
    .iloc[:15]
)

# Bar chart of those counts, ordered from most to least frequent.
mlb.figure(figsize=(12, 8))
sns.barplot(
    x='duration',
    y='show_id',
    data=df_td,
    order=df_td['duration'],
    palette='RdBu',
)
sns.despine(left=True)
mlb.ylabel('Count of TV Shows')
mlb.xlabel('Number of Seasons')
mlb.title('Count of Tv Shows by Number of Seasons')

# Comparing release year against duration to see whether duration has changed over the years

In [None]:
# Average duration per release year, separately for movies and TV shows.
# Using count() here would make the 'duration' column hold the number of titles
# per year, so the scatter plots that label their y-axis "Duration ..." would
# show title counts instead of durations.
# Requires 'duration' to be numeric already (it is cast to float earlier in the notebook).
df_mr = df[df['type'] == 'Movie'].groupby('release_year')['duration'].mean().reset_index()
df_tr = df[df['type'] == 'TV Show'].groupby('release_year')['duration'].mean().reset_index()

In [None]:
# Two side-by-side scatter plots of 'duration' against 'release_year':
# movies on the left axis, TV shows on the right.
fig, ax = mlb.subplots(nrows=1, ncols=2, figsize=(12, 6))
g = sns.scatterplot(data=df_mr, x='release_year', y='duration', color='green', ax=ax[0])
s = sns.scatterplot(data=df_tr, x='release_year', y='duration', color='red', ax=ax[1])
sns.despine(left=True)

# Left panel (movies).
g.set_ylabel('Duration in Minutes')
g.set_title('For Movies')

# Right panel (TV shows).
s.set_ylabel('Duration in Number of Seasons')
s.set_title('For TV Shows')

# Top 15 countries with the highest number of titles over the years.

In [None]:
# Names of the 15 country values with the most titles.
# Rows whose country is the 'Unknown' placeholder (written in for missing values
# during cleaning) are excluded first, so the placeholder cannot take a slot in
# a ranking of real countries.
top_15 = (
    df[df['country'] != 'Unknown']
    .groupby('country')
    .count()
    .reset_index()
    .sort_values(ascending=False, by='show_id')[0:15]['country']
)

In [None]:
# Keep only the rows whose country is one of the top-ranked values.
top_15_df = df.loc[df['country'].isin(top_15)]

In [None]:
# Strip plot of release years for each of the top countries.
sns.set_style('darkgrid')
mlb.figure(figsize=(14, 8))
sns.stripplot(data=top_15_df, x='release_year', y='country', palette='RdBu')
sns.despine(left=True)
mlb.xticks(rotation=60)
# Restrict the x-axis to 1970-2020.
mlb.xlim(1970, 2020)

<img src="https://media.giphy.com/media/XbxZ41fWLeRECPsGIJ/giphy.gif">

# If you liked the visuals, please like the notebook and leave a comment. Also, follow me on Kaggle for more content!