In [None]:
import numpy as np 
import pandas as pd 
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

%matplotlib inline
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


In [None]:
# Location of the Netflix titles CSV inside the Kaggle input directory.
path = '/kaggle/input/netflix-shows/netflix_titles.csv'
# Read the CSV into a DataFrame and preview its first five rows.
df_netflix = pd.read_csv(filepath_or_buffer=path)
df_netflix.head(5)

In [None]:
# Print a concise summary of the frame: columns, dtypes and non-null counts.
df_netflix.info()

In [None]:
# Report the dataset dimensions (shape is a (rows, columns) pair).
n_rows, n_cols = df_netflix.shape
print('Number of columns:', n_cols)
print('Number of rows:', n_rows)

#### Number of missing values by column

In [None]:
# Per-column count of missing entries (`isna` is an alias of `isnull`).
missing_count = df_netflix.isna().sum()
missing_count

#### Number of duplicated rows

In [None]:
# Count rows that are exact duplicates of an earlier row.
df_netflix.duplicated().sum()

<h1><center>Data Cleansing</center></h1> 

#### Proportion of missing values in each field

In [None]:
# Build a table of missing values per column: absolute count and percentage
# of all rows, both ordered from most to least affected.
rows_per_column = df_netflix.isnull().count()
total_null = missing_count.sort_values(ascending=False)
perc = (missing_count / rows_per_column * 100).sort_values(ascending=False)
total = pd.concat(
    {'Total null values': total_null, 'Percentage of null values': perc},
    axis=1,
)
# Transpose so each field is a column, then render in-cell bars.
total.T.style.bar()

With missing values exceeding 30%, removing every row that contains empty data would be an inappropriate way to handle them. However, all columns that contain missing values hold categorical data, so I will replace all NAs with a placeholder value to set them apart.

In [None]:
# Collect the DataFrame's column labels and display them.
col = df_netflix.columns
col

In [None]:
# Fill every missing value with an explicit placeholder category.
#
# The filled frame is assigned back instead of calling
# `df_netflix[column].fillna(value, inplace=True)` per column: that form mutates
# a temporary selected from the frame, which emits a chained-assignment warning
# on recent pandas and silently leaves `df_netflix` unchanged under Copy-on-Write.
value = 'Not Recorded'
df_netflix = df_netflix.fillna(value)

# Sanity check: every column should now report zero missing values.
df_netflix.isnull().sum()

In [None]:
# Preview the first rows of df_netflix.
df_netflix.head()

<h1><center>Exploratory Analysis</center></h1>

## Content Types

#### Distribution of content Types

In [None]:
# Histogram of how many titles fall under each value of the 'type' column.
content_type = df_netflix['type']
fig = px.histogram(content_type)
fig.update_layout(title_x=0.5, title='Distribution of content types')

It appears that Netflix's movies outnumber its TV shows in terms of number of productions.

#### Content types' trend

In [None]:
# Cross-tabulate content type against release year (with row/column totals)
# and shade the cells so denser years stand out.
type_by_year = pd.crosstab(
    index=df_netflix['type'],
    columns=df_netflix['release_year'],
    margins=True,
)
type_by_year.style.background_gradient(cmap='Greens')

However, the number of Netflix TV shows has tended to surpass the number of movies in the two most recent years.

#### Total content production over the years

In [None]:
# Histogram of titles by release year (nbins=30), with a readable x-axis label.
release_years = df_netflix['release_year']
fig = px.histogram(release_years, labels={'value': 'Released Year'}, nbins=30)
fig.update_layout(title_x=0.5, title='Content Production Over Time')

## Directors

#### Top 20 directors of movies

In [None]:
# Count movies per director.
df_netflix_movies = df_netflix[df_netflix['type'] == 'Movie']

# Name the resulting columns explicitly. The names produced by
# `pd.DataFrame(value_counts()).reset_index()` differ between pandas 1.x
# ('index' / 'director') and pandas 2.x ('director' / 'count'), so relying on
# them raises KeyError on one side or the other.
# NOTE(review): 21 rows are kept although the title says "Top 20" - presumably
# to leave room for the 'Not Recorded' placeholder bar; confirm.
director_movies = (
    df_netflix_movies['director']
    .value_counts()
    .rename_axis('Directors')
    .reset_index(name='Frequency')
    .head(21)
)

# Column names double as axis titles, so no `labels` mapping is needed (the
# previous mapping used the key 'splitted' and never labelled the y axis).
fig = px.bar(director_movies, x='Directors', y='Frequency')
fig.update_layout(title='Top 20 Movie Directors', title_x=0.5)

163 movies had no director recorded, while the Director field has 2389 empty values in total. This means that a large share of the missing director values belongs to TV shows. Therefore, I will exclude those values from the next visualization to avoid any bias.

#### Top 20 directors of TV Shows

In [None]:
# Count TV shows per director, ignoring titles whose director is the
# 'Not Recorded' placeholder.
df_netflix_shows = df_netflix[df_netflix['type'] == 'TV Show']
director_shows_recorded = df_netflix_shows[df_netflix_shows['director'] != 'Not Recorded']

# Name the columns explicitly: the names produced by
# `pd.DataFrame(value_counts()).reset_index()` changed between pandas 1.x and
# 2.x, so the old 'index' / 'director' lookups raise KeyError on pandas 2.x.
director_shows = (
    director_shows_recorded['director']
    .value_counts()
    .rename_axis('Directors')
    .reset_index(name='Frequency')
    .head(20)
)

# Column names double as axis titles.
fig = px.bar(director_shows, x='Directors', y='Frequency')
fig.update_layout(title='Top 20 TV Show Directors', title_x=0.5)

## Cast

In [None]:
# Keep only titles with a recorded cast, then produce one row per cast member.
has_cast = df_netflix['cast'] != 'Not Recorded'
df_cast_recorded = df_netflix[has_cast]

# Split the comma-separated cast string into a list and explode it into rows.
cast_lists = df_cast_recorded['cast'].str.split(',')
df_cast = (
    df_cast_recorded
    .assign(var1=cast_lists)
    .explode('var1')
    .reset_index(drop=True)
)

# Drop the leading space left after each comma.
df_cast['splitted'] = df_cast['var1'].str.lstrip()
df_cast.head()

In [None]:
# Count appearances per cast member. Columns are named explicitly because the
# names produced by `pd.DataFrame(value_counts()).reset_index()` changed in
# pandas 2.0: there, sorting by 'splitted' would order by name instead of by
# count, and the 'index' lookup would raise KeyError.
cast = (
    df_cast['splitted']
    .value_counts()
    .rename_axis('Cast')
    .reset_index(name='Frequency')
)

# value_counts() is already sorted by descending frequency: take the top 20 and
# reverse them so the most frequent name is drawn at the top of the
# horizontal bar chart.
cast_sorted = cast.head(20).iloc[::-1]

fig = px.bar(cast_sorted, x='Frequency', y='Cast')
fig.update_layout(title='Top 20 Movie/TV Show Actors', title_x=0.5)


## Country

In [None]:
# Titles can list several co-producing countries in one comma-separated
# string; keep rows with a recorded country and emit one row per country.
has_country = df_netflix['country'] != 'Not Recorded'
df_country_recorded = df_netflix[has_country]

country_lists = df_country_recorded['country'].str.split(',')
df_country = (
    df_country_recorded
    .assign(var1=country_lists)
    .explode('var1')
    .reset_index(drop=True)
)

# Drop the leading space left after each comma.
df_country['splitted'] = df_country['var1'].str.lstrip()
df_country.head()

In [None]:
# Count titles per country. Columns are named explicitly because the names
# produced by `pd.DataFrame(value_counts()).reset_index()` changed in pandas 2.0
# and the old 'index' / 'splitted' lookups no longer resolve there.
country = (
    df_country['splitted']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Frequency')
)

# value_counts() is sorted by descending frequency: keep the 20 most frequent
# countries and reverse them so the x axis runs from least to most frequent.
country_sorted = country.head(20).iloc[::-1]

fig = go.Figure(data=[go.Scatter(
    x=country_sorted['Country'],
    y=country_sorted['Frequency'],
    mode='markers',
    marker=dict(
        # Colour encodes the frequency so the colour bar is meaningful and the
        # figure is reproducible (previously: 200 unseeded random values for
        # 20 points).
        color=country_sorted['Frequency'],
        # Scale counts down so marker sizes stay readable.
        size=country_sorted['Frequency'] * 0.1,
        showscale=True,
    ),
)])
fig.update_layout(
    title='Top 20 Most Recurred Countries',
    title_x=0.5,
    xaxis_title="Country",
    yaxis_title="Frequency",
    template='plotly_white',
)
fig.show()

## Age Ratings

In [None]:
# Count titles per age rating. Columns are named explicitly because the names
# produced by `pd.DataFrame(value_counts()).reset_index()` changed in pandas 2.0
# and the old 'index' / 'rating' lookups no longer resolve there.
ratings = (
    df_netflix['rating']
    .value_counts()
    .rename_axis('Rating')
    .reset_index(name='Count')
)

# One tile per rating, sized by the number of titles carrying it.
fig = px.treemap(
    ratings,
    path=['Rating'],
    values='Count',
    height=700,
    title='Age Ratings Distribution',
    color_discrete_sequence=px.colors.qualitative.Dark2,
)
# Show the rating label together with its count inside each tile.
fig.data[0].textinfo = 'label+text+value'
fig.update_layout(title_x=0.5)
fig.show()


## Duration

#### Duration of movies

In [None]:
df_netflix_movies = df_netflix[df_netflix['type'] == 'Movie']

# A box plot needs numeric values, but 'duration' is not purely numeric: it can
# at least hold the 'Not Recorded' placeholder.
# NOTE(review): values are presumably text such as '90 min' - confirm against
# the CSV. The leading integer is extracted; anything without one becomes NaN
# and is left out of the plot.
movie_minutes = pd.to_numeric(
    df_netflix_movies['duration'].astype(str).str.extract(r'(\d+)', expand=False),
    errors='coerce',
)

fig = px.box(movie_minutes.to_frame('Duration (minutes)'), y='Duration (minutes)')
fig.update_layout(title='Box Plot of content duration (Movie)', title_x=0.5)

In [None]:
# Count movies per distinct duration value. Columns are named explicitly
# because the names produced by `pd.DataFrame(value_counts()).reset_index()`
# changed in pandas 2.0 and the old 'index' / 'duration' lookups no longer
# resolve there.
duration_movies = (
    df_netflix_movies['duration']
    .value_counts()
    .rename_axis('Duration')
    .reset_index(name='Frequency')
)

# Column names double as axis titles.
fig = px.bar(duration_movies, x='Duration', y='Frequency')
fig.update_layout(title='Movie duration distribution', title_x=0.5)

#### Duration of TV Shows

In [None]:
df_netflix_shows = df_netflix[df_netflix['type'] == 'TV Show']

# A box plot needs numeric values, but 'duration' is not purely numeric: it can
# at least hold the 'Not Recorded' placeholder.
# NOTE(review): values are presumably text such as '2 Seasons' - confirm
# against the CSV. The leading integer is extracted; anything without one
# becomes NaN and is left out of the plot.
show_seasons = pd.to_numeric(
    df_netflix_shows['duration'].astype(str).str.extract(r'(\d+)', expand=False),
    errors='coerce',
)

fig = px.box(show_seasons.to_frame('Duration (seasons)'), y='Duration (seasons)')
fig.update_layout(title='Box Plot of content duration (TV Show)', title_x=0.5)

In [None]:
# Count TV shows per distinct duration value. Columns are named explicitly
# because the names produced by `pd.DataFrame(value_counts()).reset_index()`
# changed in pandas 2.0 and the old 'index' / 'duration' lookups no longer
# resolve there.
duration_shows = (
    df_netflix_shows['duration']
    .value_counts()
    .rename_axis('Duration')
    .reset_index(name='Frequency')
)

# Column names double as axis titles.
fig = px.bar(duration_shows, x='Duration', y='Frequency')
fig.update_layout(title='TV Show duration distribution', title_x=0.5)

## Genres

In [None]:
# Each title lists its genres as one comma-separated string in 'listed_in';
# split it and emit one row per genre.
genre_lists = df_netflix['listed_in'].str.split(',')
df_genres = (
    df_netflix
    .assign(var1=genre_lists)
    .explode('var1')
    .reset_index(drop=True)
)

# Drop the leading space left after each comma.
df_genres['splitted'] = df_genres['var1'].str.lstrip()
df_genres.head()

In [None]:
# Count titles per genre. Columns are named explicitly because the names
# produced by `pd.DataFrame(value_counts()).reset_index()` changed in pandas 2.0:
# there, sorting by 'splitted' would order by name instead of by count, and the
# 'index' lookup would raise KeyError.
genres = (
    df_genres['splitted']
    .value_counts()
    .rename_axis('Genres')
    .reset_index(name='Frequency')
)

# value_counts() is sorted by descending frequency; reverse it so the most
# frequent genre is drawn at the top of the horizontal bar chart.
genres_sorted = genres.iloc[::-1]

fig = px.bar(genres_sorted, x='Frequency', y='Genres')
fig.update_layout(title='Top Genres', title_x=0.5)