In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly
plotly.offline.init_notebook_mode(connected = True)
from collections import Counter

## Data Loading and Cleaning

In [None]:
# Load the Netflix titles CSV from the Kaggle input directory and preview the first rows.
show=pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')
show.head()

In [None]:
# Share of missing values in each column, as a percentage of the number of rows
null_counts=show.isnull().sum()
null_counts/len(show)*100.0

***'director'***, ***'cast'***, ***'country'***, ***'date_added'*** *and* ***'rating'*** *contain missing values. These are treated as* ***Missing Completely at Random (MCAR)***, *as there appears to be no relation among these variables or between any of them and the other variables.*

*Since the missing values in the* **'director'** *column are numerous* ***(approx. 31%)***, *dropping those rows would discard a lot of valid information, so they are kept and filled with a placeholder instead.
Rows with missing values in the other columns can be dropped, as this will not have much effect on the overall distribution.*

In [None]:
# A missing director is replaced by the placeholder 'Unknown';
# any row that still has a missing value in another column is then dropped.
show=show.fillna({'director':'Unknown'}).dropna()

In [None]:
# Count the missing values left in each column.
show.isnull().sum()

In [None]:
# Split date_added (expected to look like "September 9, 2019") into year, month name
# and day, then drop date_added together with show_id.
# Surrounding whitespace is stripped first: for a value with a leading space the first
# space-separated token is an empty string, which would be stored as the month.
added=show['date_added'].str.strip()
show['Added_Year']=added.str.split(', ').str[-1]
show['Added_Month']=added.str.split(' ').str[0]
# The second token is the day followed by a comma, e.g. "9,"
show['Added_Day']=added.str.split(' ').str[1].str.split(',').str[0]
show=show.drop(columns=['show_id','date_added'])

In [None]:
# Preview the first rows of the frame.
show.head()

In [None]:
# Print column dtypes and non-null counts.
show.info()

***'Added_Year'*** *and* ***'Added_Day'*** *are of incorrect format.*

In [None]:
# Convert the year and day columns to integers in a single call
show=show.astype({'Added_Year':int,'Added_Day':int})

In [None]:
# Print column dtypes and non-null counts again to check the integer conversion.
show.info()

## Does netflix contain more TV shows or Movies??

In [None]:
# Bar chart of the number of titles per value of the type column
sns.set(style='darkgrid')
fig,ax=plt.subplots(figsize=(10,5))
sns.countplot(data=show,x='type',ax=ax)

*Here it can be seen that netflix has more significant number of movies than TV shows.*

## What is the majority audience in netflix??

In [None]:
# Horizontal bar chart of the number of titles per rating
fig,ax=plt.subplots(figsize=(10,10))
sns.countplot(data=show,y='rating',ax=ax)

*Most of the content listed on netflix is* ***TV-MA rated i.e. Mature Audience and only suitable for age above 18.***

*Followed by* ***TV-14 rating i.e. Parents Strongly Cautioned and only suitable for age 14 and above***.

*Least content is of* ***G rating i.e. General Audiences and suitable for all ages.***

*TV-MA and TV-14 are ratings from the TV Parental Guidelines, while G is a film rating assigned by the Motion Picture Association (MPA, formerly MPAA).*

## Which country produces maximum content?

In [None]:
# Ten most frequent values of the country column, as a percentage of all titles
country_share=show['country'].value_counts(normalize=True)
countries_count_percent=country_share.sort_values(ascending=False).iloc[:10]
c_df=(countries_count_percent*100).to_frame()
c_df

***United States produces approx 34% content in netflix followed by India(approx 13.5%) and UK(approx 5%)*** *amongst the top ten content producing countries on netflix.*

## How long after release is content added to Netflix?

In [None]:
# Absolute gap in years between release and addition, then the ten most common gaps in percent
show['show_diff']=(show['Added_Year']-show['release_year']).abs()
gap_share=show['show_diff'].value_counts(normalize=True)
show_diff_percent=gap_share.sort_values(ascending=False).iloc[:10]
show_diff_percent*100

***34% of TV shows or movies are either released on Netflix or they were added months after their release in the same year.***

***Followed by 18% of shows or movies are added on netflix after 1 year of their release.***

## In which year maximum content is released?

In [None]:
# Fifteen release years with the most titles
fig,ax=plt.subplots(figsize=(10,10))
blue_shades=sns.light_palette(color='blue',n_colors=16,reverse=True)
busiest_years=show['release_year'].value_counts().index[:15]
sns.countplot(data=show,y='release_year',order=busiest_years,palette=blue_shades,ax=ax)
ax.set_ylabel("Year of Release")

*Among the titles in the catalogue, **2018** is the release year with the most content.*

## In which year maximum content is added on netflix?

In [None]:
# Fifteen years in which the most titles were added
fig,ax=plt.subplots(figsize=(10,10))
red_shades=sns.light_palette(color='red',n_colors=16,reverse=True)
busiest_years=show['Added_Year'].value_counts().index[:15]
sns.countplot(data=show,y='Added_Year',order=busiest_years,palette=red_shades,ax=ax)
ax.set_ylabel('Year of Addition on Netflix')

*In* ***2019*** *maximum content were added in netflix.*

## Popular Genres in Netflix

In [None]:
# Ten most frequent listed_in values (grouped on the raw column value)
genre_counts=show.groupby('listed_in')['type'].count()
genre=genre_counts.sort_values(ascending=False).index[:10]
fig,ax=plt.subplots(figsize=(10,5))
sns.countplot(data=show,y='listed_in',order=genre,palette='Set3',ax=ax)
ax.set_ylabel('Genre')

*Most of the content on Netflix is listed under* ***Drama and International Movies***, *followed by* ***Stand-Up Comedies.*** *Note that this reflects what is in the catalogue, not what the audience actually watches.*

## Popular Directors 

In [None]:
# Top 20 directors: leave out the 'Unknown' placeholder, split joint credits on ', '
# and count how often each name occurs
known_directors=show.loc[show['director']!='Unknown','director']
director_names=', '.join(known_directors).split(', ')
director_frequency=Counter(director_names)
print("Top 20 directors globally in Netflix:")
director_frequency.most_common(20)

***Raul Campos*** *and* ***Jan Suter*** *have directed the most content on Netflix among the popular directors.*

## Popular Actors

In [None]:
# Top 20 actors: split every cast entry on ', ' and count how often each name occurs
cast_entries=show['cast'].dropna()
actor_names=', '.join(cast_entries).split(', ')
actor_frequency=Counter(actor_names)
print("Top 20 most popular actors globally in Netflix:")
actor_frequency.most_common(20)

***Anupam Kher*** *and* ***Shah Rukh Khan*** *are the most frequently appeared actors globally in netflix.*

## MOVIES ANALYSIS

## How long after release are movies added to Netflix?

In [None]:
#creating a separate movie dataset
# .copy() makes this an independent frame: later cells assign columns to `movies`,
# which on a plain boolean-filtered slice of `show` raises SettingWithCopyWarning.
movies=show[show['type']=='Movie'].copy()

In [None]:
# Absolute gap in years between a movie's release and its addition, then the ten most
# common gaps in percent.
# assign() builds a new frame instead of writing a column onto a filtered slice of `show`,
# which avoids SettingWithCopyWarning.
movies=movies.assign(show_diff=(movies['Added_Year']-movies['release_year']).abs())
mov_show_diff_percent=movies['show_diff'].value_counts(normalize=True).sort_values(ascending=False).iloc[:10]
mov_show_diff_percent*100

***28% of movies are either released on Netflix or they were added months after their release in the same year.***

***Followed by 19.55% of movies are added on netflix after 1 year of their release.***

In [None]:
#removing 'min' from each values of duration column
# The ' min' suffix is removed outright (literal match, no regex) so the remaining text is a
# clean integer string; assign() avoids writing a column onto a filtered slice of `show`.
movies=movies.assign(duration=movies['duration'].str.replace(' min','',regex=False).astype(int))

In [None]:
# Preview the first rows of the movies frame.
movies.head()

## Do people like to spend long hours watching movies??

In [None]:
# Histogram of movie durations in 20 bins, with an x tick every 20 units
fig,ax=plt.subplots(figsize=(10,5))
sns.histplot(data=movies,x='duration',bins=20,ax=ax)
ax.set_xlabel('Duration')
plt.xticks(range(0,300,20))

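*Most movies on Netflix run for **80-120 mins**, and movies close to 3 hrs long are rare. This is consistent with the idea that most of the audience can't watch a movie for 3 hrs in a single sitting, although the data only shows what is in the catalogue, not viewing behaviour.*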

## Movies having Longest Running Time

In [None]:
# Ten movies with the largest duration value
fig,ax=plt.subplots(figsize=(10,5))
longest_movies=movies.sort_values('duration',ascending=False).head(10)
sns.barplot(data=longest_movies,x='duration',y='title',ax=ax)
ax.set_xlabel('Duration (in mins)')

***Black Mirror: Bandersnatch ,The School of Mischief and No Longer Kids*** *are amongst the movies having longest running time.*

 ## In which year maximum movies were released?

In [None]:
# Fifteen release years with the most movies
fig,ax=plt.subplots(figsize=(10,10))
brown_shades=sns.light_palette(color='brown',n_colors=16,reverse=True)
busiest_years=movies['release_year'].value_counts().index[:15]
sns.countplot(data=movies,y='release_year',order=busiest_years,palette=brown_shades,ax=ax)
ax.set_ylabel("Year of Release")

*In* ***2017*** *maximum movies were released.*

## In which year maximum movies were added on netflix?

In [None]:
# Fifteen years in which the most movies were added
fig,ax=plt.subplots(figsize=(10,10))
orange_shades=sns.light_palette(color='orange',n_colors=16,reverse=True)
busiest_years=movies['Added_Year'].value_counts().index[:15]
sns.countplot(data=movies,y='Added_Year',order=busiest_years,palette=orange_shades,ax=ax)
ax.set_ylabel('Year of Addition on Netflix')

*In* ***2019*** *maximum movies were added on netflix.*

## Top 10 Movie producing countries on netflix

In [None]:
# Ten most frequent country values among movies
movie_country_counts=movies.groupby('country')['type'].count()
movies_max=movie_country_counts.sort_values(ascending=False).index[:10]
fig,ax=plt.subplots(figsize=(10,5))
sns.countplot(data=movies,y='country',order=movies_max,palette='Set2',ax=ax)

***United States and India*** *produces majority of movies on Netflix.*

## Popular Genres in Movies

In [None]:
# Ten most frequent listed_in values among movies
movie_genre_counts=movies.groupby('listed_in')['type'].count()
mov_genre=movie_genre_counts.sort_values(ascending=False).index[:10]
fig,ax=plt.subplots(figsize=(10,5))
sns.countplot(data=movies,y='listed_in',order=mov_genre,palette='Dark2',ax=ax)
ax.set_ylabel('Genre')

*People like* ***movies which are international,which have drama followed by stand-up comedies.***

## Rating Analysis of Movies

In [None]:
# Ten most frequent ratings among movies
movie_rating_counts=movies.groupby('rating')['type'].count()
mov_rating=movie_rating_counts.sort_values(ascending=False).index[:10]
fig,ax=plt.subplots(figsize=(10,5))
sns.countplot(data=movies,y='rating',order=mov_rating,palette='Paired',ax=ax)
ax.set_ylabel('Rating')

*Most of the movies have* ***TV-MA rating which means Mature Audience Only.This program is specifically designed to be viewed by adults and therefore may be unsuitable for children under 17.*** 

*Followed by* **TV-14 rating i.e. Parents Strongly Cautioned and only suitable for age 14 and above.**

*Least Rating is of* ***NR i.e.Not Rated Films.***

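*TV-MA and TV-14 are ratings from the TV Parental Guidelines; film ratings such as G, PG and R are assigned by the Motion Picture Association (MPA, formerly MPAA).*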

## Popular Movie Directors

In [None]:
#Top 20 popular movie directors
# Leave out the 'Unknown' placeholder, split joint credits on ', ' and count each name.
md=[name for name in movies['director'] if name!='Unknown']
mov_director_frequency = Counter(', '.join(md).split(', '))
# The label names movie directors, matching the movie-actor and TV-show cells
print("Top 20 movie directors globally in Netflix:")
mov_director_frequency.most_common(20)

***Raul Campos*** *and* ***Jan Suter*** *have directed the most movies.*

## Popular Movie Actors

In [None]:
# Top 20 movie actors: skip entries equal to 'Unknown', split each cast entry on ', '
# and count how often each name occurs
movie_casts=[entry for entry in movies['cast'] if entry!='Unknown']
movie_actor_names=', '.join(movie_casts).split(', ')
mov_actor_frequency=Counter(movie_actor_names)
print("Top 20 movie actors globally in Netflix:")
mov_actor_frequency.most_common(20)

***Anupam Kher*** *and* ***Shah Rukh Khan*** *have starred in the most movies among the popular actors.*

## TV SHOWS ANALYSIS

## How long after release are TV shows added to Netflix?

In [None]:
# Separate TV-show dataset. .copy() makes it an independent frame: later cells assign
# columns to `Tv_shows`, which on a plain filtered slice raises SettingWithCopyWarning.
Tv_shows=show[show['type']=='TV Show'].copy()

In [None]:
# Absolute gap in years between a TV show's release and its addition, then the ten most
# common gaps in percent.
# assign() builds a new frame instead of writing a column onto a filtered slice of `show`,
# which avoids SettingWithCopyWarning.
Tv_shows=Tv_shows.assign(show_diff=(Tv_shows['Added_Year']-Tv_shows['release_year']).abs())
tv_show_diff_percent=Tv_shows['show_diff'].value_counts(normalize=True).sort_values(ascending=False).iloc[:10]
tv_show_diff_percent*100

***49.7% of TV shows are either released on Netflix or they were added months after their release in the same year.***
Maybe some amount of them can be netflix originals.

***Followed by 17% of shows or movies are added on netflix after 1 year of their release.***

## How many seasons do TV shows usually run for?

In [None]:
# Convert duration to an integer number of seasons by keeping the text before ' Season'.
# One split covers both the ' Season' and ' Seasons' suffixes, since the latter contains the former.
# assign() avoids writing a column onto a filtered slice of `show`.
Tv_shows=Tv_shows.assign(duration=Tv_shows['duration'].str.split(' Season').str[0].astype(int))

In [None]:
# Histogram of the duration column of TV shows in 5 bins
fig,ax=plt.subplots(figsize=(10,5))
sns.histplot(data=Tv_shows,x='duration',bins=5,ax=ax)
ax.set_xlabel('Seasons')

*Most of the TV shows on netflix extend upto* ***2 to 4 seasons.*** 

## TV shows having the highest number of seasons

In [None]:
# Ten TV shows with the largest duration value
fig,ax=plt.subplots(figsize=(10,5))
longest_running=Tv_shows.sort_values('duration',ascending=False).head(10)
sns.barplot(data=longest_running,x='duration',y='title',ax=ax)
ax.set_xlabel('Seasons')

***NCIS, Grey's Anatomy and Supernatural*** *are amongst the TV shows that have highest number of seasons.*

## In which year maximum TV shows were released?

In [None]:
# Fifteen release years with the most TV shows
fig,ax=plt.subplots(figsize=(10,10))
green_shades=sns.light_palette(color='green',n_colors=16,reverse=True)
busiest_years=Tv_shows['release_year'].value_counts().index[:15]
sns.countplot(data=Tv_shows,y='release_year',order=busiest_years,palette=green_shades,ax=ax)
ax.set_ylabel("Year of Release")

*In* ***2020*** *maximum TV shows were released.*

## In which year maximum TV shows were added on netflix?

In [None]:
# Fifteen years in which the most TV shows were added
fig,ax=plt.subplots(figsize=(10,10))
violet_shades=sns.light_palette(color='violet',n_colors=16,reverse=True)
busiest_years=Tv_shows['Added_Year'].value_counts().index[:15]
sns.countplot(data=Tv_shows,y='Added_Year',order=busiest_years,palette=violet_shades,ax=ax)
ax.set_ylabel('Year of Addition on Netflix')

*In* ***2020*** *maximum TV shows were added on netflix.*

## Top 10 TV shows producing countries on netflix

In [None]:
# Ten most frequent country values among TV shows
tv_country_counts=Tv_shows.groupby('country')['type'].count()
tv_max=tv_country_counts.sort_values(ascending=False).index[:10]
fig,ax=plt.subplots(figsize=(10,5))
sns.countplot(data=Tv_shows,y='country',order=tv_max,palette='Set1',ax=ax)

***United States and United Kingdom*** *produces majority of TV shows on netflix.*

## Popular Genres in TV shows

In [None]:
# Ten most frequent listed_in values among TV shows
tv_genre_counts=Tv_shows.groupby('listed_in')['type'].count()
tv_genre=tv_genre_counts.sort_values(ascending=False).index[:10]
fig,ax=plt.subplots(figsize=(10,5))
sns.countplot(data=Tv_shows,y='listed_in',order=tv_genre,palette='Accent',ax=ax)
ax.set_ylabel('Genre')

*It is seen that* ***Kids' TV is the most popular genre, followed by Crime TV, International TV shows and TV Dramas***.

## Rating analysis of TV shows

In [None]:
# Ten most frequent ratings among TV shows
tv_rating_counts=Tv_shows.groupby('rating')['type'].count()
tv_rating=tv_rating_counts.sort_values(ascending=False).index[:10]
fig,ax=plt.subplots(figsize=(10,5))
sns.countplot(data=Tv_shows,y='rating',order=tv_rating,palette='Pastel1',ax=ax)
ax.set_ylabel('Rating')

*Most of the TV-shows have* ***TV-MA rating which means Mature Audience Only.This program is specifically designed to be viewed by adults and therefore may be unsuitable for children under 17.*** 

*Followed by* **TV-14 rating i.e. Parents Strongly Cautioned and only suitable for age 14 and above.**

*Least Rating is of* ***NR i.e.Not Rated Films.***

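*TV-MA and TV-14 are ratings from the TV Parental Guidelines; film ratings such as G, PG and R are assigned by the Motion Picture Association (MPA, formerly MPAA).*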

## Popular Directors of TV shows

In [None]:
# Top 20 TV-show directors: leave out the 'Unknown' placeholder, split joint credits
# on ', ' and count how often each name occurs
known_tv_directors=Tv_shows.loc[Tv_shows['director']!='Unknown','director']
tv_director_names=', '.join(known_tv_directors).split(', ')
tv_director_frequency=Counter(tv_director_names)
print("Top 20 TV show directors globally in Netflix:")
tv_director_frequency.most_common(20)

***Alastair Fothergill*** *has directed more TV shows.*

## Popular TV shows actors

In [None]:
# Top 20 TV-show actors: skip entries equal to 'Unknown', split each cast entry on ', '
# and count how often each name occurs
tv_casts=[entry for entry in Tv_shows['cast'] if entry!='Unknown']
tv_actor_names=', '.join(tv_casts).split(', ')
tv_actor_frequency=Counter(tv_actor_names)
print("Top 20 TV show actors globally in Netflix:")
tv_actor_frequency.most_common(20)

***Takahiro Sakurai*** *has acted in most TV shows amongst other popular actors.*

## Contribution of Countries Over Time

In [None]:
# Work on an independent deep copy (the default for DataFrame.copy) so `show` stays untouched
show_copy=show.copy()

In [None]:
#creating an added date column
# Month name -> month number (kept as text, as before)
month_names=['January','February','March','April','May','June','July','August','September',
             'October','November','December']
month={name:str(number) for number,name in enumerate(month_names,start=1)}
# Read the month from show_copy itself rather than from the separate `show` frame
show_copy['Month']=show_copy['Added_Month'].map(month)
show_copy=show_copy.rename(columns={'Added_Year':'Year','Added_Day':'Day'})
# to_datetime assembles dates from the Year/Month/Day columns of a frame;
# a strptime format string has no meaning for that input, so none is passed.
show_copy['Added_Date'] = pd.to_datetime(show_copy[['Year', 'Month', 'Day']])

In [None]:
# Remove the date components and the release gap now that Added_Date exists
unneeded_columns=['Year','Month','Day','show_diff']
show_copy=show_copy.drop(columns=unneeded_columns)

In [None]:
# Count individual country names (entries are split on ', ') and, for each of the 20 most
# frequent names, collect the rows whose country value is exactly that name.
country_entries=[entry for entry in show_copy['country'] if entry!='Unknown']
country_frequency=Counter(', '.join(country_entries).split(', '))
countries=[show_copy[show_copy['country']==name]
           for name,_count in country_frequency.most_common(20)]

In [None]:
# Stack the per-country frames, drop incomplete rows, discard the old index
# and order the rows by the date they were added
merged=(
    pd.concat(countries)
    .dropna()
    .reset_index(drop=True)
    .sort_values('Added_Date')
)

In [None]:
# Number of rows with an Added_Date per country, largest first
added_per_country=merged.groupby(['country'])['Added_Date'].count()
show_t=added_per_country.sort_values(ascending=False)

In [None]:
#discovering trend using dropdown
buttons=[]

default_country='United States'

# Fall back to the first country when the preferred default is not among the plotted ones,
# so that one trace is always visible at start.
country_order=list(show_t.index)
if country_order and default_country not in country_order:
    default_country=country_order[0]

fig=go.Figure()

for country in country_order:
    country_rows=merged[merged.country==country]
    # y is given explicitly as a running 1-based row number. Without it the trace
    # carries no y data for the "%{y}" placeholder in the hover text to refer to.
    fig.add_trace(go.Scatter(x=country_rows['Added_Date'],
                             y=list(range(1,len(country_rows)+1)),
                             name=country,visible=(country==default_country),mode='lines+markers',
                             hovertemplate="Date: %{x}<br>Number of Shows Added: %{y}"))
    # Choosing a country shows its own trace and hides all the others
    buttons.append(dict(method='restyle',
                        label=country,
                        args = [{'visible': [country == other for other in country_order]}]))

# Add dropdown menus to the figure; `active` makes the initially selected entry
# match the trace that is visible at start.
fig.update_layout(title_text='Trend of Producing Content By Each Country',showlegend=False,
                  updatemenus=[dict(buttons=buttons,direction='down',
                                    active=country_order.index(default_country) if country_order else 0,
                                    showactive=True)])

*This plot shows how each country's content on Netflix has grown over time. Use the drop-down menu to switch between countries.*

In [None]:
buttons2=[]

default_type='Movie'

# Content types present in the data, with the default first so that the dropdown's
# initially selected entry (index 0) matches the traces shown at start.
content_types=list(merged.type.unique())
if content_types and default_type not in content_types:
    default_type=content_types[0]
content_types=sorted(content_types,key=lambda content_type: content_type!=default_type)

fig=go.Figure()

# One trace per (country, content type) pair. trace_types records the type of every
# trace in the order added, so each button can build a visibility mask with exactly
# one entry per trace.
trace_types=[]
for country in show_t.index:
    for content_type in content_types:
        rows=merged[(merged.country==country)&(merged.type==content_type)]
        # y is a running 1-based row number, supplying the "%{y}" value used in the hover text
        fig.add_trace(go.Scatter(x=rows['Added_Date'],
                                 y=list(range(1,len(rows)+1)),
                                 name=f"{country} ({content_type})",
                                 visible=(content_type==default_type),mode='lines+markers',
                                 hovertemplate="Date: %{x}<br>Number of Shows Added: %{y}"))
        trace_types.append(content_type)

for content_type in content_types:
    buttons2.append(dict(method='restyle',
                         label=content_type,
                         args = [{'visible': [content_type == trace_type for trace_type in trace_types]}]))

# Add dropdown menus to the figure
fig.update_layout(title_text='Trend of Producing Content By Countries in Each Category',showlegend=False,
                  updatemenus=[dict(buttons=buttons2,direction='down',
                                    showactive=True)])

## Choropleth Representation of different regions of world

In [None]:
# Rank (count, name) pairs from largest to smallest, then unzip them into two
# parallel tuples that become the columns of the frequency table
ranked_pairs=sorted(zip(country_frequency.values(),country_frequency.keys()),reverse=True)
country_frequencies,country_names=zip(*ranked_pairs)
country_frequency_df=pd.DataFrame({"country": country_names,
                                   "frequency": country_frequencies})

In [None]:
# Preview the first rows of the country frequency table.
country_frequency_df.head()

*The dataset has no ISO3 or ISO2 country codes, which are needed to plot countries on a choropleth. Instead of bringing them in from another dataset, we can use a library called 'country_converter' to look up the relevant details for any country.*

*More details at-https://pypi.org/project/country-converter/*

In [None]:
pip install country_converter

In [None]:
# Country names as a plain Python list, in table order
s=country_frequency_df['country'].tolist()

In [None]:
#generating iso3 names of the countries
# The import sits here, after the install cell, because the package is installed by this notebook.
import country_converter as cc
# One ISO3 result per entry of `s`, in the same order.
# NOTE(review): how names that cannot be matched are reported is not visible here; check the country_converter docs.
iso_alpha=cc.convert(names=s,to='ISO3')

In [None]:
# Attach the ISO3 codes as a new column next to the country names
country_frequency_df=country_frequency_df.assign(iso_codes=iso_alpha)

In [None]:
# World map coloured by how often each country appears in the frequency table
fig_1=px.choropleth(
    country_frequency_df,
    locations='iso_codes',
    color='frequency',
    hover_name='country',
    scope='world',
    title='Netflix Content Frequency Distribution around the world',
    labels={'iso_codes':'ISO_CODE','frequency':'Content Produced'},
    color_continuous_scale=px.colors.sequential.Purpor,
)
fig_1.show()

*As seen in the map,* ***USA(2913), India(955), UK(632), Canada(382) and France(319)*** *have produced the most content on Netflix*.

In [None]:
# Same map restricted to the 'europe' scope
fig_2=px.choropleth(
    country_frequency_df,
    locations='iso_codes',
    color='frequency',
    hover_name='country',
    scope='europe',
    title='Netflix Content Frequency Distribution around Europe',
    labels={'iso_codes':'ISO_CODE','frequency':'Content Produced'},
    color_continuous_scale=px.colors.sequential.PuRd,
)
fig_2.show()

*In **Europe Region,United Kingdom(632)** has produced maximum content.*

In [None]:
# Same map restricted to the 'asia' scope
fig_3=px.choropleth(
    country_frequency_df,
    locations='iso_codes',
    color='frequency',
    hover_name='country',
    scope='asia',
    title='Netflix Content Frequency Distribution around Asia',
    labels={'iso_codes':'ISO_CODE','frequency':'Content Produced'},
    color_continuous_scale=px.colors.sequential.PuRd,
)
fig_3.show()

*In **Asia Region,India(955**) has produced maximum content.*

In [None]:
# Same map restricted to the 'north america' scope
fig_4=px.choropleth(
    country_frequency_df,
    locations='iso_codes',
    color='frequency',
    hover_name='country',
    scope='north america',
    title='Netflix Content Frequency Distribution around North America',
    labels={'iso_codes':'ISO_CODE','frequency':'Content Produced'},
    color_continuous_scale=px.colors.sequential.PuRd,
)
fig_4.show()

*In North **America Region,USA(2913)** produces maximum content.*

In [None]:
# Same map restricted to the 'south america' scope
fig_5=px.choropleth(
    country_frequency_df,
    locations='iso_codes',
    color='frequency',
    hover_name='country',
    scope='south america',
    title='Netflix Content Frequency Distribution around South America',
    labels={'iso_codes':'ISO_CODE','frequency':'Content Produced'},
    color_continuous_scale=px.colors.sequential.PuRd,
)
fig_5.show()

*In **South America Region,Brazil(77) and Argentina(74)** has produced most of the content.*

In [None]:
# Same map restricted to the 'africa' scope
fig_6=px.choropleth(
    country_frequency_df,
    locations='iso_codes',
    color='frequency',
    hover_name='country',
    scope='africa',
    title='Netflix Content Frequency Distribution around Africa',
    labels={'iso_codes':'ISO_CODE','frequency':'Content Produced'},
    color_continuous_scale=px.colors.sequential.PuRd,
)
fig_6.show()

*In **Africa Region,Egypt(109)** has produced maximum content.*


**UPVOTE AND SHARE IF YOU LIKE IT**

**DO PROVIDE YOUR VALUABLE FEEDBACK!!!**

**THANK YOU.**