# Exploratory Data Analysis of Netflix Titles

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation warnings — confirm that is intended.
warnings.filterwarnings("ignore")

# Use seaborn's "darkgrid" theme for the plots that follow.
sns.set_theme(style="darkgrid")

### Importing Dataset

In [None]:
# Read the Netflix titles CSV from its relative input path and preview the first rows.
DATASET_PATH = '../input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)
df.head()

### Data Overview

In [None]:
# Print the frame summary (columns, non-null counts, dtypes) before any cleaning.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

### Missing values are found in:
- "director" : I will be dropping this column since it's not of much practical use for this analysis.
- "cast" : Similarly, I will be dropping this column.
- "country" : Since this is an important variable for the analysis, I am going to take the mode of the available countries and assign it to the missing values.
- "date_added" : Same treatment as the country column.
- "rating" : There are only 7 missing cases, which can easily be found on the net, so I will replace them manually.

### Data Cleaning and Manipulation

In [None]:
# Drop the two columns that this analysis does not use (in place).
unused_columns = ['cast', 'director']
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Distinct values present in the 'rating' column (including NaN, if any).
pd.unique(df['rating'])

In [None]:
# Show the rows whose 'rating' is missing so they can be filled in by hand.
missing_rating = df['rating'].isnull()
df.loc[missing_rating]

In [None]:
# Manually looked-up ratings for the rows whose 'rating' is missing,
# keyed by the row's position in the frame.
rating_replacements = {
    67: 'TV-PG',
    2359: 'TV-14',
    3660: 'TV-MA',
    3736: 'TV-MA',
    3737: 'NR',
    3738: 'TV-MA',
    4323: 'TV-MA',  # was 'TV-MA ' (trailing space), which created a bogus extra category
}

# Look the column position up by name instead of hard-coding 6, so the
# assignment still hits 'rating' if columns are added, dropped or reordered.
rating_col = df.columns.get_loc('rating')
for row_pos, rating in rating_replacements.items():
    df.iloc[row_pos, rating_col] = rating

# Should now be 0.
df['rating'].isna().sum()

In [None]:
# Fill the gaps in 'country' and 'date_added' with each column's most frequent value.
for column in ('country', 'date_added'):
    most_common = df[column].mode()[0]
    df[column] = df[column].fillna(most_common)

# Re-check the per-column missing counts after filling.
df.isna().sum()

In [None]:
# Count fully duplicated rows (every occurrence after the first is flagged).
df.duplicated(keep='first').sum()

In [None]:
# Keep only the text before the first comma of 'country' as the main country.
df['country_main'] = df['country'].str.split(",").str[0]
df['country_main'].head()

There are many entries in the dataset in which the 'country' column contains more than one value. We added a new column, 'country_main', which keeps only the first listed country to make our analysis easier.

In [None]:
# The last space-separated token of 'date_added' is stored as the year (kept as text).
df['year_added'] = df['date_added'].str.split(" ").str[-1]
df['year_added'].head()

In [None]:
# Strip leading whitespace from the dates first, so the first
# space-separated token (stored as 'month_added') is never empty.
df['date_added'] = df['date_added'].str.lstrip()
df['month_added'] = df['date_added'].str.split(" ").str[0]
df['month_added'].head()

In [None]:
# Rating codes grouped by the audience they are mapped to.
audience_ratings = {
    'Kids': ['TV-Y', 'TV-G', 'G'],
    'Older Kids': ['TV-PG', 'TV-Y7-FV', 'TV-Y7', 'PG'],
    'Teens': ['TV-14', 'PG-13'],
    'Adults': ['TV-MA', 'TV-MA ', 'R', 'NR', 'UR', 'NC-17'],
}

# Flatten the grouping into a rating -> audience lookup.
ratings_ages = {
    rating: audience
    for audience, ratings in audience_ratings.items()
    for rating in ratings
}

# replace() leaves any rating that is not in the lookup unchanged.
df['target_age'] = df['rating'].replace(ratings_ages)

In [None]:
# Frame summary again, now including the derived columns added above.
df.info()

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

### Data Visualization

In [None]:
# Share of each content type in the catalogue.
# The wedge labels come from the value_counts index itself: the column that
# reset_index() produces is only called 'index' on pandas < 2.0, so the old
# dt['index'] lookup raises KeyError on newer versions.
type_counts = df['type'].value_counts()

plt.figure(figsize=(8, 8))
plt.pie(
    type_counts,
    labels=type_counts.index,
    autopct='%.2f%%',
    colors=['lightblue', 'lightgreen'],
)
plt.title('Distribution of Netflix content', fontsize=16)
plt.show()


Nearly 31% of content on Netflix consists of TV shows

In [None]:
# Title counts for the 20 most frequent main countries, as a two-column frame.
top_countries = df['country_main'].value_counts().head(20)
data_1 = pd.DataFrame({
    'Country': top_countries.index,
    'Count': top_countries.values,
})

# Horizontal bar chart, largest count first.
fig, ax = plt.subplots(figsize=(11, 11))
sns.barplot(data=data_1, x='Count', y='Country', ax=ax)
ax.set_title(
    'Top 20 countries with the most Netflix content',
    fontdict={'fontweight': 'bold', 'fontsize': 18},
)
plt.show()

The United States leads the pack by a huge margin, with more than 3000 titles to its name. It is followed by India in second place, which is quite a feat since Netflix only came to India in January 2016. In only 5 years India has risen through the ranks to become the second-largest content producer.

In [None]:
# Per-country title counts with explicit column names. The names that
# value_counts().reset_index() generates differ between pandas < 2.0
# ('index', 'country_main') and >= 2.0 ('country_main', 'count'), so the
# previous column lookups only worked on the older versions.
country_df = (
    df['country_main']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Count')
)

# Keep only countries that account for more than 1% of all titles.
country_df = country_df[country_df['Count'] / country_df['Count'].sum() > 0.01]

fig = px.pie(country_df, values='Count', names='Country')
# Left as the last expression so the notebook displays the figure.
fig.update_traces(textposition='inside', textinfo='percent+label')

The pie chart above clearly shows the dominance of USA in the amount of Netflix Titles produced. The numbers for USA are a little boosted since we used mode to fill the missing values but nevertheless United States is still ahead by a huge margin.

In [None]:
# One entry carries a trailing space ("TV-MA "); normalise it in place across the frame.
df.replace(to_replace="TV-MA ", value="TV-MA", inplace=True)

In [None]:
# Sorted distinct ratings, to confirm the whitespace variant is gone.
np.unique(df['rating'])

In [None]:
# Split the catalogue by content type.
# .copy() makes each subset an independent frame, so adding columns to it
# later is a plain assignment rather than a write into a slice of df.
movies_df = df[df['type'] == 'Movie'].copy()
tv_df = df[df['type'] == 'TV Show'].copy()

Creating two different dataframes on the basis of type of content i.e. Movies and TV Shows.

In [None]:
# Number of movies per rating, most common rating first.
movies_rating = (
    movies_df.groupby('rating')['show_id']
    .count()
    .reset_index()
    .sort_values(by='show_id', ascending=False)
    .rename(columns={'show_id': 'count'})
)
movies_rating

In [None]:
# Point plot of the movie count per rating, in the order of movies_rating.
fig, ax = plt.subplots(figsize=(15, 7))
sns.pointplot(data=movies_rating, x='rating', y='count', ax=ax)
ax.set_title('Movie ratings distribution', size=16)
plt.show()

In [None]:
# Movie counts per target audience, as a two-column frame for plotting.
audience_counts = movies_df['target_age'].value_counts()
age_df = pd.DataFrame({
    'Target Audience': audience_counts.index,
    'Count': audience_counts.values,
})

fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(data=age_df, x='Target Audience', y='Count', ax=ax)
ax.set_title('Target Audience for Movies', size=16)
plt.show()

In [None]:
# Share of movies per target audience.
# Labels are read from the value_counts index so every wedge is named after
# the category it actually counts. The previous hard-coded list assumed a
# fixed ordering, said 'Teen' where the category is 'Teens', and made
# plt.pie fail whenever the number of categories was not exactly four.
audience_colors = {
    'Adults': '#4287f5',
    'Teens': '#f08330',
    'Older Kids': '#67b013',
    'Kids': '#d93434',
}
movie_audience = movies_df['target_age'].value_counts()
labels = list(movie_audience.index)

plt.figure(figsize=(8, 8))
plt.pie(
    movie_audience,
    labels=labels,
    autopct='%.2f%%',
    # Neutral grey for any value that has no audience colour assigned.
    colors=[audience_colors.get(label, '#9e9e9e') for label in labels],
)
plt.title('Target Audience for Movies', size=16)
plt.show()

Almost 50% of the movies target adult audiences

In [None]:
# Number of TV shows per rating, most common rating first.
tv_rating = (
    tv_df.groupby('rating')['show_id']
    .count()
    .reset_index()
    .sort_values(by='show_id', ascending=False)
    .rename(columns={'show_id': 'count'})
)
tv_rating

In [None]:
# Point plot of the TV show count per rating, in the order of tv_rating.
fig, ax = plt.subplots(figsize=(15, 7))
sns.pointplot(data=tv_rating, x='rating', y='count', ax=ax)
ax.set_title('TV Shows rating distribution', size=16)
plt.show()

In [None]:
# TV show counts per target audience, as a two-column frame for plotting.
audience_counts = tv_df['target_age'].value_counts()
age_df = pd.DataFrame({
    'Target Audience': audience_counts.index,
    'Count': audience_counts.values,
})

fig, ax = plt.subplots(figsize=(9, 9))
sns.barplot(data=age_df, x='Target Audience', y='Count', ax=ax)
ax.set_title('Target Audience for TV Shows', size=16)
plt.show()

In [None]:
# Share of TV shows per target audience.
# Labels are read from the value_counts index so every wedge is named after
# the category it actually counts. The previous hard-coded list assumed the
# same ordering as for movies, said 'Teen' where the category is 'Teens',
# and made plt.pie fail whenever there were not exactly four categories.
audience_colors = {
    'Adults': '#4287f5',
    'Teens': '#f08330',
    'Older Kids': '#67b013',
    'Kids': '#d93434',
}
tv_audience = tv_df['target_age'].value_counts()
labels = list(tv_audience.index)

plt.figure(figsize=(8, 8))
plt.pie(
    tv_audience,
    labels=labels,
    autopct='%.2f%%',
    # Neutral grey for any value that has no audience colour assigned.
    colors=[audience_colors.get(label, '#9e9e9e') for label in labels],
)
plt.title('Target Audience for TV Shows', size=16)
plt.show()

We can observe that in case of TV shows Kids and Older Kids have a higher percentage when compared to movies. 

In [None]:
# Titles added per year (year_added is text, so the x axis is categorical).
df_content = (
    df.groupby('year_added')['show_id']
    .count()
    .reset_index(name='Count')
)

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(df_content['year_added'], df_content['Count'], 'b.-', linewidth=2)
ax.set_title('Growth of content added on Netflix over the years 2008 to 2021', size=16)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Movies and TV shows added')
plt.show()

Netflix has seen consistent growth in the number of titles added per year since 2014, and the number skyrocketed after 2015. More than 2000 titles were added in each of 2019 and 2020. A slight dip is observed from 2019 to 2020 because of Covid-19 restrictions: during the pandemic there was definitely an increase in the number of paid subscribers, but the entertainment industry took a hit due to shooting restrictions. Also, this dataset contains titles only up to January 2021, which explains the drop for the year 2021.

In [None]:
# Titles added per year, separately for movies and TV shows.
movies_content = (
    movies_df.groupby('year_added')['show_id']
    .count()
    .reset_index(name='Count')
)
tv_content = (
    tv_df.groupby('year_added')['show_id']
    .count()
    .reset_index(name='Count')
)

fig, ax = plt.subplots(figsize=(12, 8))

# The overall series is drawn first, followed by TV shows and movies;
# the legend lists them in this order.
yearly_series = [
    (df_content, 'Overall', 'yellow'),
    (tv_content, 'TV Shows', 'red'),
    (movies_content, 'Movies', 'blue'),
]
for yearly, series_label, line_color in yearly_series:
    ax.plot(
        yearly['year_added'],
        yearly['Count'],
        marker='.',
        linewidth=2,
        label=series_label,
        color=line_color,
    )

ax.set_title('Growth of content added on Netflix over the years 2008 to 2021', size=16)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Movies and TV shows added')

ax.legend()
plt.show()



In [None]:
# Titles added per month, with explicit column names. The names generated
# by value_counts().reset_index() changed in pandas 2.0, so the previous
# 'index' / 'month_added' lookups only worked on older versions.
months = (
    df['month_added']
    .value_counts()
    .rename_axis('Month')
    .reset_index(name='Count')
)

fig = px.pie(months, values='Count', names='Month')
# Left as the last expression so the notebook displays the figure.
fig.update_traces(textposition='inside', textinfo='percent+label')


Most content is added in the month of December. Since it is holiday season in December, it makes sense that December leads in terms of content added.

In [None]:
# Running time = the leading token of 'duration' (presumably text such as
# '90 min' — confirm against the data), converted to a number so the axis is
# numeric. assign() builds a new frame instead of writing into a slice of df,
# and values that cannot be parsed become NaN rather than raising.
movies_df = movies_df.assign(
    minutes=pd.to_numeric(
        movies_df['duration'].str.split(' ').str[0],
        errors='coerce',
    )
)

plt.figure(figsize=(10, 8))
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(movies_df['minutes'].dropna(), kde=True, stat='density', color='red')
plt.title('Distribution Of Running time of movies on Netflix', size=16)
plt.xlabel('Duration')
plt.show()

It is evident that majority of the movies have duration ranging from 85 min to 110 min.



In [None]:
# Number of TV shows per 'duration' value, most common first.
data = (
    tv_df.groupby('duration')['show_id']
    .count()
    .reset_index(name='Count')
    .sort_values(by='Count', ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=data, x='duration', y='Count', ax=ax)
ax.set_title('Distribution of number of seasons of TV Shows on Netflix', size=16)
# Rotate the tick labels so they do not overlap.
plt.xticks(rotation=60)
plt.show()


The majority of shows run for at most 6-7 seasons, with only a tiny number going beyond 8 seasons.

In [None]:
# TV show counts per 'duration' value, with explicit column names. On
# pandas >= 2.0, value_counts().reset_index() names the columns
# ('duration', 'count'), so the old data['duration'] held the labels rather
# than the counts and the share computation failed.
data = (
    tv_df['duration']
    .value_counts()
    .rename_axis('Seasons')
    .reset_index(name='Count')
)

# Keep only durations that account for more than 0.5% of all TV shows.
data = data[data['Count'] / data['Count'].sum() > 0.005]

fig = px.pie(data, values='Count', names='Seasons')
# Left as the last expression so the notebook displays the figure.
fig.update_traces(textposition='inside', textinfo='percent+label')


This pie chart further emphasizes the fact that almost all TV shows run for fewer than 8 seasons.

### A deeper look into India's performance over the previous years

Netflix India's performance looks quite impressive since its introduction in January 2016. Let's take a deeper dive into this.

In [None]:
# Titles whose 'country' field is exactly 'India' (multi-country entries are not matched).
is_india = df['country'].eq('India')
india_df = df.loc[is_india]
india_df.head()

In [None]:
# Share of each content type among the India titles.
# The wedge labels come from the value_counts index itself: the column that
# reset_index() produces is only called 'index' on pandas < 2.0, so the old
# data['index'] lookup raises KeyError on newer versions.
india_type_counts = india_df['type'].value_counts()

plt.figure(figsize=(8, 8))
plt.pie(
    india_type_counts,
    labels=india_type_counts.index,
    autopct='%.2f%%',
    colors=['lightblue', 'lightgreen'],
)
plt.title('Distribution of Netflix content produced in India', fontsize=16)
plt.show()

92.31% of the titles produced are movies which is much higher than the overall percentage which was around 69%.

In [None]:
# Share of India titles per target audience.
# Labels are read from the value_counts index so every wedge is named after
# the category it actually counts. The previous hard-coded list assumed a
# fixed ordering, said 'Teen' where the category is 'Teens', and made
# plt.pie fail whenever there were not exactly four categories.
audience_colors = {
    'Adults': '#4287f5',
    'Teens': '#f08330',
    'Older Kids': '#67b013',
    'Kids': '#d93434',
}
india_audience = india_df['target_age'].value_counts()
labels = list(india_audience.index)

plt.figure(figsize=(8, 8))
plt.pie(
    india_audience,
    labels=labels,
    autopct='%.2f%%',
    # Neutral grey for any value that has no audience colour assigned.
    colors=[audience_colors.get(label, '#9e9e9e') for label in labels],
)
plt.title('Target Audience for Netflix content produced in India', size=16)
plt.show()

Almost 57% of the content produced in India is targeted towards Adults.

In [None]:
# India titles added per year (year_added is text, so the x axis is categorical).
india_content = (
    india_df.groupby('year_added')['show_id']
    .count()
    .reset_index()
    .rename(columns={'year_added': 'Year', 'show_id': 'Count'})
)

fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(india_content['Year'], india_content['Count'], marker='.', color='blue')
ax.set_title('Number of Netflix Titles released every year since its introduction in 2016', size=16)
ax.set_xlabel('Year')
ax.set_ylabel('Count')
plt.show()

This steep rise from 2017 to 2018 is quite impressive: the number of titles more than doubled. We saw a dip from 2019 to 2020 because of Covid.

In [None]:
# Movies among the India titles. .copy() makes this an independent frame so
# the new column below is not written into a slice of india_df.
india_movies_df = india_df[india_df['type'] == 'Movie'].copy()

# Running time = the leading token of 'duration' (presumably text such as
# '90 min' — confirm against the data), converted to a number so the axis is
# numeric; values that cannot be parsed become NaN rather than raising.
india_movies_df['minutes'] = pd.to_numeric(
    india_movies_df['duration'].str.split(' ').str[0],
    errors='coerce',
)

plt.figure(figsize=(12, 8))
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(india_movies_df['minutes'].dropna(), kde=True, stat='density', color='red')
plt.title('Distribution Of Running time of movies on Netflix(India)', size=16)
plt.show()


The average running time is 110-140 minutes. We can observe that the running time of Indian movies is much higher than the overall average running time which is around 85-120 min. As we all know Bollywood movies are generally much longer than Hollywood movies. 

In [None]:
# India titles added per month, with explicit column names. The names
# generated by value_counts().reset_index() changed in pandas 2.0, so the
# previous 'index' / 'month_added' lookups only worked on older versions.
months = (
    india_df['month_added']
    .value_counts()
    .rename_axis('Month')
    .reset_index(name='Count')
)

fig = px.pie(months, values='Count', names='Month')
# Left as the last expression so the notebook displays the figure.
fig.update_traces(textposition='inside', textinfo='percent+label')


Here also we can observe a similar trend as above. December is the most preferred month for adding new content.