# Netflix Movies and TV Shows - Exploratory Data Analysis

![](https://help.nflxext.com/43e0db2f-fea0-4308-bfb9-09f2a88f6ee4_what_is_netflix_1_en.png)

#### **Netflix** is a subscription-based streaming service that allows its members to watch TV shows and movies without commercials on an internet-connected device.

## Let's import some libraries

In [None]:
#Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: show matplotlib figures inline in the notebook output
%matplotlib inline
import plotly as py
import os
from wordcloud import WordCloud
from PIL import Image
import matplotlib
import matplotlib.colors
# Switch plotly to its offline notebook mode
py.offline.init_notebook_mode(connected = True)
import datetime as dt
# Default resolution (dots per inch) for every matplotlib figure created below
plt.rcParams['figure.dpi'] = 130

## **Loading and Reading the dataset**

In [None]:
# Load the Netflix titles CSV into a DataFrame and preview the first five rows
dataset_path = '../input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(dataset_path)
df.head()

## **Cleaning and Pre Processing**

In [None]:
# Print the DataFrame summary to see which columns have missing values
df.info()

In [None]:
# Report the percentage of missing values for every column that has any
missing_pct = df.isna().mean() * 100
for column_name, pct in missing_pct[missing_pct > 0].items():
    print(f"{column_name} null percentage: {round(pct, 2)}%")

Since the "director" column has the highest share of missing values, I will drop that column.

In [None]:
# Drop the "director" column from df in place, then re-check the remaining columns
df.drop(columns='director', inplace=True)
df.info()

* Assuming that all of these movies and TV shows are available on Netflix US, we can replace the missing `country` values with "United States".
* The missing `cast` values are replaced with the placeholder "Not Avail" instead of being dropped, as the cast column is useful.

In [None]:
# Fill the gaps by assigning the result back to the column.
# df[col].replace(..., inplace=True) is chained assignment: it operates on a
# temporary Series, triggers a FutureWarning on pandas 2.x, and leaves df
# unchanged once Copy-on-Write is active.
df['country'] = df['country'].fillna('United States')  # missing country -> assume US
df['cast'] = df['cast'].fillna('Not Avail')  # missing cast -> placeholder text
df.info()

In [None]:
# Drop every row that still contains a missing value (rating and date_added),
# then confirm that no nulls are left
df.dropna(axis=0, how='any', inplace=True)
df.isna().sum()

- Since the `date_added` column is of type object, we need to convert it to datetime.

In [None]:
print(df['date_added'])
# Strip surrounding whitespace before parsing. pandas >= 2 infers one date
# format from the first value and raises on values that do not match it, so a
# padded string such as " August 4, 2017" would abort the conversion.
# NOTE(review): assumes date_added holds only strings here (nulls were dropped
# in the previous step) - confirm against the dataset.
df["date_added"] = pd.to_datetime(df['date_added'].str.strip())

In [None]:
# Split the parsed timestamp into the calendar parts used by the later charts
added_on = df['date_added'].dt
df['month_added'] = added_on.month
df['month_name_added'] = added_on.month_name()
df['year_added'] = added_on.year
# Drop the original timestamp column now that its parts are stored
df.drop(columns='date_added', inplace=True)

* Lastly, we replace each rating code with a more meaningful target-audience label.

In [None]:
# Show the distinct rating codes present in the data
df['rating'].unique()

In [None]:
# Reference from this notebook: https://www.kaggle.com/andreshg/eda-beginner-to-expert-plotly
# Rating codes grouped by the audience label they are translated to
ratings_by_audience = {
    'Kids': ['TV-Y', 'TV-G', 'G'],
    'Older Kids': ['TV-PG', 'TV-Y7-FV', 'TV-Y7', 'PG'],
    'Teens': ['TV-14', 'PG-13'],
    'Adults': ['TV-MA', 'R', 'NR', 'UR', 'NC-17'],
}

# Flatten the groups into the rating -> audience lookup
target_audience = {
    rating_code: audience
    for audience, rating_codes in ratings_by_audience.items()
    for rating_code in rating_codes
}

# Codes that are not in the lookup are left untouched by replace()
df['target_audience'] = df['rating'].replace(target_audience)
df.drop(columns='rating', inplace=True)
df.head()

# **Analysis** and Visualisation

In [None]:
#Palette
# Store the brand colours in the variable itself: sns.palplot() only draws the
# swatches and returns None, so assigning its result left netflix_palette empty.
netflix_palette = ['#db0000', '#000000', '#564d4d', '#831010']  # https://www.color-hex.com/color-palette/22942
sns.palplot(netflix_palette)
plt.title('Netflix Brand Color Palette',loc="left",fontfamily="Arial",fontsize=16,y=1.1)
plt.show()

In [None]:
# Pie chart of how the catalogue splits by content type
type_counts = df['type'].value_counts()  # computed once, reused for sizes and labels

fig, ax = plt.subplots(figsize=(4, 4))
ax.pie(
    type_counts,
    labels=type_counts.index,
    explode=[0.05, 0],  # pull the first wedge slightly out of the pie
    autopct='%1.1f%%',
    colors=['#db0000', '#564d4d'],
    startangle=90,
    shadow=True,
)

# Heading and insight text placed in figure coordinates
fig.text(0.125, 0.95, 'Netflix Content Distribution', fontfamily='Arial', fontsize=16, fontweight="bold")
fig.text(0.125, 0.9, 'Almost 2/3 of the content is Movies.', fontfamily='Arial', fontsize=8)
plt.show()


In [None]:
# Shorten the long country names so they fit on the chart axes.
# The result is assigned back to the column. df['country'].replace(...,
# inplace=True) is chained assignment: it warns on pandas 2.x and leaves df
# unchanged once Copy-on-Write is active.
# Only values that equal a key exactly are renamed.
short_country_names = {
    'United States': 'USA',
    'United Kingdom': 'UK',
    'South Korea': 'S. Korea',
}
df['country'] = df['country'].replace(short_country_names)

#Getting the top 10 countries
country = df['country'].value_counts().sort_values(ascending=False)
topCountry = country.head(10)

In [None]:
# Bar chart of the countries with the most titles
highlight_color, base_color = '#db0000', '#564d4d'
color_map = [highlight_color] * 3 + [base_color] * 7  # first three bars highlighted

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(topCountry.index, topCountry, color=color_map, width=0.5, linewidth=0.6)

# Write each count 100 units above the top of its bar
for country_name, title_count in topCountry.items():
    ax.annotate(
        f"{title_count}",
        xy=(country_name, title_count + 100),
        ha='center', va='center', fontweight='light', fontfamily='Arial',
    )

# Hide the top and right borders of the plot
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Heading and insight text placed in figure coordinates
fig.text(0.125, 0.975, 'Top 10 Country on Netflix', fontfamily='Arial', fontsize=16, fontweight="bold")
fig.text(0.125, 0.935, 'USA stands out significantly since Netflix is a US-based company.', fontfamily='Arial', fontsize=8)
fig.text(0.575, 0.935, '| India and UK take the 2 and 3 positions respectively.', fontfamily='Arial', fontsize=8)
plt.show()

In [None]:
#Distribution Percentage
# Movie / TV Show counts per country, restricted to the top-10 countries
contentPercentage = df[['type', 'country']].groupby('country')['type'].value_counts().unstack().loc[topCountry.index]

# Look India up by label, not by position (values[1][0]). The insight text
# names India explicitly, so a positional lookup would print another country's
# numbers under the "India" label if the ranking changed.
# Raises KeyError if India is not among the top-10 countries.
india_counts = contentPercentage.loc['India']
india_total = india_counts.sum()
india_movie_pct = india_counts['Movie'] / india_total * 100
india_tv_pct = india_counts['TV Show'] / india_total * 100

#Plot
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x="country",hue="type",order=topCountry.index,data=df,palette=['#db0000','#564d4d'])

# Remove border from plot
for s in ['right','top']:
    ax.spines[s].set_visible(False)

# Insights
fig.text(0.125,0.975,"Top 10 Country's content distribution", fontfamily='Arial',fontsize=16,fontweight="bold")
fig.text(0.125,0.935,f"- India has very high percentage of movies .i.e {india_movie_pct:.3}% for Movies and only {india_tv_pct:.3}% for TV shows", fontfamily='Arial',fontsize=8)
fig.text(0.125,0.905,"- Suprisingly Japan and South Korea have more number of TV shows as compared to Movies.", fontfamily='Arial',fontsize=8)
plt.show()

In [None]:
#Distribution Percentage
# Movie / TV Show counts per target audience, converted to row-wise shares
rating = df['target_audience'].value_counts().index
rating_values = df[['type', 'target_audience']].groupby('target_audience')['type'].value_counts().unstack().loc[rating]
rating_values['sum'] = rating_values.sum(axis=1)
ratingPercentage = (
    rating_values.div(rating_values['sum'], axis=0)[['Movie', 'TV Show']]
    .sort_values(by='Movie', ascending=True)
)

#Plot
fig, ax = plt.subplots(figsize=(12,6),)

# Stacked horizontal bars: TV Show starts at 0, Movie starts where TV Show ends
ax.barh(ratingPercentage.index, ratingPercentage['Movie'],
        color='#db0000',label='Movie',left=ratingPercentage['TV Show'])
ax.barh(ratingPercentage.index, ratingPercentage['TV Show'],
        color='#564d4d', label='TV Show')

# Remove border from plot
for s in ['right','top','bottom','left']:
    ax.spines[s].set_visible(False)

#Setting the labels
ax.set_xticks([])
ax.set_yticklabels(ratingPercentage.index, fontfamily='Arial', fontsize=12)

#Annotation
# Each percentage goes at the midpoint of its own segment. The Movie segment
# spans [TV, TV + Movie], so its midpoint is TV + Movie/2. The earlier
# expression Movie + TV/3 was off-centre and could land inside the TV segment.
label_style = dict(va='center', ha='center', fontweight='bold', fontfamily='Arial', color="White")
for audience, shares in ratingPercentage.iterrows():
    tv_share = shares['TV Show']
    movie_share = shares['Movie']
    ax.annotate(f"{tv_share*100:.4}%", xy=(tv_share / 2, audience), **label_style)
    ax.annotate(f"{movie_share*100:.4}%", xy=(tv_share + movie_share / 2, audience), **label_style)

#Insights
fig.text(0.125,0.975,"Content distribution based on ratings", fontfamily='Arial',fontsize=24,fontweight="bold")
fig.text(0.125,0.915,"- Both Adults and Teens have almost identical distribution", fontfamily='Arial',fontsize=12)
fig.text(0.125,0.875,"- While kids have almost an equal distribution between Movies and TV Shows", fontfamily='Arial',fontsize=12)
fig.text(0.77,0.875,"TV Shows |",fontfamily='Arial',fontsize=8,color="#564d4d",fontweight="bold")
fig.text(0.83,0.875,"Movie",fontfamily='Arial',fontsize=8,color="#db0000",fontweight="bold")
plt.show()

In [None]:
#Distribution
# Titles added per year and type. After the cumulative sum across types, the
# 'TV Show' column holds TV shows only and the 'Movie' column holds
# TV shows + movies (the stacked total).
yearAdded = df.groupby('type')['year_added'].value_counts().unstack().fillna(0).loc[['TV Show','Movie']].cumsum(axis=0).T

#Plot
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
color = ["#db0000", "#564d4d"]

# Draw the stacked total (Movie, red) first and the TV Show area (grey) on top.
# The order is fixed explicitly because the hand-written legend below
# hard-codes which colour means which type; it no longer depends on which type
# happens to be more frequent. The label is now the type name: it used to be
# the plotted Series itself.
for i, contentType in enumerate(['Movie', 'TV Show']):
    contentSum = yearAdded[contentType]
    ax.fill_between(contentSum.index, 0, contentSum.values, color=color[i], label=contentType)

# Remove border from plot
for s in ['right','top','bottom','left']:
    ax.spines[s].set_visible(False)

#Setting the labels
ax.yaxis.tick_right()
ax.set_xlim(2008,2020)
plt.xticks(np.arange(2008, 2021, 1))
ax.tick_params(axis=u'both', which=u'both',length=0)

#Insights
ax.grid(False)
fig.text(0.13,0.2,"TV Shows |",fontfamily='Arial',fontsize=12,color="#564d4d",fontweight="bold")
fig.text(0.22,0.2,"Movie",fontfamily='Arial',fontsize=12,color="#db0000",fontweight="bold")
fig.text(0.125,0.8,"Content added over time", fontfamily='Arial',fontsize=24,fontweight="bold")
fig.text(0.125,0.72,"- There was an exponential rise from 2015 \n and Netflix had  over 2000+ content added by the year 2019", fontfamily='Arial',fontsize=12)
fig.text(0.125,0.68,"- It seems Netflix focused more on Movies than TV Shows", fontfamily='Arial',fontsize=12)

plt.show()

In [None]:
# Month names in calendar order; used as the ordered categories of the column
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# Convert the month-name column to an ordered categorical with that order
month_dtype = pd.CategoricalDtype(categories=month_order, ordered=True)
df['month_name_added'] = df['month_name_added'].astype(month_dtype)

In [None]:
#Distribution
# Titles added per month and type. After the cumulative sum across types, the
# 'TV Show' column holds TV shows only and the 'Movie' column holds
# TV shows + movies (the stacked total).
monthAdded = df.groupby('type')['month_name_added'].value_counts().unstack().fillna(0).loc[['TV Show','Movie']].cumsum(axis=0).T

#Plot
fig, ax = plt.subplots(1, 1, figsize=(14, 6))
color = ["#db0000", "#564d4d"]

# Draw the stacked total (Movie, red) first and the TV Show area (grey) on top.
# The order is fixed explicitly to match the hand-written legend below, so it
# no longer depends on which type is more frequent. The label is now the type
# name: it used to be the whole month_name_added column.
for i, contentType in enumerate(['Movie', 'TV Show']):
    contentSum = monthAdded[contentType]
    ax.fill_between(contentSum.index, 0, contentSum.values, color=color[i], label=contentType)

#Set Labels
ax.yaxis.tick_right()
ax.set_xticklabels(monthAdded.index, fontfamily='Arial', rotation=0)
ax.tick_params(axis=u'both', which=u'both',length=0)

# Remove border from plot
for s in ['right','top','bottom','left']:
    ax.spines[s].set_visible(False)

#Insights
ax.grid(False)
fig.text(0.155,0.825,"TV Shows |",fontfamily='Arial',fontsize=12,color="#564d4d",fontweight="bold")
fig.text(0.2325,0.825,"Movie",fontfamily='Arial',fontsize=12,color="#db0000",fontweight="bold")
fig.text(0.15,1,"Content added per month", fontfamily='Arial',fontsize=24,fontweight="bold")
fig.text(0.15,0.95,"- From October to January, Netflix has added most of its content", fontfamily='Arial',fontsize=12)
fig.text(0.15,0.91,"- There is a dip in the month of February", fontfamily='Arial',fontsize=12)
plt.show()

In [None]:
#Distribution
# Helper column so that summing per country yields the number of titles
df['count'] = 1
# Aggregate only the helper column. Selecting the grouping key 'country' as a
# value column too makes the aggregate carry its own 'country' column, which
# then collides with the index when reset_index() re-inserts it
# (NOTE(review): observed on current pandas releases - confirm on the target version).
data = df.groupby('country')['count'].sum().sort_values(ascending=False).reset_index()[:10]
data = data['country']  # names of the ten countries with the most titles

# Share of each target audience within every top country (columns sum to 1)
country_rating = df.loc[df['country'].isin(data)]
country_rating = pd.crosstab(country_rating['country'],country_rating['target_audience'],normalize = "index").T

# Fixed display order for the heatmap columns and rows; every name listed here
# must exist in country_rating, otherwise the .loc lookup below raises KeyError
country_order = ['USA', 'India', 'UK', 'Canada', 'Japan', 'France', 'S. Korea', 'Spain',
       'Mexico']

age_order = ['Kids','Older Kids','Teens','Adults']

#Plot
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ['#564d4d','#831010','#db0000'])
fig, ax = plt.subplots(1, 1, figsize=(12, 12))

sns.heatmap(country_rating.loc[age_order,country_order],cmap=cmap,square=True, linewidth=2.5,cbar=False,
            annot=True,fmt='1.0%',vmax=.6,vmin=0.05,ax=ax,annot_kws={"fontsize":12})

#Set Labels
ax.set_ylabel('')
ax.set_xlabel('')
ax.tick_params(axis=u'both', which=u'both',length=0)

#Insights
fig.text(0.13,0.75,"Target Audience distribution based on contents per country", fontfamily='Arial',fontsize=24,fontweight="bold")
fig.text(0.13,0.725,"- Interestingly most of the content in India is targeted to Teens", fontfamily='Arial',fontsize=12)
fig.text(0.13,0.705,"- Spain and Mexico are the countries with most of the content for Adults ", fontfamily='Arial',fontsize=12)
plt.show()

In [None]:
#Distribution
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ['#564d4d','#831010','#db0000'])

# Turn the list of descriptions into one string via its printed form, then
# delete the commas, square brackets, apostrophes and full stops in one pass
unwanted_chars = str.maketrans('', '', ",[]'.")
title = str(list(df['description'])).translate(unwanted_chars)

# Word cloud of the combined description text
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
wordcloud = WordCloud(
    background_color='white',
    width=500,
    height=200,
    colormap=cmap,
    max_words=150,
).generate(title)
plt.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
#Conclusion
# Text-only figure: the axes are hidden and two text lines are drawn on it
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
ax.axis('off')
fig.text(0.13, 0.75, "Conclusion", fontfamily='Arial', fontsize=30, fontweight="bold")
fig.text(
    0.13, 0.65,
    "Thanks to Joshua Swords for his wonderful notebook (https://www.kaggle.com/joshuaswords/netflix-data-visualization/)",
    fontfamily='Arial', fontsize=24, fontweight="regular",
)
plt.show()