## Netflix Data Visualization

Hi! I made this notebook to practice data visualization. If you find it helpful, please upvote! :)


## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import warnings
# Suppress every warning (library deprecation notices included) so cell output stays clean.
warnings.filterwarnings('ignore')
# Apply seaborn's plain 'white' style to all charts drawn below.
sns.set_style('white')

## Loading data

In [None]:
# Read the Netflix titles CSV into the notebook-wide frame and preview the first rows.
netflix_csv = '../input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(netflix_csv)
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

## Data Visualization

### Movies vs TV Shows 

As we can see below, movies make up a noticeably larger share of the titles on Netflix than TV shows.

In [None]:
# Count titles per type. groupby sorts its keys, so rows come out ordered by type name.
types_df = df.groupby(['type']).size().reset_index(name='counts')
# Scale to percent *before* rounding. Rounding the fraction first and multiplying by 100
# afterwards can produce values such as 28.999999999999996, which any later int()
# conversion would truncate to 28 instead of 29.
types_df['percent'] = (types_df['counts'] / types_df['counts'].sum() * 100).round()

In [None]:
# Quick look at the percentage stored in the first row (label 0) of the summary table.
types_df.loc[0, 'percent']

In [None]:
# Horizontal bar chart: one bar per attraction type, bar length = share in percent.
plt.figure(figsize=(12,6))
g = sns.barplot(x='percent',y='type',palette="rocket_r", data=types_df, orient='h')

g.text(0, -0.7, 'Attraction type in Netflix by percent (%)', 
       fontsize=14, fontweight='bold', fontfamily='Ubuntu',color='black')

# Annotate every bar from the data itself instead of assuming exactly two rows in a
# fixed Movie / TV Show order. round() is used rather than int() so that a value such
# as 28.999999999999996 is shown as 29% and not truncated to 28%.
for position, (type_name, percent) in enumerate(zip(types_df['type'], types_df['percent'])):
    g.annotate(f'{round(percent)}%',
               xy=(percent/2, position),
               ha='center', va='center', fontsize=50, fontweight='bold',
               fontfamily='Ubuntu', color='white')
    g.annotate(type_name,
               xy=(percent/2, position+0.2),
               ha='center', va='center', fontsize=12, fontweight='bold',
               fontfamily='Ubuntu', color='white')

# The numbers are written on the bars, so the frame, ticks and axis labels are removed.
for side in ['top', 'left', 'right', 'bottom']:
    g.spines[side].set_visible(False)

g.set_xlim(0,100)
g.set(xticklabels=[],yticklabels=[])
plt.ylabel('')
plt.xlabel('');

### Attractions ratings
We can also compare how movies and TV shows are distributed across the content ratings!

In [None]:
# Count of titles per content rating, split by attraction type.
plt.figure(figsize=(14,7))
# Columns are passed by keyword: in current seaborn releases the only positional
# parameter of countplot is `data`, so a positional df['rating'] is no longer read as x.
g = sns.countplot(x='rating', hue='type', data=df, palette="rocket_r")
g.text(0, 2200, 'Rating by attraction type in Netflix', 
       fontsize=18, fontweight='bold', fontfamily='Ubuntu',color='black')

# Keep only the bottom spine so the rating labels still sit on a baseline.
for side in ['top', 'left', 'right']:
    g.spines[side].set_visible(False)

plt.legend(loc='upper right',frameon=False,prop={'size': 15,'family':'Ubuntu'})
plt.xlabel('Rating',fontfamily='Ubuntu')
plt.ylabel('');

### Movies and TV Shows production
And if you wonder which countries have the most movie production on Netflix...

In [None]:
# Keep only movies, then rank the values of the 'country' column by number of titles.
is_movie = df['type'] == 'Movie'
movies_df = df[is_movie]
top5countries_movies = (
    movies_df.groupby(['country'])
    .size()
    .sort_values(ascending=False)
    .head(5)
)

plt.figure(figsize=(12, 6))
g = sns.barplot(x=top5countries_movies.index, y=top5countries_movies, palette='rocket')
g.text(0, 2200, 'Top 5 countries in Netflix movie production',
       fontsize=18, fontweight='bold', fontfamily='Ubuntu', color='black')

# Only the bottom spine stays visible.
for side in ('top', 'left', 'right'):
    g.spines[side].set_visible(False)

# Write each bar's count just above it; the y tick labels are removed below,
# so these numbers are the only place the values are shown.
for bar in g.patches:
    bar_height = bar.get_height()
    g.text(bar.get_x() + bar.get_width()/2.5,
           bar_height + 60,
           round(bar_height),
           fontsize='18',
           fontfamily='Ubuntu')

g.set(yticklabels=[])
plt.xlabel('')
plt.ylabel('')

Looks like USA and India are doing a good job. Hollywood and Bollywood ftw?

Looking at the top 5 countries again, this time for TV shows, the chart below shows that the USA still leads in production!

In [None]:
# Keep only TV shows, then rank the values of the 'country' column by number of titles.
is_tv_show = df['type'] == 'TV Show'
tvshows_df = df[is_tv_show]
top5countries_tvshows = (
    tvshows_df.groupby(['country'])
    .size()
    .sort_values(ascending=False)
    .head(5)
)

plt.figure(figsize=(12, 6))
g = sns.barplot(x=top5countries_tvshows.index, y=top5countries_tvshows, palette='rocket')
g.text(0, 900, 'Top 5 countries in netflix TV Show production',
       fontsize=18, fontweight='bold', fontfamily='Ubuntu', color='black')

# Only the bottom spine stays visible.
for side in ('top', 'left', 'right'):
    g.spines[side].set_visible(False)

# Write each bar's count just above it; the y tick labels are removed below,
# so these numbers are the only place the values are shown.
for bar in g.patches:
    bar_height = bar.get_height()
    g.text(bar.get_x() + bar.get_width()/2.5,
           bar_height + 60,
           round(bar_height),
           fontsize='18',
           fontfamily='Ubuntu')

g.set(yticklabels=[])
plt.xlabel('')
plt.ylabel('')

Have you ever wondered if over time Netflix invests more in movies or TV shows?

In the chart below, you'll notice that the number of movies and TV shows added to Netflix kept growing year after year. In 2020, however, the number of movies added dropped, perhaps because of the pandemic or simply because of reduced investment in movies, while TV show additions kept increasing.

In [None]:
# Parse the 'date_added' column into datetimes and keep the calendar year for grouping.
# Both columns are written back to df because later cells read them.
df['date_added'] = pd.to_datetime(df['date_added'])
df['year_added'] = df['date_added'].dt.year
index = [2014,2015,2016,2017,2018,2019,2020]

# Restrict to additions made after 2013 and before 2021, then count titles per
# (type, year) and pivot so that rows are years and columns are types.
in_window = (df['year_added'] > 2013) & (df['year_added'] < 2021)
ts_df = (
    df[in_window]
    .groupby('type')['year_added']
    .value_counts()
    .unstack()
    .T
)

fig, ax = plt.subplots(1, 1, figsize=(12, 6))
fig.text(0.15, 1, 'Amount of Movies and TV Shows added over years',
         fontsize=18, fontweight='bold', fontfamily='Ubuntu', color='black')

# One line per attraction type, both drawn on the axes created above.
g1 = sns.lineplot(x=ts_df.index, y=ts_df['Movie'], color='#593262', label='Movie')
g2 = sns.lineplot(x=ts_df.index, y=ts_df['TV Show'], color='#db6e59', label='TV Show')

for side in ('top', 'left', 'right'):
    ax.spines[side].set_visible(False)

ax.legend(loc='upper left', frameon=False, prop={'size': 15, 'family': 'Ubuntu'})
plt.xlabel('')
plt.ylabel('')

In [None]:
def quarter_expression(i):
    """Return the display label for quarter number ``i``.

    1, 2 and 3 map to '1st Quarter', '2nd Quarter' and '3rd Quarter';
    every other value falls through to '4th Quarter'.
    """
    named_quarters = ((1, '1st Quarter'), (2, '2nd Quarter'), (3, '3rd Quarter'))
    for number, label in named_quarters:
        if i == number:
            return label
    return '4th Quarter'
    
# Share of titles added in each calendar quarter, drawn as one stacked horizontal bar.
df['quarter_added'] = df['date_added'].dt.quarter
qrt_df = df['quarter_added'].value_counts()
prct_qrt = pd.DataFrame(round(qrt_df/sum(qrt_df),2)).T

# prct_qrt has a single row, so each quarter column holds exactly one value. Pull the
# four shares out as plain floats: the plotting and formatting code below then works
# with scalars instead of one-element Series.
quarter_shares = [float(prct_qrt[quarter].iloc[0]) for quarter in range(1, 5)]
quarter_colors = ['#593262', '#772b58', '#b53158', '#db6e59']

fig, ax = plt.subplots(1,1,figsize=(12, 4))

fig.text(0.15,1,'Percentage of attractions added per quarter', 
         fontsize=18, fontweight='bold', fontfamily='Ubuntu',color='black')

left_edge = 0.0  # running total of the shares already drawn = left edge of the next segment
for quarter, (share, color) in enumerate(zip(quarter_shares, quarter_colors), start=1):
    ax.barh(prct_qrt.index, share, color=color, alpha=0.9, left=left_edge)

    segment_center = left_edge + share/2
    # round() instead of int(): 0.29 * 100 evaluates to 28.999999999999996, which
    # int() would truncate to 28.
    ax.annotate(f'{round(share*100)}%',
                xy=(segment_center, 0),  # the single bar sits at y position 0
                ha='center', va='center', fontsize=40, fontweight='bold',
                fontfamily='Ubuntu', color='white')
    ax.annotate(quarter_expression(quarter),
                xy=(segment_center, -0.25),
                ha='center', va='center', fontsize=15, fontweight='bold',
                fontfamily='Ubuntu', color='white')

    left_edge += share

# The values are written on the bar itself, so the frame and tick labels are removed.
for side in ['top', 'left', 'right','bottom']:
    ax.spines[side].set_visible(False)

ax.set(yticklabels=[],xticklabels=[])
ax.set_xlim(0, 1);

### Genres

Here is an interesting word cloud of movie and TV show genres! It seems that romantic and comedy productions are very popular on the platform, right?

In [None]:
# Turn the comma-separated 'listed_in' string into a list of genre names per title:
# drop the "TV" qualifier, remove the spaces around commas, then split on commas.
df['genre'] = df['listed_in'].apply(
    lambda x: x.replace(' TV','').replace('TV ','').replace(' ,',',').replace(', ',',').split(',')
)

# Extend the default stop words with the generic terms listed here so they are
# left out of the cloud.
stopwords = set(STOPWORDS)
stopwords.update(['Shows','Movies','British','International'])

# Flatten every title's genre list into one space-separated text for the word cloud.
all_summary = ' '.join(' '.join(s) for s in df['genre'])
wordcloud = WordCloud(stopwords=stopwords,
                      background_color='white',
                      colormap='rocket',
                      width=1600, height=800).generate(all_summary)

fig, ax = plt.subplots(figsize=(10,6))

fig.text(0.15,1,'Word cloud of Netflix attractions genres', 
         fontsize=18, fontweight='bold', fontfamily='Ubuntu',color='black')

# Draw the cloud once on the explicit axes. The former trailing plt.imshow(wordcloud)
# painted the same image a second time on these axes and has been removed.
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_axis_off()

### Released year
We saw some information about adding films and TV shows over time, but maybe (or not) you wondered if they are recent productions.

Given the year a movie or tv show is produced, how long does Netflix usually take to put it in the catalog?

First, let's see a chart representing the release year of productions.

In [None]:
# Number of titles per release year.
plt.figure(figsize = (12,6))
# The column is passed by keyword: in current seaborn releases the only positional
# parameter of countplot is `data`, so a positional df['release_year'] is no longer read as x.
g = sns.countplot(x='release_year', data=df, palette='rocket_r')

g.text(0.15,1100,'Release year of Netflix attractions', 
         fontsize=18, fontweight='bold', fontfamily='Ubuntu',color='black')

for side in ['top', 'left', 'right']:
    g.spines[side].set_visible(False)

# Counts are read off the right-hand side; only every 9th bar position gets an x tick
# so the year labels do not overlap.
g.yaxis.tick_right()
g.xaxis.set_major_locator(ticker.MultipleLocator(9))
plt.ylabel('')
plt.xlabel('');

It looks like we have a production from 1925. Damn, almost a century old! Let's see which title this is and what type it is.



In [None]:
# Show the title and type of every row whose release year is 1925.
df.loc[df['release_year'] == 1925, ['title', 'type']]

Well, the title says it all.

Continuing our observations, I think it would be a good idea to compare the average year in which titles are released with the average year in which they are added to Netflix!



In [None]:
def _rounded_mean(kind, column):
    """Return the mean of ``column`` over the rows of the given type, rounded with round()."""
    return round(df.loc[df['type'] == kind, column].mean())

# Element order matters for the chart cell that follows: index 0 is TV Show, index 1 is Movie.
g1_x = [_rounded_mean(kind, 'release_year') for kind in ('TV Show', 'Movie')]
g2_x = [_rounded_mean(kind, 'year_added') for kind in ('TV Show', 'Movie')]


In [None]:
# Dumbbell chart: for each attraction type, one dot at the mean release year and one
# at the mean year added, joined by a grey segment. TV shows sit at y=1, movies at y=0.5.
fig, ax = plt.subplots(1,1, figsize=(10,5))

# Everything is drawn through the explicit axes object rather than the pyplot state machine.
ax.plot([g1_x[0],g2_x[0]],[1,1],color='gray',linewidth=3)
ax.plot([g1_x[1],g2_x[1]],[0.5,0.5],color='gray',linewidth=3)
ax.plot(g1_x, [1,0.5], 'o',markersize=20,color='#593262',label='Release year')
ax.plot(g2_x, [1,0.5],'o', markersize=20,color='#b53158', label='Year added')

# Title typo fixed: 'Comparation' -> 'Comparison'.
fig.text(0.15,0.9,'Comparison of production release year and year added on Netflix', 
         fontsize=18, fontweight='bold', fontfamily='Ubuntu',color='black')

for side in ['top', 'left', 'right']:
    ax.spines[side].set_visible(False)

# Row captions are placed at fixed x positions near the right edge of the x range set below.
ax.annotate('TV Shows', 
            xy=(2020, 1),
            ha = 'center', va='center',fontsize=15, fontweight='heavy', 
            fontfamily='Ubuntu', color='black')

ax.annotate('Movies', 
            xy=(2019.9, 0.5),
            ha = 'center', va='center',fontsize=15, fontweight='heavy', 
            fontfamily='Ubuntu', color='black')

ax.set(yticklabels=[])
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))  # one tick per year
ax.legend(bbox_to_anchor=(0.28, 0.8),frameon=False,prop={'size': 17,'family':'Ubuntu'})
ax.set_ylim(0,3)
ax.set_xlim(2011,2020);
The difference between the year movies are added on Netflix and the year they are made is greater than that of TV shows.