## Netflix Exploratory Data Analysis
![netflix](https://www.popsci.com/resizer/O1PqrHRUbBYvZnwQKq0mW2DOiU0=/2068x1552/arc-anglerfish-arc2-prod-bonnier.s3.amazonaws.com/public/AUSBJ7SDRWXMD7VXVNJASUT6ME.jpg)

In [None]:
import pandas as pd
import numpy as np
import plotly
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sb
import missingno as mno

%matplotlib inline

In [None]:
# Load the Netflix titles dataset; index_col=False tells pandas not to use
# any CSV column as the DataFrame index.
netflix_df = pd.read_csv('../input/netflix-shows/netflix_titles.csv', index_col = False)

In [None]:
# Preview the first rows of the raw data.
netflix_df.head()

In [None]:
# missingno matrix plot: visual overview of where values are missing per column.
mno.matrix(netflix_df)

In [None]:
# Column dtypes and non-null counts for the raw data.
netflix_df.info()

In [None]:
# List the available column names.
netflix_df.columns

In [None]:
# Drop show_id (it provides no insight) and director/cast (dropped because of
# their high share of NaN values); drop() returns a new frame, netflix_df is untouched.
clean_df = netflix_df.drop(columns=['show_id', 'director', 'cast'])

In [None]:
# Check dtypes and non-null counts after dropping columns.
clean_df.info()

In [None]:
# Year in which each title was added to Netflix.
# - Surrounding whitespace is stripped first so every date string parses the
#   same way regardless of how pandas infers the format.
# - Missing dates become NaN and are filled with 2009: the author's note is
#   that this fills 10 NaN values from TV Shows with a year that actually had
#   0 TV Shows added.
# - The result is stored as int (not object) so it sorts and plots as a number.
clean_df['year'] = (
    pd.to_datetime(clean_df['date_added'].str.strip())
    .dt.year
    .fillna(2009)
    .astype(int)
)
clean_df.info()

In [None]:
# Movies only. An explicit copy is taken because later cells assign to
# movies['duration']; writing to a bare slice of clean_df would trigger
# pandas' SettingWithCopyWarning.
movies = clean_df.loc[clean_df['type'] == 'Movie'].copy()

In [None]:
# TV shows only: keep the rows whose 'type' column equals 'TV Show'.
tv_shows = clean_df[clean_df['type'] == 'TV Show']

In [None]:
# TV show counts per 'country' value, keeping only values that make up more
# than 0.5% of all counted TV shows.
tv_country = tv_shows['country'].value_counts()
tv_country = tv_country[tv_country / tv_country.sum() > 0.005]
tv_country_list = tv_country.index.tolist()

In [None]:
# Horizontal bar chart of TV show counts per country.
# The seaborn style is set before the figure is created so that it applies to
# this plot rather than only to later ones.
sb.set_style('white')
plt.figure(figsize = (30,15))
plt.title('TV Show count per Country', fontsize=22)
plt.tick_params(labelsize = 16)
sb.barplot(y = tv_country.index, x = tv_country.values, palette="Blues_d")
# plt.show must be called; a bare `plt.show` only references the function.
plt.show()

In [None]:
# Movie counts per 'country' value, keeping only values that make up more
# than 0.5% of all counted movies.
movie_country = movies['country'].value_counts()
movie_country = movie_country[movie_country / movie_country.sum() > 0.005]
movie_country_list = movie_country.index.tolist()

In [None]:
# Horizontal bar chart of movie counts per country.
# The seaborn style is set before the figure is created so that it applies to
# this plot rather than only to later ones.
sb.set_style('white')
plt.figure(figsize = (30,15))
plt.title('Movie count per Country', fontsize=22)
plt.tick_params(labelsize = 16)
sb.barplot(y = movie_country.index, x = movie_country.values, palette="Oranges_d")
# plt.show must be called; a bare `plt.show` only references the function.
plt.show()

In [None]:
# Number of TV shows: counts rows per value of 'type' in tv_shows.
tv_shows['type'].value_counts()

In [None]:
# Number of movies: counts rows per value of 'type' in movies.
movies['type'].value_counts()

In [None]:
# Remove the text 'min' from duration so the column can be converted to a
# number and averaged. regex=True makes replace() substitute inside each string
# instead of matching whole cell values.
movies['duration'] = movies['duration'].replace({'min':''}, regex=True)

In [None]:
# Convert the cleaned duration strings to a numeric dtype.
movies['duration'] = pd.to_numeric(movies['duration'])

In [None]:
# Titles per release year (ordered by year) for TV shows and for movies,
# followed by a preview of the first entries of each series.
tvrelease_year, movierelease_year = (
    subset['release_year'].value_counts().sort_index()
    for subset in (tv_shows, movies)
)
print(tvrelease_year.head(), movierelease_year.head(), sep='\n')

In [None]:
# Line chart: number of TV shows and movies per release year.
# The style is set before the figure is created so all style parameters apply.
sb.set_style('darkgrid')
plt.figure(figsize = (16, 5))
plt.plot(tvrelease_year.index, tvrelease_year.values, color = 'b', label = 'TV Shows')
plt.plot(movierelease_year.index, movierelease_year.values, color = 'orange', label = 'Movies')

# showing the drop-off in production during 2020 due to COVID
plt.axvline(x = 2020, color = 'salmon', label = 'COVID-19')

plt.xticks(list(range(1925, 2022, 4)), fontsize = 12)
plt.yticks(fontsize = 12)
plt.title('TV Shows and Movies per year', fontsize = 18)
plt.xlabel('Year', fontsize = 14)
plt.ylabel('Release Count', fontsize = 14)
plt.legend()
# plt.show must be called; a bare `plt.show` only references the function.
plt.show()

In [None]:
# Titles per 'year' (year added to Netflix), ordered by year, for TV shows
# and for movies.
tv_added, movie_added = (
    subset['year'].value_counts().sort_index()
    for subset in (tv_shows, movies)
)

In [None]:
# Line chart: number of TV shows and movies added to Netflix per year.
# The style is set before the figure is created so all style parameters apply.
sb.set_style('darkgrid')
plt.figure(figsize = (16, 5))
plt.plot(tv_added.index, tv_added.values, color = 'b', label = 'TV Shows added')
plt.plot(movie_added.index, movie_added.values, color = 'orange', label = 'Movies added')

# marking 2020 (COVID-19) as a reference point on the additions timeline
plt.axvline(x = 2020, color = 'salmon', label = 'COVID-19')

plt.xticks(list(range(2008, 2022, 1)), fontsize = 12)
plt.yticks(fontsize = 12)
plt.title('TV Shows and Movies added per year', fontsize = 18)
plt.xlabel('Year', fontsize = 14)
plt.ylabel('Content added count', fontsize = 14)
plt.legend()
# plt.show must be called; a bare `plt.show` only references the function.
plt.show()

In [None]:
# Two-column frame used for the duration analysis: release year and duration.
rel_dur = movies.loc[:, ['release_year', 'duration']]

In [None]:
# Mean movie duration for each release year, ordered by year.
avg_dur = rel_dur.groupby('release_year').mean().sort_index()

In [None]:
# Line chart: average movie duration per release year.
# The style is set before the figure is created so all style parameters apply.
sb.set_style('white')
plt.figure(figsize = (16, 5))
plt.plot(avg_dur.index, avg_dur.values, color = 'green', label = 'Movie Duration')

plt.xticks(list(range(1940, 2022, 4)), fontsize = 12)
plt.yticks(fontsize = 12)
plt.title('Average Movie Duration since 1943', fontsize = 18)
plt.xlabel('Year', fontsize = 14)
plt.ylabel('Average Duration', fontsize = 14)
plt.legend()
# plt.show must be called; a bare `plt.show` only references the function.
plt.show()

In [None]:
# Rating table for movies: one 0/1 indicator column per rating, plus a
# per-row 'Total' of those indicators.
rating_labels = ['TV-MA', 'TV-14', 'R', 'TV-PG', 'PG-13', 'PG',
                 'TV-Y', 'TV-G', 'TV-Y7', 'NR', 'G']

# dropna() returns a new frame, so its result has to be kept for rows with a
# missing country or rating to be removed. The explicit copy makes rat_df an
# independent frame, so adding columns below does not raise
# SettingWithCopyWarning.
rat_df = movies[['country', 'rating']].dropna().copy()

for rating_label in rating_labels:
    rat_df[rating_label] = np.where(rat_df['rating'] == rating_label, 1, 0)

# Only the ratings listed above are counted; a row with any other rating
# gets a Total of 0.
rat_df['Total'] = rat_df[rating_labels].sum(axis=1)

In [None]:
# grouping country to sum each rating
# numeric_only=True restricts the sum to the 0/1 indicator and Total columns;
# without it, pandas versions that sum object columns would concatenate the
# text 'rating' column into the result.
rat_country = rat_df.groupby(pd.Grouper(key='country')).sum(numeric_only=True)

In [None]:
# Order countries by their 'Total' column, largest first.
rat_country = rat_country.sort_values('Total', ascending=False)
# top 20 countries and their ratings
rat_country.head(20)

In [None]:
# Share of each rating per country (each rating count divided by the
# country's Total).
rating_columns = ['TV-MA', 'TV-14', 'R', 'TV-PG', 'PG-13', 'PG',
                  'TV-Y', 'TV-G', 'TV-Y7', 'NR', 'G']

# Work on a copy: a plain `per_country = rat_country` would only create a
# second name for the same frame, and the assignments below would overwrite
# the counts in rat_country with fractions.
per_country = rat_country.copy()
# Row-wise division: every rating column is divided by that row's Total.
per_country[rating_columns] = rat_country[rating_columns].div(rat_country['Total'], axis=0)
# The shares of one country add up to the whole, so Total is set to 1.
per_country['Total'] = 1

In [None]:
# Show the first 20 rows of the per-country rating shares.
per_country.head(20)

In [None]:
# Stacked bar chart: rating mix for the first 15 countries in per_country.
# Each rating segment starts where the previous one ended (`bottom`), so the
# segments stack instead of being drawn over each other from zero.
stack_order = ['TV-MA', 'TV-14', 'R', 'TV-PG', 'PG-13', 'PG',
               'TV-Y', 'TV-G', 'TV-Y7', 'NR', 'G']
top_countries = per_country.iloc[:15]
labels = top_countries.index

fig, ax = plt.subplots(figsize=(20, 10))

# 'tab20' provides 20 distinct colours, so none of the 11 ratings share one.
palette = plt.get_cmap('tab20')
bottom = np.zeros(len(top_countries))
for position, rating in enumerate(stack_order):
    # NaN shares (a country whose Total is 0) are drawn as empty segments.
    shares = np.nan_to_num(top_countries[rating].to_numpy(dtype=float))
    ax.bar(labels, shares, .6, bottom=bottom, label=rating, color=palette(position))
    bottom = bottom + shares

# The plotted values are fractions of a country's movies, not raw counts.
ax.set_ylabel('Share of movies', fontsize = 18)
ax.set_xlabel("Countries", fontsize = 18)
plt.yticks(fontsize = 14)
plt.xticks(fontsize = 16, rotation = 70)
ax.set_title('Movie rating per country', fontsize = 24)
ax.legend()

plt.show()

## Final Thoughts:
Given the COVID-19 pandemic, it seems that Netflix worked to bring more TV shows to its audience, while there was an obvious drop in the number of productions released in 2020.  
It is always interesting to see the drop off in movie duration over the years as well, and it seems to still be trending downwards.  
It is no surprise that the U.S. leads in production. It was very cool to see how the ratings split up between the countries: the U.S. was heavy in TV-MA but fairly spread out across the other ratings, whereas in other countries like Spain 83% of the movies were rated TV-MA.