In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from pandas import DataFrame as DF
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_files:
    print(file_path)



In [None]:
# Location of the Netflix titles CSV inside the Kaggle input directory.
path = '/kaggle/input/netflix-shows/netflix_titles.csv'

# `raw` holds the file exactly as loaded; `netflix` is the working frame that
# the cleaning cells modify. They must be two separate objects: with a plain
# `raw = netflix` both names point at the same frame, so any in-place cleaning
# step would silently alter `raw` as well.
raw = pd.read_csv(path)
netflix = raw.copy()

# preview
netflix.head()

# About the dataset

In [None]:
# Report the dataset size: columns are the features, rows the observations.
n_observations, n_features = netflix.shape
print(f'The dataset consists of: \n {n_features} features.')
print(f'{n_observations} observations.')

In [None]:
# Summary of the frame: column names, non-null counts and dtypes.
netflix.info()

Only 1 column is listed as 'int64'.
<br> To make a more in-depth exploration of the data, 'duration' can be converted from categorical to numerical type.<br>

## Null Values

In [None]:
# Number of missing values in each column.
missing_per_column = netflix.isna().sum()
missing_per_column

The 'director' column has a high proportion of missing data.
<br> It will be difficult to fill with a substitute value without introducing bias in the data.<br>

In [None]:
# For further exploration, the 'director', 'show_id' and 'cast' columns will
# not be considered. ('date_added' is kept: the year a title was added is
# derived from it later in the notebook.)
# The result is assigned back instead of dropping in place, and
# errors='ignore' is passed, so re-running this cell does not raise a
# KeyError for columns that have already been removed.
netflix = netflix.drop(columns=['director', 'show_id', 'cast'], errors='ignore')


In [None]:
print('Now the remaining null values are : ')

# Count the nulls once, then keep only the columns that still contain some.
null_counts = netflix.isna().sum()
null_counts[null_counts > 0]

In [None]:
# Let's display some of the observations whose 'country' value is null.
# The boolean mask is stored in `null_select` so the same rows can be
# selected again after the missing values have been filled.
null_select = netflix['country'].isna()
netflix.loc[null_select].head(5)

In [None]:
print('The mode for the country column is :')

# Most frequent value(s) of 'country'; missing values are excluded (the default).
netflix['country'].mode()

If we look at the titles for the dataset extract above, it does not seem convenient to replace the null values with 'United States'.
<br>Therefore, we will create a new category called 'International' that will group together the unknown countries.<br>

In [None]:
# Replace unknown countries with a dedicated 'International' category.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on `netflix.country` acts on an intermediate
# Series, which pandas warns about and which does not update the frame at all
# when Copy-on-Write is enabled.
netflix['country'] = netflix['country'].fillna('International')

# Show the rows that had a null country, now carrying the filled value.
netflix[null_select].head()

The null values were successfully replaced by 'International'.
<br>We shall drop the 7 observations for which the rating feature is null, together with any observations for which 'date_added' is null.<br>

In [None]:
# Remove, in place, every row that has no 'rating' or no 'date_added' value.
required_columns = ['rating', 'date_added']
netflix.dropna(axis=0, subset=required_columns, inplace=True)

# Let's check the dataset after tackling the null values.
netflix.info()

# Content distribution by release year

In [None]:
# Histogram of release years; the x-axis is cut off on the left at 1920.
plt.figure(figsize=(14, 5))
sns.histplot(data=netflix, x='release_year')
plt.xlim(left=1920)
plt.show()

# Tally the titles per release year once and report the busiest year.
titles_per_year = netflix['release_year'].value_counts()
print(f'Most of the content was released in the year : {titles_per_year.idxmax()}')
print(f'For that release year, a total of {titles_per_year.max()} contents is present on Netflix')

# Is the release year distribution same for both movies and TV shows?

In [None]:
# Two panels side by side on one 14x5 figure.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: density of release years, one curve per content type.
sns.kdeplot(data=netflix, x='release_year', hue='type', palette='mako', ax=ax1)

# Right panel: box plot of release years, given the same hue and palette.
sns.boxplot(data=netflix, x='release_year', hue='type', palette='mako', ax=ax2)

plt.show()

Both movies and TV shows follow a left skewed distribution and appear to peak in the same timeframe.
<br> Due to the extreme skewness, the release_year has a large number of outliers.<br>

# What type of content is being added the most ?

In [None]:
# Peek at the first three 'date_added' values before extracting the year.
netflix['date_added'].head(3)

In [None]:
# The year a title was added is taken from the last four characters of the
# 'date_added' string and stored in a new 'year_added' column.
# A vectorised .str pipeline replaces the row-by-row iterrows()/.loc loop and
# yields a genuine integer column; the loop produced floats, because the
# column was created one cell at a time and started out with missing values.
# Surrounding whitespace is stripped first so a stray trailing space cannot
# shift the four-character slice.
netflix['year_added'] = (
    netflix['date_added']
    .str.strip()
    .str[-4:]
    .astype(int)
)

netflix.year_added.iloc[:3]

In [None]:

# Titles added per (year, type) and per year; dividing the former by the
# latter gives each type's share of the yearly additions, in percent.
data = netflix.groupby(['year_added', 'type'])['title'].size().to_frame()
totals = netflix.groupby('year_added')['title'].size().to_frame()

# One column per content type after unstacking the innermost index level.
data_pct = (data.div(totals, level='year_added') * 100).unstack(-1)

# One figure with two panels: a line plot and a stacked area plot.
fig = plt.figure(figsize=(16, 5))
ax1, ax2 = fig.add_subplot(1, 2, 1), fig.add_subplot(1, 2, 2)

# Kept after the axes are created, as in the original cell ordering.
sns.set_theme()

# First viz: total number of titles added each year.
totals.plot.line(ax=ax1)
ax1.get_legend().remove()
ax1.set(title='Content addition by year',
        ylabel='Titles added',
        xlim=(2008, 2020))

# Second viz: yearly share of each content type, stacked to 100 %.
data_pct.plot.area(ax=ax2)
ax2.legend(['Movie', 'TV Show'])
ax2.set(ylabel='Percentage',
        title='Content addition by type - yearwise',
        xlim=(2008, 2020),
        ylim=(0, 100))

plt.show()

The yearly content addition has increased drastically since 2015. 
<br> The majority of the added content is movies, and this has been the case since 2008.<br>

# What are the top 5 countries for Movies  and TV shows?

In [None]:
# Number of titles for every (type, country) pair.
# The string 'size' selects pandas' built-in group-size aggregation; passing
# the NumPy callable np.size to agg() is deprecated in recent pandas releases.
# NOTE(review): 'country' is grouped as-is; if a row lists several countries
# in one string it is counted as its own category - confirm this is intended.
df = netflix
df_agg = df.groupby(['type', 'country']).agg({'title': 'size'})

# Within each content type, keep the five countries with the most titles.
group = df_agg['title'].groupby('type', group_keys=False)
res = group.apply(lambda x: x.sort_values(ascending=False).head(5))
DF(res)

United States leads in both the 'Movie' and 'TV Shows'.<br>


# Are the other countries catching up with the United States?

In [None]:
# For this analysis, we require 'type', 'country' and 'year_added'.
# Only titles added from 2016 up to and including 2020 are considered.

# Hard-coded country lists for each content type.
# NOTE(review): presumably copied from the top-5 table computed earlier in the
# notebook - re-check them if the underlying data changes.
Top5movie = ('United States', 'India', 'International', 'United Kingdom', 'Canada')
Top5tv = ('United States', 'International', 'United Kingdom', 'Japan', 'South Korea')


def _select_titles(content_type, countries):
    """Return the rows of `netflix` of the given type, restricted to the
    listed countries and to titles added in 2016-2020."""
    keep = (
        (netflix['year_added'] >= 2016)
        & (netflix['year_added'] < 2021)
        & (netflix['country'].isin(countries))
        & (netflix['type'] == content_type)
    )
    return netflix.loc[keep]


def _count_by_country_year(titles):
    """Return a flat frame with columns 'country', 'year_added' and 'title',
    where 'title' holds the number of rows in each (country, year) group."""
    return titles.groupby(['country', 'year_added'])['title'].size().reset_index()


def _plot_country_trend(counts, ax, heading):
    """Draw one line per country (titles added per year) on `ax`."""
    # `data` must be the frame itself: a string here is not a valid data
    # source and recent seaborn versions raise a TypeError for it.
    sns.lineplot(data=counts,
                 x='year_added',
                 y='title',
                 style='country',
                 hue='country',
                 palette='magma_r',
                 markers=True,
                 ax=ax)
    ax.set_ylabel('Titles added')
    ax.set_title(heading)
    ax.set_xlim(2016, 2020)


df_mov = _select_titles('Movie', Top5movie)
df_tv = _select_titles('TV Show', Top5tv)

mov_agg = _count_by_country_year(df_mov)
tv_agg = _count_by_country_year(df_tv)

fig = plt.figure(figsize=(16, 5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

# Movie Plot
_plot_country_trend(mov_agg, ax1, 'Movies')

# TV Show Plot
_plot_country_trend(tv_agg, ax2, 'TV Shows')

plt.show()

For Movies:
 <br> - India had almost caught up to the United States in terms of new contents in 2018.
 <br> - After 2018 new contents for the US went on increasing whereas for India were on the decline.
 <br> - The movie trend from 2019 to 2020 is on the decline overall illustrating the impact of the pandemic on movie releases.

 
 <br><br>For TV Shows:
 <br> - TV Shows from the US have been on a constant rise since 2016.
 <br> - The remaining countries seem to converge towards 50 new titles yearly.
 <br> - Surprisingly, the TV shows have managed to keep the increasing release trend for 2020 despite the pandemic situation.<br>

 

# Are movies getting lengthier?

In [None]:
# Selecting movies only. .copy() makes an independent frame, so the column
# assignment below does not write into a slice of `netflix`
# (which triggers pandas' SettingWithCopyWarning).
movie_len = netflix.loc[netflix['type'] == 'Movie'].copy()

# Extracting the leading run of digits from the 'duration' column as a float.
# - raw string: '\d' inside a normal string literal is an invalid escape sequence;
# - expand=False: return a Series rather than a one-column DataFrame.
movie_len['duration'] = (
    movie_len['duration']
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

# Subset limited to the two countries compared against each other.
movie_len_comp = movie_len.loc[movie_len['country'].isin(['United States', 'India'])]


In [None]:

# Two panels side by side on one 16x6 figure.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left: duration against release year for all movies.
sns.lineplot(data=movie_len, x='release_year', y='duration', ax=ax1)

# Right: the same plot for the comparison subset, one line per country.
sns.lineplot(data=movie_len_comp, x='release_year', y='duration', hue='country', ax=ax2)

# Titles and a shared 1970-2020 window on the x-axis.
ax1.set(title='Overall Movie Duration', xlim=(1970, 2020))
ax2.set(title='USA and India Movie Duration', xlim=(1970, 2020))

plt.show()

Overall, movie durations since 2010 have been quite stable at around 100 mins.
<br> Comparing the Top 2 countries in the movies category, movies of Indian origin were lengthier in the years 1990 - 2000. 
<br> Since 2000, the duration trend is decreasing from 160min (in 1999) to approx 120min (in 2019).

# What are the trending categories for the past 7 years?

We shall extract the categories from the 'listed_in' column.
<br> A single Title can have several tags which are combined in the 'listed_in' column.
<br> Some data manipulation will be required to extract the individual tags.

In [None]:
# Selecting the useful columns and turning each 'listed_in' string into a
# list of tags (the tags are separated by ', ').
cols = ['type', 'year_added', 'listed_in', 'country', 'description']
df = netflix.loc[:, cols].assign(
    listed_in=lambda frame: frame['listed_in'].str.split(', ')
)

# One row per (title, tag) pair.
df_xp = df.explode('listed_in')


def _five_largest(counts):
    """Return the five largest values of `counts`, in descending order."""
    return counts.sort_values(ascending=False).head(5)


# Grouping: number of rows per (type, tag), then the five biggest tags per type.
grp = df_xp.groupby(['type', 'listed_in']).size().to_frame('count')
grp_sort = DF(grp['count'].groupby(['type'], group_keys=False).apply(_five_largest))
grp_sort



In [None]:
# Turn the index levels of `grp_sort` into regular columns (done in place).
grp_sort.reset_index(inplace=True)

# Top5 Categories of each content type, as plain Python lists.
is_movie = grp_sort['type'] == 'Movie'
is_tv_show = grp_sort['type'] == 'TV Show'
mov_cat = grp_sort.loc[is_movie, 'listed_in'].tolist()
tv_cat = grp_sort.loc[is_tv_show, 'listed_in'].tolist()

In [None]:
# Yearly row counts for the selected movie categories. The 'description'
# column only serves as the column whose group sizes are taken.
mov_comp = df_xp[df_xp['listed_in'].isin(mov_cat)]
mov_grp = mov_comp.groupby(['listed_in', 'year_added'])['description'].size().to_frame()
mov_data = mov_grp.reset_index()

# Same aggregation for the selected TV show categories.
tv_comp = df_xp[df_xp['listed_in'].isin(tv_cat)]
tv_grp = tv_comp.groupby(['listed_in', 'year_added'])['description'].size().to_frame()
tv_data = tv_grp.reset_index()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Both panels share every plot setting except the data and the title.
panels = (
    (ax1, mov_data, 'Movie Categories'),
    (ax2, tv_data, 'TV Show Categories'),
)
for axis, counts, heading in panels:
    sns.lineplot(data=counts,
                 x='year_added',
                 y='description',
                 hue='listed_in',
                 style='listed_in',
                 markers=True,
                 palette='magma_r',
                 ax=axis)
    axis.set_title(heading)
    axis.set_xlim(2013, 2020)
    axis.set_ylabel('Content added')

plt.show()

Ignoring 'International Movies' and 'International TV Shows' :
<br> For both Movies and TV Shows, the leading 2 categories are 'Dramas' and 'Comedies'. <br>

# Drama TV Shows Wordcloud

In [None]:
# Imports
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Gathering all the words from the descriptions of rows tagged 'TV Dramas'.
tv_desc = df_xp[df_xp['listed_in'] == 'TV Dramas']

# Join with a single space: with an empty separator the last word of one
# description is glued to the first word of the next, producing bogus tokens
# in the cloud. Missing descriptions are skipped so the join cannot fail on NaN.
text = ' '.join(tv_desc['description'].dropna().astype(str))

# Start from the default stop-word list shipped with the wordcloud package.
stopwords = set(STOPWORDS)

wordcloud = WordCloud(stopwords=stopwords, background_color='white').generate(text)

# Display the generated image without axes.
plt.figure(figsize=(14, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
