# Netflix Project

In [17]:
# Import libraries
import pandas as pd
import numpy as np
import calendar
import plotly.express as px
import plotly.graph_objects as go

pd.options.display.max_rows = 200

In [18]:
# Load the Netflix titles dataset from a CSV file (path is relative to the working directory)
netflix = pd.read_csv('netflix_titles.csv')

## 0. Dataset Overview

In [19]:
# Check the size of the dataset as a (number of rows, number of columns) tuple
netflix.shape

(8807, 12)

In [20]:
# Preview the first 5 rows (the default number of rows returned by head())
netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [21]:
# Summary statistics for every column; include='all' also covers the non-numeric columns
netflix.describe(include='all')

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
count,8807,8807,8807,6173,7982,7976,8797,8807.0,8803,8804,8807,8807
unique,8807,2,8807,4528,7692,748,1767,,17,220,514,8775
top,s1,Movie,Dick Johnson Is Dead,Rajiv Chilaka,David Attenborough,United States,"January 1, 2020",,TV-MA,1 Season,"Dramas, International Movies","Paranormal activity at a lush, abandoned prope..."
freq,1,6131,1,19,19,2818,109,,3207,1793,362,4
mean,,,,,,,,2014.180198,,,,
std,,,,,,,,8.819312,,,,
min,,,,,,,,1925.0,,,,
25%,,,,,,,,2013.0,,,,
50%,,,,,,,,2017.0,,,,
75%,,,,,,,,2019.0,,,,


In [22]:
# Count the missing (NA) values in each column
netflix.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [23]:
# Print each column's dtype and non-null count, plus the frame's memory usage
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [24]:
# Convert date_added from text to datetime, then display the resulting dtypes
# Leading/trailing spaces are stripped from the raw strings before they are parsed
date_added_text = netflix['date_added'].str.strip()
netflix['date_added'] = pd.to_datetime(date_added_text)
netflix.dtypes

show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object

In [25]:
# Summarize date_added; the min and max rows give the date range covered by the dataset
netflix['date_added'].describe() # January 2008 to September 2021

count                             8797
mean     2019-05-17 05:59:08.436967168
min                2008-01-01 00:00:00
25%                2018-04-06 00:00:00
50%                2019-07-02 00:00:00
75%                2020-08-19 00:00:00
max                2021-09-25 00:00:00
Name: date_added, dtype: object

## 1. Genres on the platform

In [26]:
# Data cleaning and processing
# listed_in is a ', '-separated string of genres: turn it into a list of genres per title
netflix['genre_list'] = netflix['listed_in'].str.split(', ')
# One row per (title, genre) pair
netflix_exploded_1 = netflix.explode('genre_list')
# Distinct genres in alphabetical order
unique_genres = sorted(netflix_exploded_1['genre_list'].unique().tolist())
print("The list of unique genres on Netflix is:", unique_genres)
print("The number of unique genres on Netflix is:", len(unique_genres)) # 42 distinct genres

The list of unique genres on Netflix is: ['Action & Adventure', 'Anime Features', 'Anime Series', 'British TV Shows', 'Children & Family Movies', 'Classic & Cult TV', 'Classic Movies', 'Comedies', 'Crime TV Shows', 'Cult Movies', 'Documentaries', 'Docuseries', 'Dramas', 'Faith & Spirituality', 'Horror Movies', 'Independent Movies', 'International Movies', 'International TV Shows', "Kids' TV", 'Korean TV Shows', 'LGBTQ Movies', 'Movies', 'Music & Musicals', 'Reality TV', 'Romantic Movies', 'Romantic TV Shows', 'Sci-Fi & Fantasy', 'Science & Nature TV', 'Spanish-Language TV Shows', 'Sports Movies', 'Stand-Up Comedy', 'Stand-Up Comedy & Talk Shows', 'TV Action & Adventure', 'TV Comedies', 'TV Dramas', 'TV Horror', 'TV Mysteries', 'TV Sci-Fi & Fantasy', 'TV Shows', 'TV Thrillers', 'Teen TV Shows', 'Thrillers']
The number of unique genres on Netflix is: 42


In [27]:
# Visualization
# Count how many titles are listed under each genre (value_counts sorts by descending count)
genre_counts = netflix_exploded_1['genre_list'].value_counts().reset_index()
genre_counts.columns = ['genre', 'num_shows']

# Keep the 10 most frequent genres so the chart matches its "Top 10" title
# (the previous iloc[0:9] slice only selected 9 rows)
top_10_genres = genre_counts.head(10)

# category_orders keeps the bars in descending-count order instead of plotly's default ordering
fig = px.bar(top_10_genres, y='genre', x='num_shows', orientation='h', color_discrete_sequence=["purple"],
       category_orders= {'genre': genre_counts['genre'].tolist()}, title='Top 10 genres by number of shows on Netflix')

fig.update_layout(xaxis_title='Number of shows', showlegend=False, autosize=False)
fig.show()


Observations:
- We can see that the top genre is International Movies, which corresponds to movies produced outside the USA with a predominantly non-English dialogue track. This is not surprising: Netflix streams worldwide, so it needs to include international movies to meet the needs of its non-US customer base.
- Then come Dramas and Comedies.

## 2. Production countries

In [28]:
# Data cleaning and processing
# Remove leading/trailing commas first, then leading/trailing spaces
netflix['country'] = netflix['country'].str.strip(',').str.strip(' ')
# country is a ', '-separated string: turn it into a list of countries per title
netflix['country_list'] = netflix['country'].str.split(', ')
# One row per (title, production country) pair
netflix_exploded_2 = netflix.explode('country_list')
production_countries = netflix_exploded_2['country_list']
nb_unique_countries = production_countries.nunique()
print(f"There are {nb_unique_countries} unique production countries.")
# Missing countries are dropped before building the alphabetical list
unique_countries = sorted(production_countries.dropna().unique().tolist())
print("The alphabetically sorted list of production countries is:", unique_countries)

There are 122 unique production countries.
The alphabetically sorted list of production countries is: ['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bangladesh', 'Belarus', 'Belgium', 'Bermuda', 'Botswana', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Cambodia', 'Cameroon', 'Canada', 'Cayman Islands', 'Chile', 'China', 'Colombia', 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic', 'East Germany', 'Ecuador', 'Egypt', 'Ethiopia', 'Finland', 'France', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kuwait', 'Latvia', 'Lebanon', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Malawi', 'Malaysia', 'Malta', 'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Namibia', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragu

In [29]:
# Visualization
# Count titles per production country; dropna=False keeps the titles without a country as their own row
country_counts = netflix_exploded_2['country_list'].value_counts(dropna=False).reset_index()
country_counts.columns = ['country', 'num_shows']
# Replace NA values by "Unknown". The result is assigned back to the column because calling
# fillna(..., inplace=True) on a selected column is chained assignment: it raises a warning in
# recent pandas versions and does not update the DataFrame when Copy-on-Write is enabled.
country_counts['country'] = country_counts['country'].fillna('Unknown')

top_10_countries = country_counts.iloc[0:10, :] # extract top 10

# Show the "Unknown" category as red wherever it ranks, instead of assuming a fixed bar position
colors = ['crimson' if country == 'Unknown' else 'lightslategray'
          for country in top_10_countries['country']]

fig = go.Figure(data=[go.Bar(
    x=top_10_countries['country'],
    y=top_10_countries['num_shows'],
    marker_color=colors, 
    text = top_10_countries['num_shows'],
    textposition='outside'
)])

fig.update_layout(title_text='Top 10 production countries among Netflix content', autosize=False, yaxis=dict(range=[0,4000]), 
                  xaxis_title='Production Country', yaxis_title='Number of shows')
fig.show()

Observations:
- Among Netflix content, the top production country is the USA with around 4000 shows out of 8807, which corresponds to almost half of the content on Netflix. 
- India is in second position with around 1000 shows, which is about 1/4 of the US count. 
- We have 831 shows in the dataset with an unknown production country. This value is quite significant compared with the other numbers.
- Then the rest of the countries are essentially Western countries, except South Korea, which appears due to the increased popularity of K-dramas worldwide.

In [32]:
# World map colored by the number of shows per production country.
# Locations are matched on the country names stored in the 'country' column.
choropleth_options = dict(
    locations='country',
    locationmode='country names',
    color='num_shows',
    title='Production Countries among Netflix Content',
    labels={'num_shows': 'Number of shows'},
)
fig = px.choropleth(country_counts, **choropleth_options)

fig.update_layout(autosize=False)
fig.show()

Observations:
- Looking at the map, we can see that almost all countries in the world contribute to the production of shows available on Netflix. The exception lies in Africa especially West and Central Africa.

## 3. Dominant genre per country

In [None]:
# Data processing
# Start from the per-country rows and explode genres too: one row per (title, country, genre)
netflix_exploded_3 = netflix_exploded_2.explode('genre_list')

# Number of rows for each (country, genre) pair. The count column is named directly and the
# key columns are renamed explicitly rather than by overwriting all column labels by position.
genre_country_counts = (
    netflix_exploded_3
    .groupby(['country_list', 'genre_list'])
    .size()
    .reset_index(name='num_shows')
    .rename(columns={'country_list': 'country', 'genre_list': 'genre'})
)
genre_country_counts

Unnamed: 0,country,genre,num_shows
0,Afghanistan,Documentaries,1
1,Afghanistan,International Movies,1
2,Albania,Dramas,1
3,Albania,International Movies,1
4,Algeria,Classic Movies,1
...,...,...,...
1403,West Germany,Thrillers,1
1404,Zimbabwe,Comedies,1
1405,Zimbabwe,Documentaries,2
1406,Zimbabwe,International Movies,3


In [None]:
# Visualization
# Two-level treemap: countries on the outer level, genres nested inside, sized by number of shows
treemap_levels = ['country', 'genre']
fig = px.treemap(
    genre_country_counts,
    path=treemap_levels,
    values='num_shows',
    title='Dominant Genre per Country',
)
fig.show()

Observations:
- "International Movies" appears among the top genres for most countries outside the USA, which makes sense given the definition of "International Movies".
- We also notice country-specific genres such as "British TV shows" for the UK, "Anime" for Japan and "Korean TV Shows" for South Korea.
- "Dramas" and "Comedies" appear as common genres in most countries.

## 4. Catalogue Seasonality

In [33]:
# Build a dataframe with the number of releases for each date on which something was added,
# ordered chronologically
releases_over_time = (
    netflix['date_added']
    .value_counts()
    .reset_index()
    .set_axis(['date_added', 'num_releases'], axis=1)
    .sort_values(by='date_added')
)

In [35]:
# Line graph of the number of releases per date, with a range slider under the x axis
daily_releases_trace = go.Scatter(
    x=releases_over_time['date_added'],
    y=releases_over_time['num_releases'],
)

fig = go.Figure(data=daily_releases_trace)
fig.update_layout(
    title=dict(text="Netflix releases over time", x=0.5),  # x=0.5 centers the title
    xaxis=dict(
        title=dict(text='Date when show was added to the platform'),
        rangeslider=dict(visible=True),
    ),
    yaxis=dict(title=dict(text='Number of releases')),
)

fig.show(renderer="notebook_connected")

Observations:
- It looks like most releases happen on the first of each month. 
- We have data starting from 2008 and we can see that the number of shows added to the platform started to grow significantly from 2016 onwards, which corresponds to the time when Netflix expanded to 130 new countries to reach a total of 190 countries. (source: https://about.netflix.com/en). According to the history of Netflix, they started their streaming service in 2007.
- We will look further into seasonality by looking at number of releases per year, month, day of month and day of week.

In [36]:
# Data processing
# Derive calendar components from date_added for the seasonality breakdowns below
added_on = releases_over_time['date_added'].dt
releases_over_time['year'] = added_on.year.astype('Int64') # stored as pandas' nullable integer dtype
releases_over_time['month'] = added_on.month_name() # full month name, e.g. "January"
releases_over_time['day'] = added_on.day.astype('Int64') # day of month, nullable integer dtype
releases_over_time['day_of_week'] = added_on.day_name() # full weekday name, e.g. "Friday"
releases_over_time.head()

Unnamed: 0,date_added,num_releases,year,month,day,day_of_week
1420,2008-01-01,1,2008,January,1,Tuesday
1381,2008-02-04,1,2008,February,4,Monday
1331,2009-05-05,1,2009,May,5,Tuesday
1422,2009-11-18,1,2009,November,18,Wednesday
1330,2010-11-01,1,2010,November,1,Monday


### 4.1. Seasonality per year

In [45]:
# Seasonality per year
# Sum the per-date release counts within each year; reset_index() already yields the
# ['year', 'num_releases'] columns, so no column renaming is needed
releases_per_year = releases_over_time.groupby('year')['num_releases'].sum().reset_index()

# text='year' labels each point of the line with its year
fig = px.line(releases_per_year, x='year', y='num_releases', text='year', title='Total number of releases per year')
fig.update_layout(xaxis_title='Release Year', yaxis_title='Number of releases', autosize=False)
fig.update_traces(textposition="bottom right")
fig.show()

### 4.2. Seasonality per month

In [49]:
# Data processing
# Total number of releases for each (year, month) pair present in releases_over_time
releases_per_month_year = releases_over_time.groupby(['year', 'month'])['num_releases'].sum().reset_index()
# Average those monthly totals across years to get one value per calendar month
releases_per_month = releases_per_month_year.groupby('month')['num_releases'].mean().reset_index()
# Round to the nearest whole number before casting: a bare astype('int') truncates the
# fractional part and would systematically understate the averages
releases_per_month['num_releases'] = releases_per_month['num_releases'].round().astype('int')

# Visualization
# category_orders puts the bars in calendar order (January..December) instead of alphabetical order
fig = px.bar(releases_per_month, x='month', y='num_releases', 
       category_orders={'month': calendar.month_name[1:]}, title='Average number of releases per month')
fig.update_layout(xaxis_title='Release Month', yaxis_title='Avg. Number of releases', autosize=False)
fig.show()

### 4.3. Seasonality per day of month

In [48]:
# Average number of releases for each day of the month, computed over the rows of
# releases_over_time (one row per date on which at least one title was added)
releases_per_day = releases_over_time.groupby(['day'])['num_releases'].mean().reset_index()
# Round to the nearest whole number before casting: a bare astype('int') truncates the
# fractional part and would systematically understate the averages
releases_per_day['num_releases'] = releases_per_day['num_releases'].round().astype('int')
fig = px.bar(releases_per_day, x='day', y='num_releases', title='Average number of releases per day of month')
fig.update_layout(xaxis_title='Release Day of Month', yaxis_title='Avg. number of releases', autosize=False)
fig.show()

### 4.4. Seasonality per day of week

In [51]:
# Average number of releases for each day of the week, computed over the rows of
# releases_over_time (one row per date on which at least one title was added)
releases_per_day_of_week = releases_over_time.groupby(['day_of_week'])['num_releases'].mean().reset_index()
# Round to the nearest whole number before casting: a bare astype('int') truncates the
# fractional part and would systematically understate the averages
releases_per_day_of_week['num_releases'] = releases_per_day_of_week['num_releases'].round().astype('int')
# category_orders puts the bars in Monday..Sunday order instead of alphabetical order
fig = px.bar(releases_per_day_of_week, x='day_of_week', y='num_releases', title='Average number of releases per day of week',
       category_orders= {'day_of_week': calendar.day_name[0:]})
fig.update_layout(xaxis_title='Release Day of Week', yaxis_title='Avg. Number of releases', autosize=False)
fig.show()

From the graphs above, we can say:
- The number of releases increased over the years until 2020, when it started to decline, mostly due to the COVID-19 crisis. Note that the dataset stops in September 2021, so the 2021 total covers only part of the year.
- There is no clear pattern in the number of releases per month. We can just say that the month with the most releases on average is July, and the one with the fewest releases on average is February.
- Most releases happen on the first of the month. Then, there are two other smaller peaks on the 15th of the month and at the end of the month. By releasing content on the same day each month, Netflix makes it easier for subscribers to remember when new content will be available on the platform. It is also probably part of a scheduling strategy in which Netflix decides what new content should be published each month.
- Most releases are done on Fridays. This could be a strategy from Netflix to get people to watch new content during the weekend.