# Hackathon Notebook

In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

##### Loading the dataframe and exploring it

In [2]:
df = pd.read_csv('data/netflix_titles.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
# Checking the shape of the dataframe, and the types of data each column holds
print(df.info(),'\n\n', df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
None 

 (8807, 12)


In [4]:
# Checking the amount of null values. Some columns if not that many can either be dropped or filled.
# If we don't plan on checking the popularity of some directors or performers, then we can drop the columns containing that data
# While not containing null values, the description column could be dropped. We'd only need it if we'd like to create some
# sort of a model for a recommendation system. - Can be further recommendations
df.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

##### Cleaning and Preprocessing

In [5]:
# Checking for duplicates
df.duplicated().sum()

0

In [7]:
# Converting the date_added to datetime format, and also creating new columns for each year and month the content was added in.
# The month column could be both as numeric or the name of the month.
# However, we first have to drop the rows containing null values. We do not know when Netflix added certain content to their platform.
# And dropping the missing values would not affect our analysis, as there are only 10 such rows.
df = df[df['date_added'].notnull()]

# converting the type to datetime and creating the new columns
df['date_added'] = pd.to_datetime(df['date_added'], format='%B %d, %Y', errors='coerce')
df['month_name'] = df['date_added'].dt.month_name(locale='en_US.utf8')
# need to specify int64, otherwise it will be either converted to float64, or int32.
# it depends on the version of numpy and pandas that's used on each machine.
df['month'] = df['date_added'].dt.month.astype(np.int64)
df['year'] = df['date_added'].dt.year.astype(np.int64)

df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,month_name,month,year
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",September,9,2021
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",September,9,2021
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,September,9,2021
3,s4,TV Show,Jailbirds New Orleans,,,,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",September,9,2021
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,September,9,2021


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8709 entries, 0 to 8806
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8709 non-null   object        
 1   type          8709 non-null   object        
 2   title         8709 non-null   object        
 3   director      6168 non-null   object        
 4   cast          7892 non-null   object        
 5   country       7882 non-null   object        
 6   date_added    8709 non-null   datetime64[ns]
 7   release_year  8709 non-null   int64         
 8   rating        8705 non-null   object        
 9   duration      8706 non-null   object        
 10  listed_in     8709 non-null   object        
 11  description   8709 non-null   object        
 12  month_name    8709 non-null   object        
 13  month         8709 non-null   int64         
 14  year          8709 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(11)


In [9]:
# checking the types of ratings that we have
df['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan,
       'TV-Y7-FV', 'UR'], dtype=object)

In [10]:
# we notice that we have a few fields filled with duration instead so we'll move those values to the column they belong to,
# which after a bit of exploration are the rows where director is Louis CK. And we'll fill the type column with Not Rated instead
df[df['director'] == 'Louis C.K.']

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,month_name,month,year
5541,s5542,Movie,Louis C.K. 2017,Louis C.K.,Louis C.K.,United States,2017-04-04,2017,74 min,,Movies,"Louis C.K. muses on religion, eternal love, gi...",April,4,2017
5794,s5795,Movie,Louis C.K.: Hilarious,Louis C.K.,Louis C.K.,United States,2016-09-16,2010,84 min,,Movies,Emmy-winning comedy writer Louis C.K. brings h...,September,9,2016
5813,s5814,Movie,Louis C.K.: Live at the Comedy Store,Louis C.K.,Louis C.K.,United States,2016-08-15,2015,66 min,,Movies,The comic puts his trademark hilarious/thought...,August,8,2016


In [None]:
df.loc[:,'duration'].fillna(df.rating, inplace=True)
df.loc[df['duration'] == df['rating'],'rating'] = 'Not Rated'

In [12]:
# we also have 'UR', 'NR' and NaNs. The first 2 mean the same thing so we'll replace all of those with 'Not Rated'
# and we will fill all the NaNs with 'Not Rated' as well since we don't have the ratings for that.
df.loc[df['rating'] == 'UR','rating'] = 'Not Rated'
df.loc[df['rating'] == 'NR','rating'] = 'Not Rated'

df['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', 'Not Rated', nan, 'TV-Y7-FV'], dtype=object)

In [None]:
# besides filling all ratings with 'Not Rated', we'll fill the other columns with missing values with 'Unknown'
# except for tge country column where we'll fill it with the mode instead - which is the United States
df.loc[:,'rating'].fillna('Not Rated', inplace=True)
df.loc[:,'cast'].fillna('Unknown', inplace=True)
df.loc[:,'director'].fillna('Unknown', inplace=True)
df.loc[:,'country'].fillna(df['country'].mode()[0], inplace=True)

df['rating'].unique()

In [14]:
# Now we can also group the content for its intended/target audience based on the tv rating
# IMDB / Amazon / Motion Pictures all show which rating is intended for which audience
# So we'll create a new column and group the ratings. https://help.imdb.com/article/contribution/titles/certificates/GU757M8ZJ9ZPXB39
df.loc[df['rating'].isin(['PG-13','TV-14','PG']), 'target_audience'] = 'Teens (13+)'
df.loc[df['rating'].isin(['Not Rated','NC-17','R','TV-MA']), 'target_audience'] = 'Adult (17+)'
df.loc[df['rating'].isin(['G','TV-Y7-FV','TV-G','TV-Y7','TV-Y','TV-PG']), 'target_audience'] = 'Kids'

In [15]:
# given the duration column has both number of seasons and the length in minutes, we'll make a new
# column for the number of seasons, while the duration will only have the minutes in numbers
# we could try to use the method above, but it would be more work, so we'll use lambda instead to filter
# each value and assign it to its new column
df['seasons'] = df.apply(lambda x:x['duration'].split(' ')[0] if 'Season' in x['duration'] else '', axis = 1)
df['duration_min'] = df.apply(lambda x:x['duration'].split(' ')[0] if 'min' in x['duration'] else '', axis = 1)

# and we can also add the main genre of the content by selecting the first genre name from the 'listed_in' column
# we could split it for all the genres later if we wish to do so
df['genre'] = df['listed_in'].apply(lambda x:x.lstrip().split(', ')[0])

In [None]:
#checking again our dataframe and we can export it to a new file to have it saved
df.info()
df.to_csv('data/netflix_clean.csv')

### Analysis

##### Analysis of Movies and TV Shows

In [16]:
# saving the main colors we'll be using
color_set_1 = ['#E50914','#221F1F','#F5F5F1','#000000']
color_chart = ['#D3121C','#E06168','#B474DD','#45238F']
color_set_2 = ['#D3121C','#E06168','#FE656F','#FA99A6','#B474DD','#7F31A3','#6F2996','#45238F','#1F2086','#141754']
color_set_3 = ['#0B0F4A','#010105','#EDEDFF']

# setting the width we'll be using
wdt = 600

In [17]:
# types of content in the library
fig_types = px.histogram(df, x='type', color='type', color_discrete_sequence=color_chart, width=wdt, title='Content Library Distribution')
fig_types.update_layout(barcornerradius='30%', width=600, plot_bgcolor=color_set_3[1], paper_bgcolor=color_set_3[1],
                         font={'color':color_set_3[2]}, title={'x':0.5}, legend_title_text='Content Types')

fig_types.show()

In [18]:
fig_rating = px.histogram(df, x='rating', color='target_audience', color_discrete_sequence=color_chart, title='Audience Type')
fig_rating.update_layout(barcornerradius='30%', width=600, plot_bgcolor=color_set_3[1], paper_bgcolor=color_set_3[1],
                         font={'color':color_set_3[2]}, title={'x':0.5}, legend_title_text='Audience Types')
fig_rating.show()


In [19]:
months = df.copy()
months = months.sort_values(['month'])

In [20]:

fig_months = px.histogram(months, x='month_name', color='month_name', color_discrete_sequence=color_chart, title='Content Added by Month')
fig_months.update_layout(barcornerradius='30%', width=600, plot_bgcolor=color_set_3[1], paper_bgcolor=color_set_3[1],
                         font={'color':color_set_3[2]}, title={'x':0.5}, legend_title_text='Months')

fig_months.show()

In [21]:
fig_year = px.histogram(df, x='year', color='year', title='Content Added Per Year', color_discrete_sequence=color_set_2)
fig_year.update_layout(barcornerradius='30%', height=500, width=600, plot_bgcolor=color_set_3[1], paper_bgcolor=color_set_3[1],
                         font={'color':color_set_3[2]}, title={'x':0.5}, legend_title_text='Years', bargap=0.2, legend={'traceorder':'normal'})


In [22]:
shows = df[~df['seasons'].str.fullmatch('')]
shows['seasons'] = shows['seasons'].astype(float)
shows = shows.sort_values(['seasons'])
shows['seasons'] = shows['seasons'].astype(np.int64)

In [23]:
movies = df[~df['duration_min'].str.fullmatch('')]
movies['duration_min'] = movies['duration_min'].astype(float)
movies = movies.sort_values(['duration_min'])
movies['duration_min'] = movies['duration_min'].astype(np.int64)

movies.loc[movies['duration_min'] < 90, 'length'] = 'Less than 1.5 hours'
movies.loc[movies['duration_min'].between(90,150), 'length'] = 'Between 1.5 hours and 2.5 hours'
movies.loc[movies['duration_min'] > 150, 'length'] = 'More than 2.5 hours'

In [24]:
fig_movie_length = px.histogram(movies, x='length', title='Movies Duration', color='length', color_discrete_sequence=color_chart)
fig_movie_length.update_layout(barcornerradius='30%', height=500, width=600, plot_bgcolor=color_set_3[1], paper_bgcolor=color_set_3[1],
                         font={'color':color_set_3[2]}, title={'x':0.5}, legend_title_text='Duration', bargap=0.2, legend={'traceorder':'normal'})
fig_movie_length.update_xaxes(visible=False)
fig_movie_length.show()

In [25]:
fig_show_length = px.histogram(shows, x='seasons', title='TV Shows Seasons', color='seasons', color_discrete_sequence=color_chart)
fig_show_length.update_layout(barcornerradius='30%', height=800, width=600, plot_bgcolor=color_set_3[1], paper_bgcolor=color_set_3[1],
                         font={'color':color_set_3[2]}, title={'x':0.5}, legend_title_text='Seasons', bargap=0.2, legend={'traceorder':'normal'})
fig_show_length.update_xaxes(categoryorder='total descending')
fig_show_length.show()

In [26]:
fig_genre = px.histogram(df, y='genre', color_discrete_sequence=color_chart)

fig_genre.update_layout(barcornerradius='30%', height=800, width=800, plot_bgcolor=color_set_3[1], paper_bgcolor=color_set_3[1],
                         font={'color':color_set_3[2]}, title={'x':0.5}, legend_title_text='Seasons', bargap=0.2, legend={'traceorder':'normal'})
fig_genre.update_yaxes(categoryorder='total ascending')

In [27]:
top_shows = shows[['title', 'seasons']].copy()
top_shows = top_shows[-20:]

In [28]:
fig_top_shows = px.histogram(top_shows, x='title', y='seasons', color_discrete_sequence=color_chart, title='Largest number of Seasons')
fig_top_shows.update_layout(barcornerradius='30%',height=600, width=600, plot_bgcolor=color_set_3[1], paper_bgcolor=color_set_3[1],
                         font={'color':color_set_3[2]}, title={'x':0.5}, legend_title_text='Seasons', bargap=0.2, legend={'traceorder':'normal'})
fig_top_shows.update_xaxes(categoryorder='total descending')

fig_top_shows.show()

In [29]:
top_movies = movies[['title', 'duration_min']].copy()
top_movies = top_movies[-10:]

In [30]:
fig_top_movies = px.histogram(top_movies, x='title', y='duration_min', color_discrete_sequence=color_chart, title='Top 10 Longest Movies')
fig_top_movies.update_layout(barcornerradius='30%',height=600, width=600, plot_bgcolor=color_set_3[1], paper_bgcolor=color_set_3[1],
                         font={'color':color_set_3[2]}, title={'x':0.5}, legend_title_text='Movies', bargap=0.2, legend={'traceorder':'normal'})
fig_top_movies.update_xaxes(categoryorder='total descending')

fig_top_movies.show()

In [31]:
df2 = df.copy()

df2['day_number'] = df2['date_added'].dt.day_of_week.astype(np.int64)
df2['day'] = df2['date_added'].dt.day_name()
df2.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,...,description,month_name,month,year,target_audience,seasons,duration_min,genre,day_number,day
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,PG-13,90 min,...,"As her father nears the end of his life, filmm...",September,9,2021,Teens (13+),,90.0,Documentaries,5,Saturday
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,...,"After crossing paths at a party, a Cape Town t...",September,9,2021,Adult (17+),2.0,,International TV Shows,4,Friday
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,2021-09-24,2021,TV-MA,1 Season,...,To protect his family from a powerful drug lor...,September,9,2021,Adult (17+),1.0,,Crime TV Shows,4,Friday
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,United States,2021-09-24,2021,TV-MA,1 Season,...,"Feuds, flirtations and toilet talk go down amo...",September,9,2021,Adult (17+),1.0,,Docuseries,4,Friday
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,...,In a city of coaching centers known to train I...,September,9,2021,Adult (17+),2.0,,International TV Shows,4,Friday


In [32]:
days = df2.copy()
days = days.sort_values(['day_number'])
fig_days = px.histogram(days, x='day', color='day', color_discrete_sequence=color_chart, title='Content Added by Day of Week')
fig_days.update_layout(barcornerradius='30%', width=600, plot_bgcolor=color_set_3[1], paper_bgcolor=color_set_3[1],
                         font={'color':color_set_3[2]}, title={'x':0.5}, legend_title_text='Days')

fig_days.show()

In [33]:
# top_countries = days[['title', 'country']].copy()
top_countries = days.groupby(['country']).size().to_frame().sort_values([0], ascending = False).head(10).reset_index()

fig_countries = px.histogram(top_countries, x='country', y=0, color_discrete_sequence=color_chart, title='Top 10 Countries', color='country')
fig_countries.update_layout(barcornerradius='30%',height=600, width=600, plot_bgcolor=color_set_3[1], paper_bgcolor=color_set_3[1],
                         font={'color':color_set_3[2]}, title={'x':0.5}, legend_title_text='Countries', bargap=0.2, legend={'traceorder':'normal'})
fig_countries.update_yaxes(title='count')

fig_countries.show()

In [34]:
fig_rel = px.histogram(days, x=['year','type'], color='type', barmode='group', color_discrete_sequence=color_chart, title='Movies and TV Shows Release by Year')
fig_rel.update_layout(barcornerradius='30%',height=600, width=800, plot_bgcolor=color_set_3[1], paper_bgcolor=color_set_3[1],
                         font={'color':color_set_3[2]}, title={'x':0.5}, legend_title_text='Legend', bargap=0.2, legend={'traceorder':'normal'})
fig_rel.update_xaxes(categoryorder='total descending')

fig_rel.show()