In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

import plotly.express as px
import cufflinks as cf 

import warnings
warnings.filterwarnings("ignore")

### Load data

In [7]:
!pip install charset-normalizer



In [11]:
from charset_normalizer import from_path # to automatically check the type of encoding

result = from_path("../data/netflix_titles.csv")
print(result.best().encoding)

utf_8


In [12]:
df_og = pd.read_csv("../data/netflix_titles.csv", encoding= result.best().encoding)
df_og.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


In [13]:
df_og.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [14]:
df = df_og.copy()
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


### Preliminary Exploratory Data Analysis

In [15]:
df.shape

(8807, 12)

In [16]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [18]:
df.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [19]:
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [20]:
df.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

### Handling missing data

In [21]:
df['director'] = df['director'].fillna('unknown')
df['cast'] = df['cast'].fillna('unknown')
df['country'] = df['country'].fillna('unknown')
df['date_added'] = df['date_added'].fillna(df['date_added'].mode()[0])

In [22]:
df['date_added_format'] = pd.to_datetime(df['date_added'], errors= 'coerce')

In [23]:
movie_rating = df[df['type'] == 'Movie']['rating'].mode()[0]
movie_rating

'TV-MA'

In [24]:
tv_rating = df[df['type'] == 'TV Show']['rating'].mode()[0]
tv_rating

'TV-MA'

In [25]:
df.loc[
    (df['type'] == 'Movie') & (df['rating'].isna()), 
    'rating'] = movie_rating

In [26]:
df.loc[
    (df['type'] == 'TV Show') & (df['rating'].isna()),
    'rating'
] = tv_rating

In [27]:
df['time'] = df['duration'].str.extract('(\d+)').astype(float) # (\d+) means “extract one or more consecutive digits”
df['time']

0        90.0
1         2.0
2         1.0
3         1.0
4         2.0
        ...  
8802    158.0
8803      2.0
8804     88.0
8805     88.0
8806    111.0
Name: time, Length: 8807, dtype: float64

In [28]:
df['Movie Time'] = df.loc[df['type'] == 'Movie', 'time']
df['TV time'] = df.loc[df['type'] == 'TV Show', 'time'] 

In [29]:
df.rename(columns={'Movie Time' : 'Movie Duration',
                  'TV time' : 'TV seasons'}, inplace= True)

In [30]:
df.drop(columns= 'time', inplace= True)

In [31]:
df.isna().sum()

show_id                 0
type                    0
title                   0
director                0
cast                    0
country                 0
date_added              0
release_year            0
rating                  0
duration                3
listed_in               0
description             0
date_added_format      88
Movie Duration       2679
TV seasons           6131
dtype: int64

In [32]:
df[['duration', 'Movie Duration', 'TV seasons']].sample(10)

Unnamed: 0,duration,Movie Duration,TV seasons
1692,2 Seasons,,2.0
6769,83 min,83.0,
7018,116 min,116.0,
7904,110 min,110.0,
2708,2 Seasons,,2.0
2281,129 min,129.0,
8653,4 Seasons,,4.0
5886,85 min,85.0,
3625,102 min,102.0,
5752,1 Season,,1.0


In [33]:
# since all the data in the TV seasons and Movie Duration match with duration, we can safely drop it
df.drop(columns='duration', inplace= True)

In [34]:
df['date_added'] = df['date_added_format']

In [35]:
df.drop(columns= 'date_added_format', inplace= True)

In [36]:
df.isna().sum()

show_id              0
type                 0
title                0
director             0
cast                 0
country              0
date_added          88
release_year         0
rating               0
listed_in            0
description          0
Movie Duration    2679
TV seasons        6131
dtype: int64

In [37]:
df.rename(columns={
    'Movie Duration' : 'movie_duration_min',
    'TV seasons' : 'tv_seasons'
}, inplace= True)

In [38]:
df.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,listed_in,description,movie_duration_min,tv_seasons
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,unknown,United States,2021-09-25,2020,PG-13,Documentaries,"As her father nears the end of his life, filmm...",90.0,
1,s2,TV Show,Blood & Water,unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",,2.0
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",unknown,2021-09-24,2021,TV-MA,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,,1.0


In [39]:
df.to_csv('../data/netflix_titles_cleaned.csv', index= False)

In [45]:
df['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR',
       'TV-Y7-FV', 'UR'], dtype=object)

In [46]:
df['rating'].str.contains('min')

0       False
1       False
2       False
3       False
4       False
        ...  
8802    False
8803    False
8804    False
8805    False
8806    False
Name: rating, Length: 8807, dtype: bool

In [47]:
mask = df['rating'].isin(['74 min', '84 min', '66 min'])
df[mask]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,listed_in,description,movie_duration_min,tv_seasons
5541,s5542,Movie,Louis C.K. 2017,Louis C.K.,Louis C.K.,United States,2017-04-04,2017,74 min,Movies,"Louis C.K. muses on religion, eternal love, gi...",,
5794,s5795,Movie,Louis C.K.: Hilarious,Louis C.K.,Louis C.K.,United States,2016-09-16,2010,84 min,Movies,Emmy-winning comedy writer Louis C.K. brings h...,,
5813,s5814,Movie,Louis C.K.: Live at the Comedy Store,Louis C.K.,Louis C.K.,United States,2016-08-15,2015,66 min,Movies,The comic puts his trademark hilarious/thought...,,


In [48]:
df.loc[mask, 'duration'] = df.loc[mask, 'rating']

In [49]:
df.loc[mask, 'rating'] = np.nan

In [51]:
df.loc[mask, ['rating', 'duration']]

Unnamed: 0,rating,duration
5541,,74 min
5794,,84 min
5813,,66 min


In [65]:
df.shape

(8807, 14)

### extracting required data

In [73]:
# create new dataframe for all types of genres

genre_df = df.assign(
    genre = df['listed_in'].str.split(', ')
).explode('genre')

genre_df['genre'] = genre_df['genre'].str.strip()

In [72]:
genre_df.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,listed_in,description,movie_duration_min,tv_seasons,duration,genre
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,unknown,United States,2021-09-25,2020,PG-13,Documentaries,"As her father nears the end of his life, filmm...",90.0,,,Documentaries
1,s2,TV Show,Blood & Water,unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",,2.0,,International TV Shows
1,s2,TV Show,Blood & Water,unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",,2.0,,TV Dramas
1,s2,TV Show,Blood & Water,unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",,2.0,,TV Mysteries
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",unknown,2021-09-24,2021,TV-MA,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,,1.0,,Crime TV Shows


### Analysis based on these questions