In [77]:
import pandas as pd
import numpy as np

In [78]:
# Load the Netflix titles CSV into a DataFrame
df = pd.read_csv("dataset/netflix_titles.csv")

In [79]:
# General information about the dataset: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [80]:
# Missing cast and country values become the placeholder "Unknown"
# (relatively few rows are affected); missing ratings become "Not Rated".
df = df.fillna(value={'cast': 'Unknown', 'country': 'Unknown', 'rating': 'Not Rated'})

In [81]:
# Verify that cast, country and rating no longer contain nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8807 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [82]:
# The remaining columns with gaps (director, date_added, duration) receive the
# same "Unknown" placeholder. This changes little information in the dataset
# while leaving it free of NaN values.
df = df.fillna(value=dict.fromkeys(['date_added', 'director', 'duration'], "Unknown"))

In [83]:
# Verify that director, date_added and duration no longer contain nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8807 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8807 non-null   object
 9   duration      8807 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [84]:
# Descriptions are not used in this analysis, so drop the column.
# `columns=` selects the axis by itself, so no `axis` argument is needed.
df = df.drop(columns='description')

In [85]:
# Confirm the description column is gone
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8807 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8807 non-null   object
 9   duration      8807 non-null   object
 10  listed_in     8807 non-null   object
dtypes: int64(1), object(10)
memory usage: 757.0+ KB


In [86]:
# Count how many titles fall under each value of the type column
df['type'].value_counts()

type
Movie      6131
TV Show    2676
Name: count, dtype: int64

In [87]:
# Most frequent values of the country column
df['country'].value_counts().head()

country
United States     2818
India              972
Unknown            831
United Kingdom     419
Japan              245
Name: count, dtype: int64

In [88]:
# Most frequent values of the duration column
df['duration'].value_counts().head()

duration
1 Season     1793
2 Seasons     425
3 Seasons     199
90 min        152
94 min        146
Name: count, dtype: int64

### Working with `rating` column

In [89]:
# List every distinct value in the rating column
df['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR',
       'Not Rated', 'TV-Y7-FV', 'UR'], dtype=object)

In [90]:
# Frequency of each rating value
df['rating'].value_counts()

rating
TV-MA        3207
TV-14        2160
TV-PG         863
R             799
PG-13         490
TV-Y7         334
TV-Y          307
PG            287
TV-G          220
NR             80
G              41
TV-Y7-FV        6
Not Rated       4
NC-17           3
UR              3
66 min          1
74 min          1
84 min          1
Name: count, dtype: int64

In [91]:
# '74 min', '84 min' and '66 min' are runtimes, not ratings.
# Before relabelling them as "Not Rated", move each runtime into `duration`
# when that row has no real duration (NaN or the "Unknown" placeholder),
# so the value is recovered instead of being thrown away.
duration_ratings = ['66 min', '74 min', '84 min']
misplaced = df['rating'].isin(duration_ratings)
no_duration = df['duration'].isna() | df['duration'].eq('Unknown')
recoverable = misplaced & no_duration
df.loc[recoverable, 'duration'] = df.loc[recoverable, 'rating']
df.loc[misplaced, 'rating'] = 'Not Rated'

In [92]:
# Re-count ratings after relabelling the duration-like values
df['rating'].value_counts()

rating
TV-MA        3207
TV-14        2160
TV-PG         863
R             799
PG-13         490
TV-Y7         334
TV-Y          307
PG            287
TV-G          220
NR             80
G              41
Not Rated       7
TV-Y7-FV        6
NC-17           3
UR              3
Name: count, dtype: int64

In [93]:
# "UR" (Unrated), "NR" (Not Rated) and "Not Rated" all mean essentially the
# same thing, so collapse the two abbreviations into "Not Rated".
not_rated = ['UR', 'NR']
is_unrated_alias = df['rating'].isin(not_rated)
df['rating'] = df['rating'].mask(is_unrated_alias, 'Not Rated')

In [94]:
# Re-count ratings after merging UR and NR into "Not Rated"
df['rating'].value_counts()

rating
TV-MA        3207
TV-14        2160
TV-PG         863
R             799
PG-13         490
TV-Y7         334
TV-Y          307
PG            287
TV-G          220
Not Rated      90
G              41
TV-Y7-FV        6
NC-17           3
Name: count, dtype: int64

### Working with `country` column

In [96]:
# Frequency of every country value, including multi-country strings
df['country'].value_counts()

country
United States                                    2818
India                                             972
Unknown                                           831
United Kingdom                                    419
Japan                                             245
                                                 ... 
Russia, Spain                                       1
Croatia, Slovenia, Serbia, Montenegro               1
Japan, Canada                                       1
United States, France, South Korea, Indonesia       1
Canada, Mexico, Germany, South Africa               1
Name: count, Length: 749, dtype: int64

Combined country values (like "Russia, Spain" or "Croatia, Slovenia, Serbia, Montenegro") are problematic for analysis. For this reason, I will keep only the first listed country as the primary country.

In [100]:
def take_first_country(source: str) -> str:
    """Return the primary (first-listed) country of a comma-separated value.

    Each entry is trimmed of surrounding whitespace and empty entries are
    skipped, so a value with a stray leading comma such as
    ", France, Algeria" yields "France" rather than an empty string.

    Args:
        source: Country names separated by commas, e.g. "Russia, Spain".

    Returns:
        The first non-empty country name, or "" if `source` contains none.
    """
    for country in source.split(","):
        country = country.strip()
        if country:
            return country
    return ""

In [102]:
# Quick check of the helper on a multi-country value
take_first_country("Croatia, Slovenia, Serbia, Montenegro")

'Croatia'

In [103]:
# Reduce every country value to its first listed country
df['country'] = df['country'].map(take_first_country)

In [104]:
# Country frequencies after keeping only the first listed country
df['country'].value_counts()

country
United States     3211
India             1008
Unknown            831
United Kingdom     628
Canada             271
                  ... 
West Germany         1
Slovenia             1
Guatemala            1
Jamaica              1
Somalia              1
Name: count, Length: 87, dtype: int64

In [10]:
# Number of distinct show ids (a missing id would count as one value)
df['show_id'].nunique(dropna=False)

8807

In [11]:
# Types of titles offered by Netflix (distinct values of the type column)
df['type'].unique()

array(['Movie', 'TV Show'], dtype=object)

In [12]:
# Country where the movie/show was produced
df['country']

0       United States
1        South Africa
2                 NaN
3                 NaN
4               India
            ...      
8802    United States
8803              NaN
8804    United States
8805    United States
8806            India
Name: country, Length: 8807, dtype: object

In [13]:
# Peek at the free-text description column.
# NOTE(review): an earlier cell in file order already drops 'description',
# so this lookup raises KeyError on Restart & Run All — move it above that
# drop or remove it.
df['description']

0       As her father nears the end of his life, filmm...
1       After crossing paths at a party, a Cape Town t...
2       To protect his family from a powerful drug lor...
3       Feuds, flirtations and toilet talk go down amo...
4       In a city of coaching centers known to train I...
                              ...                        
8802    A political cartoonist, a crime reporter and a...
8803    While living alone in a spooky town, a young g...
8804    Looking to survive in a world taken over by zo...
8805    Dragged from civilian life, a former superhero...
8806    A scrappy but poor boy worms his way into a ty...
Name: description, Length: 8807, dtype: object

In [14]:
# Descriptions are not used in this analysis, so drop the column.
# errors='ignore' keeps this cell re-runnable when the column is already gone
# (an earlier cell in this notebook drops it as well).
df = df.drop(columns='description', errors='ignore')

In [15]:
# Distinct genre strings in listed_in (each value is a comma-separated list)
df['listed_in'].unique()

array(['Documentaries', 'International TV Shows, TV Dramas, TV Mysteries',
       'Crime TV Shows, International TV Shows, TV Action & Adventure',
       'Docuseries, Reality TV',
       'International TV Shows, Romantic TV Shows, TV Comedies',
       'TV Dramas, TV Horror, TV Mysteries', 'Children & Family Movies',
       'Dramas, Independent Movies, International Movies',
       'British TV Shows, Reality TV', 'Comedies, Dramas',
       'Crime TV Shows, Docuseries, International TV Shows',
       'Dramas, International Movies',
       'Children & Family Movies, Comedies',
       'British TV Shows, Crime TV Shows, Docuseries',
       'TV Comedies, TV Dramas', 'Documentaries, International Movies',
       'Crime TV Shows, Spanish-Language TV Shows, TV Dramas',
       'Thrillers',
       'International TV Shows, Spanish-Language TV Shows, TV Action & Adventure',
       'International TV Shows, TV Action & Adventure, TV Dramas',
       'Comedies, International Movies',
       'Comedies, 

In [31]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"TV Dramas, TV Mysteries"
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, TV Action & Adventure"
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV"
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"Romantic TV Shows, TV Comedies"


In [16]:
# Why remove the "International TV Shows" / "International Movies" labels from "listed_in"?
# "International" doesn't add meaningful information 
# since Netflix is a global platform by default.

# The label doesn't describe the actual genre/content type 
# (comedy, drama, thriller, etc.)

# A TV show being "International" doesn't tell you what kind of show it actually is

# Some entries have it, some don't, creating inconsistency. 

In [37]:
def remove_label(source: str) -> str:
    """Remove the "International" labels from a comma-separated genre string.

    The value is split on commas, each genre is trimmed, and the genres
    "International TV Shows" and "International Movies" (plus any empty
    entries) are discarded. The remaining genres are re-joined with ", ",
    so no stray separators are left behind regardless of where the label
    appeared or how the original value was spaced.

    Args:
        source: Genres separated by commas, e.g.
            "International TV Shows, TV Dramas, TV Mysteries".

    Returns:
        The genre string without the "International" labels; "" when no
        other genre remains.
    """
    dropped_labels = {"International TV Shows", "International Movies"}
    genres = (genre.strip() for genre in source.split(","))
    return ", ".join(
        genre for genre in genres if genre and genre not in dropped_labels
    )

In [38]:
# Strip the "International" labels from every genre string
df['listed_in'] = df['listed_in'].map(remove_label)

In [39]:
# Inspect listed_in after removing the "International" labels
df['listed_in']

0                                Documentaries
1                      TV Dramas, TV Mysteries
2        Crime TV Shows, TV Action & Adventure
3                       Docuseries, Reality TV
4               Romantic TV Shows, TV Comedies
                         ...                  
8802            Cult Movies, Dramas, Thrillers
8803    Kids' TV, Korean TV Shows, TV Comedies
8804                   Comedies, Horror Movies
8805        Children & Family Movies, Comedies
8806                  Dramas, Music & Musicals
Name: listed_in, Length: 8807, dtype: object

In [40]:
# At this point it makes sense to split the dataset into two distinct tables:
# the first holds TV shows, the second holds movies.

In [41]:
# Split the catalogue by title type. `.copy()` makes each subset an
# independent DataFrame, so later edits to `tv_shows` / `movies` neither
# touch `df` nor trigger pandas' SettingWithCopyWarning.
tv_shows = df[df['type'] == 'TV Show'].copy()
movies = df[df['type'] == 'Movie'].copy()

In [42]:
# Column overview of the TV-show subset
tv_shows.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2676 entries, 1 to 8803
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       2676 non-null   object
 1   type          2676 non-null   object
 2   title         2676 non-null   object
 3   director      230 non-null    object
 4   cast          2326 non-null   object
 5   country       2285 non-null   object
 6   date_added    2666 non-null   object
 7   release_year  2676 non-null   int64 
 8   rating        2674 non-null   object
 9   duration      2676 non-null   object
 10  listed_in     2676 non-null   object
dtypes: int64(1), object(10)
memory usage: 250.9+ KB


In [43]:
# Only 230 of the 2676 TV-show rows have a non-null director,
# which makes the `director` column nearly useless for most analyses:
# with this much missing data, any statistics on it would be unreliable.
# Dropping the column therefore simplifies the dataset
# without losing valuable information.

In [44]:
# Drop the sparsely populated director column from the TV-show subset
# (`columns=` selects the axis by itself, so no `axis` argument is needed).
tv_shows = tv_shows.drop(columns='director')

In [45]:
# Confirm director was removed from the TV-show subset
tv_shows.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2676 entries, 1 to 8803
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       2676 non-null   object
 1   type          2676 non-null   object
 2   title         2676 non-null   object
 3   cast          2326 non-null   object
 4   country       2285 non-null   object
 5   date_added    2666 non-null   object
 6   release_year  2676 non-null   int64 
 7   rating        2674 non-null   object
 8   duration      2676 non-null   object
 9   listed_in     2676 non-null   object
dtypes: int64(1), object(9)
memory usage: 230.0+ KB


In [30]:
# Column overview of the movie subset
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6131 entries, 0 to 8806
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       6131 non-null   object
 1   type          6131 non-null   object
 2   title         6131 non-null   object
 3   director      5943 non-null   object
 4   cast          5656 non-null   object
 5   country       5691 non-null   object
 6   date_added    6131 non-null   object
 7   release_year  6131 non-null   int64 
 8   rating        6129 non-null   object
 9   duration      6128 non-null   object
 10  listed_in     6131 non-null   object
dtypes: int64(1), object(10)
memory usage: 574.8+ KB
