In [65]:
import pandas as pd
import numpy as np

In [74]:
df = pd.read_csv('netflix_titles.csv')
df.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s366,Movie,Eyes of a Thief,Najwa Najjar,"Khaled Abol El Naga, Souad Massi, Suhail Hadda...",", France, Algeria",30-Jul-21,2014,TV-14,103 min,"Dramas, Independent Movies, International Movies","After a decade in prison, a Palestinian man wi..."
1,s194,TV Show,D.P.,,"Jung Hae-in, Koo Kyo-hwan, Kim Sung-kyun, Son ...",", South Korea",27-Aug-21,2021,TV-MA,1 Season,"International TV Shows, TV Dramas",A young private’s assignment to capture army d...
2,s276,TV Show,The Kingdom,,"Chino Darín, Nancy Dupláa, Joaquín Furriel, Pe...",Argentina,13-Aug-21,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, Spanis...","After his running mate's murder, a controversi..."


### Identifying and dealing with missing data
- In this case, I will:
  - Identify which columns have missing values
  - Replace blank countries with the mode (most common) country
  - I want to keep director as it could be interesting to look at a certain director's films
  - I want to keep cast as it could be interesting to look at a certain cast's films

In [75]:
# Identyfing which columns have missing data

for i in df.columns:
    null_rate = df[i].isna().sum() / len(df) * 100 
    if null_rate > 0 :
        print("{} null rate: {}%".format(i,round(null_rate,2)))

director null rate: 29.91%
cast null rate: 9.37%
country null rate: 9.44%
date_added null rate: 0.11%
rating null rate: 0.05%
duration null rate: 0.03%


In [77]:
#Replacements


df['country'] = df['country'].fillna(df['country'].mode()[0])

df['cast'].replace(np.nan, 'No Data',inplace  = True)
df['director'].replace(np.nan, 'No Data',inplace  = True)

#Drop columns with missing values
df.dropna(inplace=True)

#Drop Duplicates
df.drop_duplicates(inplace = True)


In [78]:
df.isnull().sum()


show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [None]:
df.info()

In [30]:
df.loc[0,'country']='France'
df.loc[1,'country']='South Korea'

### Dealing with dates
- I will convert the dates data type to date time  

In [38]:
#Converted the date data types to date time and created additional columns for month and year for additional analysis

df["date_added"] = pd.to_datetime(df['date_added'])
df['month_added']=df['date_added'].dt.month
df['month_name_added']=df['date_added'].dt.month_name()
df['year_added'] = df['date_added'].dt.year

df.head(1)


Unnamed: 0,show_id,type,title,director,cast,date_added,release_year,rating,duration,listed_in,description,month_added,month_name_added,year_added,target_ages,first_country
0,s366,Movie,Eyes of a Thief,Najwa Najjar,"Khaled Abol El Naga, Souad Massi, Suhail Hadda...",2021-07-30,2014,TV-14,103 min,"Dramas, Independent Movies, International Movies","After a decade in prison, a Palestinian man wi...",7.0,July,2021.0,Teens,France


In [62]:
df.set_index('show_id', inplace = True)

In [33]:
#Created categories for target age groups 

ratings_ages = {
    'TV-PG': 'Older Kids',
    'TV-MA': 'Adults',
    'TV-Y7-FV': 'Older Kids',
    'TV-Y7': 'Older Kids',
    'TV-14': 'Teens',
    'R': 'Adults',
    'TV-Y': 'Kids',
    'NR': 'Adults',
    'PG-13': 'Teens',
    'TV-G': 'Kids',
    'PG': 'Older Kids',
    'G': 'Kids',
    'UR': 'Adults',
    'NC-17': 'Adults'
}

df['target_ages'] = df['rating'].replace(ratings_ages)

df.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,month_added,month_name_added,year_added,target_ages
0,s366,Movie,Eyes of a Thief,Najwa Najjar,"Khaled Abol El Naga, Souad Massi, Suhail Hadda...",France,2021-07-30,2014,TV-14,103 min,"Dramas, Independent Movies, International Movies","After a decade in prison, a Palestinian man wi...",7.0,July,2021.0,Teens


In [34]:
# Split the 'country' column using the ',' delimiter and create a new DataFrame
split_df = df['country'].str.split(',', expand=True)

# Assign the first element to a new column 'first_country'
df['first_country'] = split_df[0]

# Drop the original 'country' column if needed
df.drop(columns=['country'], inplace=True)

df.head(1)

Unnamed: 0,show_id,type,title,director,cast,date_added,release_year,rating,duration,listed_in,description,month_added,month_name_added,year_added,target_ages,first_country
0,s366,Movie,Eyes of a Thief,Najwa Najjar,"Khaled Abol El Naga, Souad Massi, Suhail Hadda...",2021-07-30,2014,TV-14,103 min,"Dramas, Independent Movies, International Movies","After a decade in prison, a Palestinian man wi...",7.0,July,2021.0,Teens,France


In [67]:
# Split the 'country' column using the ',' delimiter and create a new DataFrame
#split_df = df['listed_in'].str.split(',', expand=True)

# Assign the first element to a new column 'first_country'
#df['category1,category2'] = split_df[0]

# Drop the original 'country' column if needed
#df.drop(columns=['country'], inplace=True)

df.head(1)

Unnamed: 0_level_0,type,title,director,cast,date_added,release_year,rating,duration,listed_in,description,month_added,month_name_added,year_added,target_ages,first_country,movie_duration,tv_show_seasons
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
s366,Movie,Eyes of a Thief,Najwa Najjar,"Khaled Abol El Naga, Souad Massi, Suhail Hadda...",2021-07-30,2014,TV-14,103 min,"Dramas, Independent Movies, International Movies","After a decade in prison, a Palestinian man wi...",7.0,July,2021.0,Teens,France,103.0,


In [80]:
# Function to extract duration in minutes from the 'duration'
def extract_duration(row):
    if pd.notna(row['duration']) and 'min' in row['duration']:
        duration = row['duration'].split(' min')[0]
        return int(duration)
    return None

# Apply the function to create the 'movie_duration' column
df['movie_duration'] = df.apply(extract_duration, axis=1)

df.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,movie_duration
0,s366,Movie,Eyes of a Thief,Najwa Najjar,"Khaled Abol El Naga, Souad Massi, Suhail Hadda...",", France, Algeria",30-Jul-21,2014,TV-14,103 min,"Dramas, Independent Movies, International Movies","After a decade in prison, a Palestinian man wi...",103.0


In [63]:
# Function to extract duration in minutes from the 'tv_show_seasons'
def extract_duration(row):
    if pd.notna(row['duration']) and 'Season' in row['duration']:
        duration = row['duration'].split(' Season')[0]
        return int(duration)
    return None

# Apply the function to create the 'movie_duration' column
df['tv_show_seasons'] = df.apply(extract_duration, axis=1)

df.head(1)

Unnamed: 0_level_0,type,title,director,cast,date_added,release_year,rating,duration,listed_in,description,month_added,month_name_added,year_added,target_ages,first_country,movie_duration,tv_show_seasons
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
s366,Movie,Eyes of a Thief,Najwa Najjar,"Khaled Abol El Naga, Souad Massi, Suhail Hadda...",2021-07-30,2014,TV-14,103 min,"Dramas, Independent Movies, International Movies","After a decade in prison, a Palestinian man wi...",7.0,July,2021.0,Teens,France,103.0,
