# Data Preprocessing

In [2]:
import pandas as pd # Importing the required module

url = 'https://github.com/silviaazahro/Netflix-/raw/main/Netflix%20Movies.csv' # URL of the data
df = pd.read_csv(url)

print(df.shape) # Print the shape of data
df.head(5) # See the first five rows of data

(9957, 9)


Unnamed: 0,title,year,certificate,duration,genre,rating,description,stars,votes
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,Decades after their 1984 All Valley Karate Tou...,"['Ralph Macchio, ', 'William Zabka, ', 'Courtn...",177031
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"['Claire Foy, ', 'Olivia Colman, ', 'Imelda St...",199885
2,Better Call Saul,(2015–2022),TV-MA,46 min,"Crime, Drama",8.9,The trials and tribulations of criminal lawyer...,"['Bob Odenkirk, ', 'Rhea Seehorn, ', 'Jonathan...",501384
3,Devil in Ohio,(2022),TV-MA,356 min,"Drama, Horror, Mystery",5.9,When a psychiatrist shelters a mysterious cult...,"['Emily Deschanel, ', 'Sam Jaeger, ', 'Gerardo...",9773
4,Cyberpunk: Edgerunners,(2022– ),TV-MA,24 min,"Animation, Action, Adventure",8.6,A Street Kid trying to survive in a technology...,"['Zach Aguilar, ', 'Kenichiro Ohashi, ', 'Emi ...",15413


In [3]:
df.info() # The information of data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9957 entries, 0 to 9956
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        9957 non-null   object 
 1   year         9430 non-null   object 
 2   certificate  6504 non-null   object 
 3   duration     7921 non-null   object 
 4   genre        9884 non-null   object 
 5   rating       8784 non-null   float64
 6   description  9957 non-null   object 
 7   stars        9957 non-null   object 
 8   votes        8784 non-null   object 
dtypes: float64(1), object(8)
memory usage: 700.2+ KB


In [7]:
# Only takes the required columns
data = df[['title', 'year', 'certificate', 'duration', 'genre', 'rating', 'votes']]
data.head(5) # See the first five rows of data

Unnamed: 0,title,year,certificate,duration,genre,rating,votes
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,177031
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,199885
2,Better Call Saul,(2015–2022),TV-MA,46 min,"Crime, Drama",8.9,501384
3,Devil in Ohio,(2022),TV-MA,356 min,"Drama, Horror, Mystery",5.9,9773
4,Cyberpunk: Edgerunners,(2022– ),TV-MA,24 min,"Animation, Action, Adventure",8.6,15413


In [8]:
data.isnull().sum() # Missing values checking

Unnamed: 0,0
title,0
year,527
certificate,3453
duration,2036
genre,73
rating,1173
votes,1173


In [9]:
# Fill the missing value in the 'year' with 0
data.loc[:,'year'] = data['year'].fillna(0)

# Fill the missing value in the 'certificate' with 'Unknown'
data.loc[:,'certificate'] = data['certificate'].fillna('Unknown')

# Fill the missing value in the 'duration' with 0
data.loc[:,'duration'] = data['duration'].fillna(0)

# Fill the missing value in the 'genre' with 'Unknown'
data.loc[:,'genre'] = data['genre'].fillna('Unknown')

# Fill the missing value in the 'rating' with 0
data.loc[:,'rating'] = data['rating'].fillna(0)

# Fill the missing value in the 'votes' with 0
data.loc[:,'votes'] = data['votes'].fillna(0)

In [10]:
data.isnull().sum() # Checking whether the missing values have been handled

Unnamed: 0,0
title,0
year,0
certificate,0
duration,0
genre,0
rating,0
votes,0


In [11]:
data[data.duplicated()] # Duplication checking

Unnamed: 0,title,year,certificate,duration,genre,rating,votes
7768,Big Vape: The Incendiary Rise of Juul,0,Unknown,0,Documentary,0.0,0
8098,Top Gear,(2002– ),TV-PG,51 min,"Adventure, Comedy, Reality-TV",7.8,210
8170,Top Gear,(2002– ),TV-PG,60 min,"Adventure, Comedy, Reality-TV",7.9,201
8245,Bleach,(2004–2012),TV-14,24 min,"Animation, Action, Adventure",8.4,556
8461,Roman Empire,(2016–2019),TV-MA,0,"Documentary, Biography, Drama",7.2,210
...,...,...,...,...,...,...,...
9843,Hollywood Insider,(2018– ),Unknown,0,Talk-Show,0.0,0
9858,Chocolate,(2019–2020),Unknown,0,"Drama, Romance",8.0,14
9896,Chocolate,(2019–2020),Unknown,0,"Drama, Romance",8.7,11
9899,Chocolate,(2019–2020),Unknown,0,"Drama, Romance",8.9,12


In [12]:
# Handling duplicated data
cleaned_data = data.drop_duplicates()
print(cleaned_data.shape)
cleaned_data.head(5) # See the first five rows of data

(9846, 7)


Unnamed: 0,title,year,certificate,duration,genre,rating,votes
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,177031
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,199885
2,Better Call Saul,(2015–2022),TV-MA,46 min,"Crime, Drama",8.9,501384
3,Devil in Ohio,(2022),TV-MA,356 min,"Drama, Horror, Mystery",5.9,9773
4,Cyberpunk: Edgerunners,(2022– ),TV-MA,24 min,"Animation, Action, Adventure",8.6,15413


In [13]:
cleaned_data.info() # The information of cleaned_data

<class 'pandas.core.frame.DataFrame'>
Index: 9846 entries, 0 to 9956
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        9846 non-null   object 
 1   year         9846 non-null   object 
 2   certificate  9846 non-null   object 
 3   duration     9846 non-null   object 
 4   genre        9846 non-null   object 
 5   rating       9846 non-null   float64
 6   votes        9846 non-null   object 
dtypes: float64(1), object(6)
memory usage: 615.4+ KB


In [14]:
# Convert 'votes' to integers
cleaned_data['votes'] = cleaned_data['votes'].astype(str).str.replace(',', '', regex=True).astype(int)
cleaned_data.info() # Check the updated data types

<class 'pandas.core.frame.DataFrame'>
Index: 9846 entries, 0 to 9956
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        9846 non-null   object 
 1   year         9846 non-null   object 
 2   certificate  9846 non-null   object 
 3   duration     9846 non-null   object 
 4   genre        9846 non-null   object 
 5   rating       9846 non-null   float64
 6   votes        9846 non-null   int64  
dtypes: float64(1), int64(1), object(5)
memory usage: 615.4+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['votes'] = cleaned_data['votes'].astype(str).str.replace(',', '', regex=True).astype(int)


In [15]:
cleaned_data.head(5) # See the first five rows of data

Unnamed: 0,title,year,certificate,duration,genre,rating,votes
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,177031
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,199885
2,Better Call Saul,(2015–2022),TV-MA,46 min,"Crime, Drama",8.9,501384
3,Devil in Ohio,(2022),TV-MA,356 min,"Drama, Horror, Mystery",5.9,9773
4,Cyberpunk: Edgerunners,(2022– ),TV-MA,24 min,"Animation, Action, Adventure",8.6,15413


In [16]:
# Saving the data into CSV file
cleaned_data.to_csv('cleaned_data.csv', index=False)
print("DataFrame have been saved to 'cleaned_data.csv'")

DataFrame have been saved to 'cleaned_data.csv'
