In [1]:
import pandas as pd


In [2]:
# Read the raw Netflix catalogue from the working directory into a DataFrame.
RAW_CSV_PATH = 'netflix_titles.csv'
df = pd.read_csv(RAW_CSV_PATH)


In [3]:
# Step 1: Tally the null entries in every column and print the tally.
missing_per_column = df.isna().sum()
print("Missing values per column:\n", missing_per_column)


Missing values per column:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [4]:
# Step 2: Handle missing values
# Text columns get a placeholder string; date_added and rating get their most
# frequent value (mode()[0] is the first mode when there are ties).
fill_values = {
    'director': 'Unknown',
    'cast': 'Not Specified',
    'country': 'Unknown',
    'date_added': df['date_added'].mode()[0],
    'rating': df['rating'].mode()[0],
    'duration': 'Unknown',
}
# A single frame-level fillna replaces the per-column
# `df[col].fillna(..., inplace=True)` calls: that form mutates an intermediate
# column selection, which pandas 2.x warns about and which no longer updates
# `df` once Copy-on-Write is enabled.
df = df.fillna(value=fill_values)


In [5]:
# Step 3: Discard rows that are exact duplicates of an earlier row,
# keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')


In [6]:
# Step 4: Normalise the column headers in three passes:
# trim surrounding whitespace, lower-case, then turn spaces into underscores.
trimmed_names = df.columns.str.strip()
lowered_names = trimmed_names.str.lower()
df.columns = lowered_names.str.replace(' ', '_')



In [7]:
# Step 5: Convert date_added to datetime format
date_added = df['date_added']
if not pd.api.types.is_datetime64_any_dtype(date_added):
    # Trim surrounding whitespace before parsing. With errors='coerce' any
    # value that does not match the inferred format becomes NaT without
    # warning, so a stray leading/trailing space would silently re-introduce
    # missing dates. Assumes the column holds text at this point; the guard
    # keeps the cell re-runnable once the column is already datetime.
    date_added = date_added.str.strip()
df['date_added'] = pd.to_datetime(date_added, errors='coerce')


In [8]:
# Step 6: Write the cleaned frame to a new CSV, omitting the index column.
CLEANED_CSV_PATH = 'netflix_titles_cleaned.csv'
df.to_csv(CLEANED_CSV_PATH, index=False)



In [9]:
# Step 7: Final check
print("\n✅ Data Cleaning Completed Successfully!")
print("Cleaned dataset saved as: netflix_titles_cleaned.csv")
print("\nUpdated Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()


✅ Data Cleaning Completed Successfully!
Cleaned dataset saved as: netflix_titles_cleaned.csv

Updated Data Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8807 non-null   object        
 1   type          8807 non-null   object        
 2   title         8807 non-null   object        
 3   director      8807 non-null   object        
 4   cast          8807 non-null   object        
 5   country       8807 non-null   object        
 6   date_added    8807 non-null   datetime64[ns]
 7   release_year  8807 non-null   int64         
 8   rating        8807 non-null   object        
 9   duration      8807 non-null   object        
 10  listed_in     8807 non-null   object        
 11  description   8807 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(10)
memory usage: 894.5+ KB
None
