In [1]:
# Step 1: Import required libraries
import pandas as pd


def clean_netflix_titles(raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a Netflix-titles style DataFrame.

    The input frame is not modified. Steps, in order:

    1. Normalise column headers (strip, lower-case, spaces -> underscores).
       This runs first so that every later step can rely on the normalised
       column names even when the file's headers are padded or capitalised.
    2. Drop fully duplicated rows.
    3. Drop rows with no ``title``; fill placeholder text into the other
       descriptive columns.
    4. Standardise the casing/whitespace of ``type``, ``country``, ``rating``.
    5. Parse ``date_added`` to datetime (unparseable values become ``NaT``).
    6. Convert ``release_year`` to nullable ``Int64`` (bad values -> ``<NA>``).

    Raises:
        KeyError: if one of the expected columns is absent after header
            normalisation.
    """
    cleaned = raw.copy()

    # Clean column headers before touching any column by name.
    cleaned.columns = (
        cleaned.columns.str.strip().str.lower().str.replace(" ", "_")
    )

    # Remove duplicate rows.
    cleaned = cleaned.drop_duplicates()

    # Handle missing values: 'title' is essential, the rest get placeholders.
    cleaned = cleaned.dropna(subset=['title'])
    placeholders = {
        'country': 'Unknown',
        'cast': 'Not Available',
        'director': 'Not Available',
        'rating': 'Not Rated',
    }
    for column, placeholder in placeholders.items():
        cleaned[column] = cleaned[column].fillna(placeholder)

    # Standardize text fields.
    cleaned['type'] = cleaned['type'].str.strip().str.lower()
    cleaned['country'] = cleaned['country'].str.strip().str.title()
    cleaned['rating'] = cleaned['rating'].str.strip().str.upper()

    # Convert dates to a consistent format. Strip surrounding whitespace
    # first: with errors='coerce', a value such as " August 4, 2017" can
    # otherwise fail to match the inferred format and silently become NaT.
    date_text = cleaned['date_added'].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
    cleaned['date_added'] = pd.to_datetime(date_text, errors='coerce')

    # 'release_year' should be an integer; Int64 keeps missing values as <NA>.
    cleaned['release_year'] = pd.to_numeric(
        cleaned['release_year'], errors='coerce'
    ).astype("Int64")

    return cleaned


# The guard keeps the file I/O out of plain imports; inside a notebook cell
# (or when run as a script) __name__ is "__main__", so this still executes
# and `df` is still defined at top level for later cells.
if __name__ == "__main__":
    # Step 2: Load the dataset
    file_path = "/content/netflix_titles.csv"  # path for your uploaded file
    df = pd.read_csv(file_path)

    # Step 3: Initial inspection
    print("Initial Shape:", df.shape)
    print("\nMissing values:\n", df.isnull().sum())
    print("\nDuplicate Rows:", df.duplicated().sum())

    # Steps 4-9: Deduplicate, fill gaps, standardise text, fix dtypes
    df = clean_netflix_titles(df)

    # Step 10: Final check
    print("\nFinal Shape:", df.shape)
    print("\nFinal Data Types:\n", df.dtypes)

    # Step 11: Save cleaned data
    df.to_csv("cleaned_netflix_titles.csv", index=False)
    print("\n✅ Cleaned dataset saved as 'cleaned_netflix_titles.csv'")


Initial Shape: (8807, 12)

Missing values:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

Duplicate Rows: 0

Final Shape: (8807, 12)

Final Data Types:
 show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             Int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object

✅ Cleaned dataset saved as 'cleaned_netflix_titles.csv'
