In [1]:
import pandas as pd
import numpy as np

# Load Dataset
# The path is relative, so the CSV must sit in the current working directory.
df = pd.read_csv("netflix_titles.csv")
df.head()

# 1. Standardize Column Names

# Normalize every header to lower case with underscores instead of spaces,
# so columns can be referenced uniformly in the steps that follow.
df.columns = [name.lower().replace(" ", "_") for name in df.columns]
df.columns

# 2. Handle Missing Values

# Label gaps in these three columns explicitly with "Unknown" so that
# later string operations on them never encounter NaN.
df[['director','cast','country']] = df[['director','cast','country']].fillna("Unknown")

# Convert date_added to datetime.
# Trim surrounding whitespace first: with errors='coerce', a value that
# fails to parse (e.g. one carrying a stray leading space that does not
# match the format pandas settles on) is silently turned into NaT, which
# would discard otherwise valid dates. Missing entries stay missing.
df['date_added'] = pd.to_datetime(df['date_added'].str.strip(), errors='coerce')

# Per-column count of values that are still missing after the steps above.
df.isnull().sum()

# 3. Remove Duplicates

# Drop rows that repeat an earlier row in every column (the first
# occurrence is kept); the frame is modified in place.
df.drop_duplicates(inplace=True)
df.shape

# 4. Standardize Text Columns

text_cols = ['type','title','director','cast','country','rating','listed_in','description']

for col in text_cols:
    # Trim surrounding whitespace from real values only. Casting the whole
    # column with astype(str) would turn every remaining NaN into the
    # literal text "nan", hiding missing data from isnull()/fillna();
    # where() keeps the original entry wherever the value is missing.
    df[col] = df[col].where(df[col].isna(), df[col].astype(str).str.strip())

df.head()

# 5. Split Duration Column

# Pull the leading number and the word that follows it out of `duration`.
# The extraction yields one column per capture group (0 = digits, 1 = unit);
# rows that do not match the pattern come back as NaN in both.
duration_parts = df['duration'].str.extract(r'(\d+)\s*(\w+)')

# Numeric amount first, then the unit, so the new columns are appended
# in the order duration_int, duration_unit.
df['duration_int'] = pd.to_numeric(duration_parts[0], errors='coerce')
df['duration_unit'] = duration_parts[1]

df.loc[:, ['duration','duration_int','duration_unit']].head()

# 6. Save Cleaned Dataset

# Write the cleaned frame without the index column.
output_path = "netflix_cleaned.csv"
df.to_csv(output_path, index=False)

print("Cleaning completed! File saved as netflix_cleaned.csv")


Cleaning completed! File saved as netflix_cleaned.csv
