# In [5]:
import pandas as pd

# loading the dataset
df = pd.read_csv("Netflix_movies_and_tv_shows_clustering.csv")

# checking missing values: a bare expression is only displayed when it is the
# last statement of a notebook cell (and never in a script), so print the
# per-column missing counts explicitly
print(df.isnull().sum())

# replace missing entries in every object-dtype (text) column with "Unknown"
text_cols = df.select_dtypes(include='object').columns
for col in text_cols:
    df[col] = df[col].fillna("Unknown")

# replace missing entries in int64/float64 columns with that column's median
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
medians = df[num_cols].median()
for col in num_cols:
    df[col] = df[col].fillna(medians[col])

# cleaning text columns (lowercase + trimming leading/trailing whitespace)
for col in text_cols:
    df[col] = df[col].str.lower().str.strip()

# removing duplicate rows; done after the text cleanup so that rows differing
# only by letter case or surrounding whitespace are recognised as duplicates
df = df.drop_duplicates()

# renaming column headers cleanly; this runs before the name-based steps
# below so that headers such as "Country" or "Date Added" are matched too.
# regex=False makes both replacements plain literal substitutions.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
)

# standardizing country values; only cells that are exactly one of these
# spellings are rewritten (values embedded in longer text are left untouched)
if 'country' in df.columns:
    df['country'] = df['country'].replace({
        'united states of america': 'usa',
        'united states': 'usa',
        'us': 'usa'
    })

# converting date column to uniform DD-MM-YYYY text; entries that cannot be
# parsed as dates are coerced to missing values rather than raising
if 'date_added' in df.columns:
    parsed_dates = pd.to_datetime(df['date_added'], errors='coerce')
    df['date_added'] = parsed_dates.dt.strftime('%d-%m-%Y')

# converting numeric-looking object columns to integers
for col in df.columns:
    if df[col].dtype != 'object':
        continue
    # Test a string view of the column with str.isdecimal(): unlike
    # str.isnumeric(), it rejects characters such as "½" or "²" that int()
    # cannot parse, and a missing cell fails the test instead of slipping
    # through .all() and making astype(int) raise.
    as_text = df[col].astype(str)
    if as_text.str.isdecimal().all():
        df[col] = as_text.astype(int)

# converting age column to int if present; values that cannot be parsed as
# numbers (and missing values) become 0
if 'age' in df.columns:
    df['age'] = pd.to_numeric(df['age'], errors='coerce').fillna(0).astype(int)

# persist the cleaned frame as CSV, leaving the row index out of the file
output_path = "cleaned_data.csv"
df.to_csv(output_path, index=False)


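# NOTE(review): stray notebook output — this looks like the source-line echo
# of a warning raised while parsing 'date_added'; kept as a comment so the
# file remains valid Python. TODO confirm and remove.
#   df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')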
