# Data Cleaning Notebook
This notebook loads the raw dataset, inspects it, applies the cleaning steps, compares the result with the provided cleaned dataset, and saves the result as `cleaned_dataset_final.csv`.

In [None]:
import pandas as pd
import numpy as np

# Read both CSV files: the raw data to clean and the provided cleaned copy
raw, cleaned = (
    pd.read_csv(csv_name)
    for csv_name in ('raw_dataset.csv', 'cleaned_dataset.csv')
)

# Preview the first rows of the raw data
raw.head()

In [None]:
# Overview of the raw dataset: print its structure, then show summary
# statistics for every column (numeric and non-numeric alike)
raw.info()
raw_summary = raw.describe(include='all')
raw_summary

In [None]:
# Count missing values per column and list the most affected columns first
missing_per_column = raw.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Cleaning steps:
# 1. Fill numeric columns with median
#    The filled column is assigned back to `raw` rather than calling
#    fillna(inplace=True) on `raw[col]`: that chained in-place form emits a
#    FutureWarning in pandas 2.x and, with Copy-on-Write (the default from
#    pandas 3.0), only modifies a temporary object, leaving `raw` unchanged.
for col in raw.select_dtypes(include='number').columns:
    raw[col] = raw[col].fillna(raw[col].median())

# 2. Fill categorical columns with mode
for col in raw.select_dtypes(exclude='number').columns:
    if not raw[col].isnull().any():
        continue
    modes = raw[col].mode()
    # mode() ignores missing values, so a column that is entirely missing
    # yields an empty result; there is nothing to impute with, so skip it
    # instead of failing on modes.iloc[0].
    if modes.empty:
        continue
    raw[col] = raw[col].fillna(modes.iloc[0])

# 3. Drop duplicates (exact duplicate rows; the first occurrence is kept)
raw.drop_duplicates(inplace=True)

raw.head()

In [None]:
# Compare with provided cleaned dataset
print('Raw cleaned shape:', raw.shape)
print('Provided cleaned shape:', cleaned.shape)

# Check if numeric stats are close
raw_stats = raw.describe()
cleaned_stats = cleaned.describe()

# np.allclose compares purely by position and raises if the two tables
# cannot be broadcast together. Only compare when both summaries have the
# same statistic rows and the same set of columns, and line the columns up
# by label so a different column order does not distort the result.
comparable = (
    raw_stats.index.equals(cleaned_stats.index)
    and set(raw_stats.columns) == set(cleaned_stats.columns)
)
stats_match = comparable and np.allclose(
    raw_stats,
    cleaned_stats[raw_stats.columns],
    equal_nan=True,
)
stats_match

In [None]:
# Write the cleaned data to a new CSV file, leaving out the row index
raw.to_csv(
    path_or_buf='cleaned_dataset_final.csv',
    index=False,
)
print('Cleaned dataset saved as cleaned_dataset_final.csv')