# 02. Data Cleaning & Preprocessing

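This notebook assesses the quality of the raw enrolment, demographic, and biometric datasets, then walks through the preprocessing pipeline step by step before running it end to end.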

In [None]:
import sys
# Put the parent directory first on the import path so the `src` package
# imported below can be resolved (assumes the notebook is run from a
# subdirectory of the project root — TODO confirm).
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
from src.data_loader import load_all_datasets
from src.preprocessing import (
    preprocess_all, get_data_quality_report,
    parse_dates, validate_pincode, normalize_state_names,
    add_temporal_features, add_enrolment_totals
)

In [None]:
# Load the raw datasets. The result of load_all_datasets() is unpacked into
# three names, in the order enrolment, demographic, biometric; these raw
# objects are reused by the quality-report and preprocessing cells.
enrolment_raw, demographic_raw, biometric_raw = load_all_datasets()

## Data Quality Reports

In [None]:
# Build one quality report per raw dataset. Each report is stored under the
# lowercase form of the label that is passed to get_data_quality_report.
quality_reports = {
    label.lower(): get_data_quality_report(frame, label)
    for label, frame in (
        ('Enrolment', enrolment_raw),
        ('Demographic', demographic_raw),
        ('Biometric', biometric_raw),
    )
}

# Print a short summary per dataset: a blank line, the upper-cased name, then
# the row count, duplicate count, and memory footprint read from the report.
for name, report in quality_reports.items():
    print(
        f"\n{name.upper()}",
        f"  Total rows: {report['total_rows']:,}",
        f"  Duplicates: {report['duplicates']:,}",
        f"  Memory: {report['memory_mb']:.1f} MB",
        sep="\n",
    )

## Step-by-Step Preprocessing

In [None]:
# Step 1: Parse dates
# parse_dates receives a copy of the raw enrolment data, so `df` (the working
# frame for the step-by-step cells) is a separate object from enrolment_raw.
df = parse_dates(enrolment_raw.copy())

# Report the resulting dtype and the span covered by the 'date' column.
print(
    f"Date column dtype after parsing: {df['date'].dtype}",
    f"Date range: {df['date'].min()} to {df['date'].max()}",
    sep="\n",
)

In [None]:
# Step 2: Validate PIN codes
# Compare the row count before and after validate_pincode to report how many
# records the step removed.
before_count = len(df)
df = validate_pincode(df)
after_count = len(df)

removed = before_count - after_count
# Guard the percentage: with an empty input frame the division would raise
# ZeroDivisionError, so report 0.00% in that case instead.
removed_pct = removed / before_count * 100 if before_count else 0.0

print(f"Records before PIN validation: {before_count:,}")
print(f"Records after PIN validation: {after_count:,}")
print(f"Removed: {removed:,} ({removed_pct:.2f}%)")

In [None]:
# Step 3: Normalize state names
df = normalize_state_names(df)

# Number of distinct values in the 'state' column after normalization.
print("Unique states:", df['state'].nunique())

# Cell output: the first ten entries of the per-state record counts.
df['state'].value_counts().head(n=10)

In [None]:
# Step 4: Add temporal features
df = add_temporal_features(df)

# The printed list is hard-coded rather than derived from the frame.
print(f"New columns: {['year', 'month', 'quarter', 'day_of_week', 'month_name', 'week']}")

# Cell output: preview 'date' next to the listed columns ('week' is printed
# above but not included in this preview).
df.loc[:, ['date', 'year', 'month', 'quarter', 'day_of_week', 'month_name']].head()

In [None]:
# Step 5: Add total enrolments
df = add_enrolment_totals(df)

# Cell output: preview the three age-band columns next to 'total_enrolments'.
df[[
    'age_0_5',
    'age_5_17',
    'age_18_greater',
    'total_enrolments',
]].head(n=5)

## Full Preprocessing Pipeline

In [None]:
# Run the whole pipeline on the three raw datasets in a single call; the
# result is unpacked in the same order as the arguments.
enrolment, demographic, biometric = preprocess_all(
    enrolment_raw,
    demographic_raw,
    biometric_raw,
)

# One summary line per processed dataset with its record count.
print("\n".join(
    f"{label}: {len(frame):,} records"
    for label, frame in (
        ('Enrolment', enrolment),
        ('Demographic', demographic),
        ('Biometric', biometric),
    )
))

In [None]:
# Print a summary of the processed enrolment frame (DataFrame.info reports
# columns, dtypes, and non-null counts — assumes `enrolment` is a pandas
# DataFrame; TODO confirm against preprocess_all).
enrolment.info()

In [None]:
# Cell output: preview the first rows of the processed enrolment data.
enrolment.head()