# 02 - Preprocessing

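This notebook loads the raw air-quality data, ensures the required pollutant and AQI columns are present (adding a zero-filled column for any that is absent), derives `Year` and `Month` from the date column when one exists, fills missing numeric values with 0, and writes a processed dataset for modeling.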

In [1]:
import pandas as pd
import numpy as np

# File locations: raw input CSV and the processed CSV this notebook writes.
RAW_PATH = '../data/air_quality.csv'
PROC_PATH = '../data/processed_air_quality.csv'

# Read the raw file and trim surrounding whitespace from every header name,
# so later lookups by exact column name succeed.
df = pd.read_csv(RAW_PATH).rename(columns=str.strip)

print('Before cleaning:', df.shape)

# Guarantee that every column named in REQUIRED is present. Any column the
# raw file lacks is created as a constant 0.0 column and reported.
REQUIRED = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3', 'AQI']
absent_required = [name for name in REQUIRED if name not in df.columns]
for name in absent_required:
    df[name] = 0.0  # safe default
    print(f'Added missing column: {name} (filled with 0)')

# Parse the first datetime-like column, if any. A column qualifies when its
# name, lower-cased, is 'date', 'datetime' or 'timestamp'. `time_col` holds
# that column's name, or None when no such column exists.
date_cols = [c for c in df.columns if c.lower() in ['date', 'datetime', 'timestamp']]
if date_cols:
    time_col = date_cols[0]
    missing_before = int(df[time_col].isna().sum())
    # errors='coerce' converts values that cannot be parsed into NaT instead
    # of raising, so count how many were lost and report it rather than
    # letting them disappear silently.
    df[time_col] = pd.to_datetime(df[time_col], errors='coerce')
    unparseable = int(df[time_col].isna().sum()) - missing_before
    if unparseable > 0:
        print(f'Warning: {unparseable} value(s) in {time_col!r} could not be parsed as dates (set to NaT)')
    # Rows whose timestamp is NaT get NaN for Year/Month here.
    df['Year'] = df[time_col].dt.year
    df['Month'] = df[time_col].dt.month
else:
    time_col = None

# Clean string columns (e.g., City): trim whitespace and title-case the names.
if 'City' in df.columns:
    city_missing = df['City'].isna()
    city_clean = df['City'].astype(str).str.strip().str.title()
    # astype(str) renders missing values as the text 'nan', which title-casing
    # turns into 'Nan' -- indistinguishable from a genuine city name. Replace
    # those entries with an explicit placeholder so the column stays all-string
    # while missing cities remain identifiable.
    df['City'] = city_clean.mask(city_missing, 'Unknown')

# Replace missing values in every numeric column with 0 (per the original
# note, this mirrors what the training script does).
# NOTE(review): when a date column was parsed above, this also zero-fills
# Year/Month for rows whose timestamp is NaT -- confirm that is intended.
numeric_cols = df.select_dtypes(include='number').columns
filled_numeric = df[numeric_cols].fillna(0)
df[numeric_cols] = filled_numeric

# Final column order: City and the time column first (when present), then
# the required columns, then every remaining column in its existing order.
cols_order = []
if 'City' in df.columns:
    cols_order.append('City')
if time_col:
    cols_order.append(time_col)
# Reuse REQUIRED instead of repeating the literal list, so the two cannot
# drift apart if a required column is added or renamed.
cols_order += REQUIRED
cols_order += [c for c in df.columns if c not in cols_order]  # append the rest
df = df[cols_order]

# Report the cleaned frame's shape, then write it to PROC_PATH without the
# index column and confirm where it was saved.
final_shape = df.shape
print('After cleaning:', final_shape)
df.to_csv(PROC_PATH, index=False)
print('✅ Preprocessing complete. Saved:', PROC_PATH)

Before cleaning: (250, 9)
After cleaning: (250, 11)
✅ Preprocessing complete. Saved: ../data/processed_air_quality.csv


➡️ Proceed to **03_model_training.ipynb** for model building and training.