# 03 - Model Training

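This notebook trains a **RandomForestRegressor** to predict AQI from six pollutant concentrations (PM2.5, PM10, NO2, SO2, CO, O3), evaluates it on a 20% hold-out split, and saves the fitted model to disk.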

In [1]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# --- Configuration ---------------------------------------------------------
PROC_PATH = '../data/processed_air_quality.csv'
MODEL_DIR = '../model'
MODEL_PATH = os.path.join(MODEL_DIR, 'aqi_model.pkl')

FEATURES = ['PM2.5','PM10','NO2','SO2','CO','O3']  # model inputs
TARGET = 'AQI'                                     # value being predicted

# --- Load processed dataset --------------------------------------------------
df = pd.read_csv(PROC_PATH)
print('Loaded processed data:', df.shape)

# Fail early with a readable message if the label column is absent
# (same exception type as the bare df['AQI'] lookup would raise).
if TARGET not in df.columns:
    raise KeyError(f"Target column '{TARGET}' not found in {PROC_PATH}")

# Best-effort: a pollutant column absent from the file is created as all-zero
# so the feature layout stays fixed. Report it, because an all-zero feature
# carries no signal and usually means the upstream schema changed.
missing_features = [f for f in FEATURES if f not in df.columns]
for f in missing_features:
    df[f] = 0.0
if missing_features:
    print('⚠️ Feature columns missing from data, filled with 0.0:', missing_features)

# Rows without a label must be dropped BEFORE imputing: a blanket fillna(0)
# would turn a missing AQI into a fake "AQI = 0" example, which skews both the
# fitted model and the validation metrics.
n_unlabeled = int(df[TARGET].isna().sum())
if n_unlabeled:
    print(f'⚠️ Dropping {n_unlabeled} rows with missing {TARGET}')
df = df.dropna(subset=[TARGET]).fillna(0)  # remaining gaps are zero-imputed

X = df[FEATURES]
y = df[TARGET]

# --- Train/validation split --------------------------------------------------
# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Train model -------------------------------------------------------------
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# --- Evaluate quickly --------------------------------------------------------
# Only headline numbers on the hold-out split are printed here.
pred_val = model.predict(X_val)
mae = mean_absolute_error(y_val, pred_val)
r2 = r2_score(y_val, pred_val)
print(f'MAE: {mae:.2f} | R^2: {r2:.3f}')

# --- Save model --------------------------------------------------------------
os.makedirs(MODEL_DIR, exist_ok=True)  # create the output folder on first run
joblib.dump(model, MODEL_PATH)
print('✅ Model saved to', MODEL_PATH)

Loaded processed data: (29531, 18)
MAE: 23.30 | R^2: 0.877
✅ Model saved to ../model\aqi_model.pkl


➡️ Proceed to **04_evaluation.ipynb** for detailed evaluation and visuals.