# AQI Forecasting — Delhi (PM2.5)
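End-to-end walkthrough: exploratory data analysis (EDA) → feature engineering → RandomForest training → forecast evaluation on a held-out validation split.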

In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from src.utils import parse_and_sort, add_time_features, make_supervised, train_valid_split_time
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# --- Experiment configuration -------------------------------------------
DATA = 'dataset/sample_delhi_aqi.csv'  # input CSV (relative path)
TARGET = 'pm25'    # name of the column being forecast
HORIZON = 1        # how many rows ahead the target is shifted later on
MAX_LAG = 14       # forwarded to make_supervised as max_lag
TEST_SIZE = 0.2    # forwarded to train_valid_split_time

# Read the CSV and pass it, with the name of the date column, to the project
# helper (presumably parses the dates and sorts by them — see src/utils).
df = parse_and_sort(pd.read_csv(DATA), 'date')
df.head()

In [None]:
# Quick look at the raw target series, indexed by date.
# The column is selected through TARGET (instead of a hard-coded 'pm25') so
# the plot follows the configuration used by the rest of the notebook.
df.set_index('date')[TARGET].plot(figsize=(10,3))
plt.title('PM2.5 over time')
plt.show()

In [None]:
# Feature engineering via the project helpers. Their exact output is defined
# in src/utils; presumably date-derived columns plus lag / rolling-window
# columns of the target — confirm there.
df = add_time_features(df, 'date')
df = make_supervised(df, target=TARGET, max_lag=MAX_LAG, roll_windows=(7,14))

# Turn the target into "value HORIZON rows ahead". The shift leaves NaN in
# the last HORIZON rows; dropna() removes those along with any other row
# that still contains a missing value.
df[TARGET] = df[TARGET].shift(-HORIZON)
df = df.dropna().reset_index(drop=True)

# Every column except the date and the target is used as a model input.
X_cols = [c for c in df.columns if c not in ('date', TARGET)]

# Split via the project helper (presumably chronological — see src/utils),
# then separate inputs from targets for each part.
train, valid = train_valid_split_time(df, TEST_SIZE)
Xtr, Xva = train[X_cols], valid[X_cols]
ytr, yva = train[TARGET], valid[TARGET]

In [None]:
# Preprocessing applied to every feature column: fill missing values with the
# column median, then scale to unit variance (with_mean=False: no centering).
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler(with_mean=False)),
])
pre = ColumnTransformer([('num', numeric_steps, X_cols)])
model = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)

# Chain preprocessing and the forest so both are fitted on training data only.
pipe = Pipeline([('pre', pre), ('model', model)])
pipe.fit(Xtr, ytr)
preds = pipe.predict(Xva)

# Validation metrics.
mae = mean_absolute_error(yva, preds)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form runs on both old and current releases.
rmse = float(np.sqrt(mean_squared_error(yva, preds)))
r2 = r2_score(yva, preds)
mae, rmse, r2

In [None]:
# Overlay the validation targets and the model's predictions; both are drawn
# against row position within the validation split.
plt.figure(figsize=(10,3))
for series, name in ((yva.values, 'Actual'), (preds, 'Predicted')):
    plt.plot(series, label=name)
plt.legend()
plt.title('Actual vs Predicted')
plt.show()

To run training from the command line instead of this notebook, use the CLI: `python src/train.py --data dataset/sample_delhi_aqi.csv --target pm25 --horizon 1`