In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

Load AQI data

In [2]:
aqi_df = pd.read_csv('data/processed/cleaned/cleaned_air.csv', index_col=0, parse_dates=True)
aqi_df.index.freq = "h"

## Evaluation Metrics

- sMAPE (Symmetric Mean Absolute Percentage Error)

- MASE (Mean Absolute Scaled Error)

In [3]:
def smape(actual, predicted):
    """Symmetric MAPE - avoids division by zero."""
    actual = np.array(actual)
    predicted = np.array(predicted)
    denominator = np.abs(actual) + np.abs(predicted)
    # Avoid 0/0 case
    denominator = np.where(denominator == 0, 1, denominator)
    return np.mean(2 * np.abs(actual - predicted) / denominator)

def mase_h_step(actual, predicted, train, h):
    """MASE scaled by h-step naive error.
    h-step naive: predict y[t+h] = y[t] (use value from h steps ago)
    """
    mae = mean_absolute_error(actual, predicted)
    # h-step naive errors from training set: |y[t] - y[t-h]|
    naive_errors = np.abs(train.values[h:] - train.values[:-h])
    naive_mae = np.mean(naive_errors)
    return mae / naive_mae if naive_mae > 0 else np.nan

## Find Optimal ARIMA Parameters (d, q)

- **p = 2**
- **d** is determined using the ADF test (stationarity check)
- **q** is found via grid search minimizing AIC

In [4]:
p = 2

def find_d(series, max_d=2):
    """Find the differencing order d using ADF test."""
    for d in range(max_d + 1):
        diff_series = series.diff(d).dropna() if d > 0 else series
        adf_result = adfuller(diff_series)
        if adf_result[1] < 0.05:
            return d
    return max_d

def find_best_q(series, p=2, d=0, q_range=range(0, 4)):
    """Grid search for optimal q using AIC, with fixed p."""
    best_aic = float('inf')
    best_q = 0
    for q in q_range:
        try:
            model = ARIMA(series, order=(p, d, q))
            fit = model.fit()
            if fit.aic < best_aic:
                best_aic = fit.aic
                best_q = q
        except:
            continue
    return best_q, best_aic

In [5]:
aqi_comps = aqi_df.columns
best_params = {}

for comp in aqi_comps:
    print(f"Finding parameters for {comp}...")
    d = find_d(aqi_df[comp])
    q, aic = find_best_q(aqi_df[comp], p=p, d=d)
    best_params[comp] = (p, d, q)
    print(f"  Best order: ({p}, {d}, {q}), AIC: {aic:.2f}")

print("\nOptimal parameters:")
best_params

Finding parameters for carbon_monoxide...
  Best order: (2, 0, 3), AIC: 348147.81
Finding parameters for pm10...
  Best order: (2, 0, 1), AIC: 194938.97
Finding parameters for pm2_5...
  Best order: (2, 0, 1), AIC: 182402.18
Finding parameters for nitrogen_dioxide...
  Best order: (2, 0, 3), AIC: 176549.96
Finding parameters for ozone...
  Best order: (2, 0, 3), AIC: 206434.37
Finding parameters for sulphur_dioxide...
  Best order: (2, 0, 3), AIC: 133112.48

Optimal parameters:


{'carbon_monoxide': (2, 0, 3),
 'pm10': (2, 0, 1),
 'pm2_5': (2, 0, 1),
 'nitrogen_dioxide': (2, 0, 3),
 'ozone': (2, 0, 3),
 'sulphur_dioxide': (2, 0, 3)}

## Model Cross-Validation

Evaluate at multiple forecast horizons: 1, 3, and 6 hours ahead

In [6]:
splits = 5
horizons = [1, 3, 6]
max_horizon = max(horizons)
tscv = TimeSeriesSplit(n_splits=splits, test_size=max_horizon)

In [7]:
results_by_horizon = {h: {comp: {'smape': [], 'mase': []} for comp in aqi_comps} for h in horizons}

for comp in aqi_comps:
    p, d, q = best_params[comp]
    
    for train_index, val_index in tscv.split(aqi_df):
        train = aqi_df[comp].iloc[train_index]
        val = aqi_df[comp].iloc[val_index]
        
        model = ARIMA(train, order=(p, d, q))
        fit = model.fit()
        forecast = fit.forecast(steps=max_horizon)
        
        for h in horizons:
            val_h = val.iloc[:h]
            forecast_h = forecast.iloc[:h]
            
            # sMAPE: symmetric, avoids division by zero
            smape_val = smape(val_h.values, forecast_h.values)
            # MASE: scaled by h-step naive error
            mase_val = mase_h_step(val_h.values, forecast_h.values, train, h)
            
            results_by_horizon[h][comp]['smape'].append(smape_val)
            results_by_horizon[h][comp]['mase'].append(mase_val)

print("Cross-validation complete!")

Cross-validation complete!


In [10]:
for h in horizons:
    print(f"\nHorizon: {h} hour(s)")
    smape_means = {comp: np.mean(results_by_horizon[h][comp]['smape']) for comp in aqi_comps}
    mase_means = {comp: np.mean(results_by_horizon[h][comp]['mase']) for comp in aqi_comps}
    results = pd.DataFrame({'smape': smape_means, 'mase': mase_means}).T
    display(results)


Horizon: 1 hour(s)


Unnamed: 0,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
smape,0.081049,0.140527,0.144938,0.176527,0.398586,0.029816
mase,1.513131,0.836583,1.063501,0.505287,0.396341,0.616697



Horizon: 3 hour(s)


Unnamed: 0,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
smape,0.203688,0.175719,0.187394,0.2909,0.481814,0.080662
mase,1.258919,0.394716,0.496598,0.362654,0.231137,0.719074



Horizon: 6 hour(s)


Unnamed: 0,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
smape,0.387123,0.226096,0.253336,0.439522,0.653583,0.18607
mase,1.556345,0.454637,0.593498,0.609158,0.330957,0.996941


## Fit Model on Full Training Data

In [9]:
for comp in aqi_comps:
    p, d, q = best_params[comp]
    model = ARIMA(aqi_df[comp], order=(p, d, q))
    fit = model.fit()
    fit.save(f"models/arima/{comp}.pickle")
    print(f"Saved {comp} model with order ({p}, {d}, {q})")

Saved carbon_monoxide model with order (2, 0, 3)
Saved pm10 model with order (2, 0, 1)
Saved pm2_5 model with order (2, 0, 1)
Saved nitrogen_dioxide model with order (2, 0, 3)
Saved ozone model with order (2, 0, 3)
Saved sulphur_dioxide model with order (2, 0, 3)
