In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.exponential_smoothing.ets import ETSModel

## Load AQI Data

In [2]:
# Read the cleaned air-quality table; the first CSV column becomes the
# (date-parsed) index and every remaining column is one series to model.
aqi_df = pd.read_csv(
    'data/processed/cleaned/cleaned_air.csv',
    index_col=0,
    parse_dates=True,
)
# Declare the index as hourly. pandas validates the assignment, so this
# raises if the timestamps do not conform to an hourly frequency.
aqi_df.index.freq = "h"

## Evaluation Metrics

- sMAPE (Symmetric Mean Absolute Percentage Error)

- MASE (Mean Absolute Scaled Error)

In [3]:
def smape(actual, predicted):
    """Symmetric mean absolute percentage error, returned as a fraction.

    Each term is ``2 * |actual - predicted| / (|actual| + |predicted|)``.
    Where both values are zero the denominator is replaced by 1, so that
    term contributes 0 instead of triggering a division by zero.

    Parameters
    ----------
    actual, predicted : array-like
        Observed and forecast values of matching shape.

    Returns
    -------
    float
        Mean of the per-element symmetric errors (0 means a perfect match).
    """
    y_true, y_pred = np.array(actual), np.array(predicted)
    magnitude_sum = np.abs(y_true) + np.abs(y_pred)
    # Swap zero denominators for 1; the numerator is also 0 in that case.
    safe_sum = np.where(magnitude_sum == 0, 1, magnitude_sum)
    ratios = 2 * np.abs(y_true - y_pred) / safe_sum
    return np.mean(ratios)

def mase_h_step(actual, predicted, train, h):
    """Mean absolute scaled error, scaled by the in-sample h-step naive error.

    The forecast MAE is divided by the MAE of a naive forecast on the
    training history, where each training value is predicted by the value
    observed ``h`` steps earlier.

    Parameters
    ----------
    actual, predicted : array-like
        Observed and forecast values for the evaluation window.
    train : array-like
        Training history used for scaling. Any array-like is accepted
        (pandas Series, NumPy array, list).
    h : int
        Lag of the naive benchmark; must be at least 1.

    Returns
    -------
    float
        ``mae / naive_mae``, or NaN when the naive error is not positive or
        cannot be computed because ``train`` has ``h`` or fewer observations.

    Raises
    ------
    ValueError
        If ``h`` is smaller than 1.
    """
    if h < 1:
        # With h == 0 the slice ``[:-h]`` is empty, which would otherwise
        # surface as an unrelated broadcasting error.
        raise ValueError("h must be a positive integer")

    mae = mean_absolute_error(actual, predicted)

    history = np.asarray(train)
    naive_errors = np.abs(history[h:] - history[:-h])
    if naive_errors.size == 0:
        # Not enough history to form a single h-step difference.
        return np.nan

    naive_mae = np.mean(naive_errors)
    return mae / naive_mae if naive_mae > 0 else np.nan

## ETS Parameter Search

In [4]:
# Candidate settings for each ETS component (None disables the component).
error_types = ['add', 'mul']
trends = ['add', 'mul', None]
seasonals = ['add', 'mul', None]

# Every (error, trend, seasonal) combination, ordered with error varying
# slowest and seasonal varying fastest.
ets_params = [
    (e, t, s)
    for e in error_types
    for t in trends
    for s in seasonals
]

In [5]:
# AICc of every ETS configuration that could be fitted, keyed first by
# series name and then by a compact label such as "A, N, M".
aicc_table = {}
aqi_comps = aqi_df.columns

# Suppress warnings during grid search (expected for invalid param combos)
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for comp in aqi_comps:
        print(f"Searching params for {comp}...")
        aicc_list = {}
        for e, t, s in ets_params:
            # Abbreviate the configuration: add -> A, mul -> M, None -> N.
            label = f"{e}, {t}, {s}".replace('None', 'N').replace('add', 'A').replace('mul', 'M')
            try:
                model = ETSModel(aqi_df[comp], error=e, trend=t, seasonal=s, damped_trend=False, seasonal_periods=24)
                fit = model.fit(disp=False)
                aicc_list[label] = fit.aicc
            except Exception:
                # Skip configurations that cannot be built or fitted for this
                # series. Catching Exception (not a bare except) keeps
                # KeyboardInterrupt/SystemExit working, so a long search can
                # still be interrupted.
                continue

        aicc_table[comp] = aicc_list

print("\nParameter search complete!")

Searching params for carbon_monoxide...
Searching params for pm10...
Searching params for pm2_5...
Searching params for nitrogen_dioxide...
Searching params for ozone...
Searching params for sulphur_dioxide...

Parameter search complete!


In [6]:
# Turn the nested dict into a frame: one row per ETS label, one column per
# series. Configurations that were skipped for a series show up as NaN.
aicc_table = pd.DataFrame(aicc_table)
# Blank out negative AICc values so they are ignored when picking the minimum.
aicc_table = aicc_table.mask(aicc_table < 0)
aicc_table

Unnamed: 0,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
"A, A, A",374055.107609,210249.895059,197501.197888,185873.492679,228218.302956,145252.395607
"A, A, M",417614.440259,302339.408908,319545.301788,,,220952.768853
"A, A, N",380529.235452,207496.006582,195111.069191,196038.250661,216643.856147,151913.714848
"A, M, A",486123.237372,,261173.710753,,,243816.548901
"A, M, M",365021.055952,,196609.131389,,,
"A, M, N",380717.802916,350458.910293,336131.276659,,,151988.46496
"A, N, A",374042.743901,210191.846897,197449.612368,180303.575151,223554.923637,143972.110871
"A, N, M",363506.540294,209713.974753,188057.802364,,,152238.173977
"A, N, N",380508.583372,207432.151429,195054.833996,196005.820262,245432.786704,151883.548587
"M, A, A",567131.317718,395024.300683,326417.044266,,,345443.370822


In [7]:
# For each series, keep the label of the ETS configuration with the lowest
# AICc. idxmin() returns the row label directly and skips NaN entries, so the
# parameter grid in `ets_params` no longer has to be overwritten with the
# table index (which made the search cell fail when re-run).
best_params = {comp: aicc_table[comp].idxmin() for comp in aqi_comps}

best_params

{'carbon_monoxide': 'M, N, M',
 'pm10': 'A, N, N',
 'pm2_5': 'A, N, M',
 'nitrogen_dioxide': 'A, N, A',
 'ozone': 'A, A, N',
 'sulphur_dioxide': 'A, N, A'}

In [8]:
# Map the one-letter labels back to the keyword values ETSModel expects.
ets_name = {'A': 'add', 'N': None, 'M': 'mul'}
for comp in aqi_comps:
    label = best_params[comp]
    # Only string labels still need converting; this keeps the cell safe to
    # re-run after best_params already holds [error, trend, seasonal] lists.
    if isinstance(label, str):
        best_params[comp] = [ets_name[letter] for letter in label.split(', ')]

print("Best parameters per component:")
best_params

Best parameters per component:


{'carbon_monoxide': ['mul', None, 'mul'],
 'pm10': ['add', None, None],
 'pm2_5': ['add', None, 'mul'],
 'nitrogen_dioxide': ['add', None, 'add'],
 'ozone': ['add', 'add', None],
 'sulphur_dioxide': ['add', None, 'add']}

## Model Cross-Validation

Evaluate forecasts over the first 1, 3, and 6 hours of each validation window (each metric is averaged across all steps up to that horizon).

In [9]:
# Forecast horizons (in rows of the hourly frame) to score.
horizons = [1, 3, 6]
max_horizon = max(horizons)

# Number of folds; each fold holds out `max_horizon` rows so that the
# longest horizon can be scored on every split.
splits = 5
tscv = TimeSeriesSplit(n_splits=splits, test_size=max_horizon)

In [10]:
# Per-fold scores, indexed as results_by_horizon[horizon][series][metric].
results_by_horizon = {h: {comp: {'smape': [], 'mase': []} for comp in aqi_comps} for h in horizons}

with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for comp in aqi_comps:
        e, t, s = best_params[comp]

        for train_index, val_index in tscv.split(aqi_df):
            train = aqi_df[comp].iloc[train_index]
            val = aqi_df[comp].iloc[val_index]

            # Use the same damped_trend / seasonal_periods settings as the
            # parameter search, so the validated model is the one that was
            # selected and the seasonal period does not depend on frequency
            # inference from the sliced training index.
            model = ETSModel(train, error=e, trend=t, seasonal=s, damped_trend=False, seasonal_periods=24)
            fit = model.fit(disp=False)
            # Forecast once for the longest horizon, then score its prefixes.
            forecast = fit.forecast(steps=max_horizon)

            for h in horizons:
                # Metrics cover the first h steps of the validation window.
                val_h = val.iloc[:h]
                forecast_h = forecast.iloc[:h]

                smape_val = smape(val_h.values, forecast_h.values)
                mase_val = mase_h_step(val_h.values, forecast_h.values, train, h)

                results_by_horizon[h][comp]['smape'].append(smape_val)
                results_by_horizon[h][comp]['mase'].append(mase_val)

print("Cross-validation complete!")

Cross-validation complete!


In [11]:
# Show one summary table per horizon: rows are metrics, columns are series,
# and each cell is the mean of that metric over the cross-validation folds.
for h in horizons:
    print(f"\nHorizon: {h} hour(s)")
    fold_scores = results_by_horizon[h]
    results = pd.DataFrame({
        metric: {comp: np.mean(fold_scores[comp][metric]) for comp in aqi_comps}
        for metric in ('smape', 'mase')
    }).T
    display(results)


Horizon: 1 hour(s)


Unnamed: 0,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
smape,0.186354,0.069368,0.073427,0.171948,0.406703,0.07846
mase,2.677633,0.92143,1.082446,0.912068,0.40048,1.51807



Horizon: 3 hour(s)


Unnamed: 0,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
smape,0.323477,0.110365,0.118335,0.46043,0.627619,0.15232
mase,1.653008,0.538874,0.529552,0.720624,0.329619,1.217032



Horizon: 6 hour(s)


Unnamed: 0,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
smape,0.355463,0.251841,0.255574,0.625284,0.862894,0.235588
mase,1.296305,0.677532,0.729241,0.781121,0.507929,1.319193


## Fit Model on Full Training Data

In [12]:
# Create the output directory up front so the cell also works on a fresh
# checkout where models/ets does not exist yet.
Path("models/ets").mkdir(parents=True, exist_ok=True)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for comp in aqi_comps:
        e, t, s = best_params[comp]
        # Refit on the full series with the same damped_trend /
        # seasonal_periods settings used during the parameter search, so the
        # saved model matches the configuration that was selected.
        model = ETSModel(aqi_df[comp], error=e, trend=t, seasonal=s, damped_trend=False, seasonal_periods=24)
        fit = model.fit(disp=False)
        fit.save(f"models/ets/{comp}.pickle")
        print(f"Saved {comp} model with params (error={e}, trend={t}, seasonal={s})")

Saved carbon_monoxide model with params (error=mul, trend=None, seasonal=mul)
Saved pm10 model with params (error=add, trend=None, seasonal=None)
Saved pm2_5 model with params (error=add, trend=None, seasonal=mul)
Saved nitrogen_dioxide model with params (error=add, trend=None, seasonal=add)
Saved ozone model with params (error=add, trend=add, seasonal=None)
Saved sulphur_dioxide model with params (error=add, trend=None, seasonal=add)
