In [24]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.exponential_smoothing.ets import ETSModel, ETSResults

Load AQI data

In [25]:
# Read the cleaned air-quality table, parse its 'time' column and use it as the index.
aqi_df = (
    pd.read_csv('data/processed/cleaned/cleaned_air.csv')
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .set_index('time')
)
# Declare the index frequency as hourly.
aqi_df.index.freq = "h"

Define the search space for the ETS error (E), trend (T), and seasonal (S) components

In [26]:
# Candidate settings for each ETS component.
error_types = ['add', 'mul']
trends = ['add', 'mul', None]
seasonals = ['add', 'mul', None]

# Every (error, trend, seasonal) combination; the seasonal option varies fastest.
ets_params = [
    (err, trend, seasonal)
    for err in error_types
    for trend in trends
    for seasonal in seasonals
]

Parameter search pipeline

In [27]:
# Short codes used to label a configuration, e.g. ('add', None, 'mul') -> "A, N, M".
ets_code = {'add': 'A', 'mul': 'M', None: 'N'}

aicc_table = {}  # component -> {label: AICc of the fitted model}
fit_errors = {}  # (component, label) -> repr of the exception that aborted the fit
aqi_comps = aqi_df.columns

for comp in aqi_comps:
    aicc_list = {}
    for e, t, s in ets_params:
        label = ", ".join(ets_code[part] for part in (e, t, s))
        try:
            model = ETSModel(aqi_df[comp], error=e, trend=t, seasonal=s, damped_trend=False, seasonal_periods=24)
            fit = model.fit(disp=False)
            aicc_list[label] = fit.aicc
        except Exception as exc:
            # Some combinations (like multiplicative on negative data) are invalid.
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt can still stop this long-running search, and keep
            # the reason so skipped configurations can be inspected afterwards.
            fit_errors[(comp, label)] = repr(exc)
            continue

    aicc_table[comp] = aicc_list

  return (data - yhat) / yhat
  return (data - yhat) / yhat
  logL = -self.nobs / 2 * (np.log(2 * np.pi * np.mean(res ** 2)) + 1)
  yhat[yhat <= 0] = 1 / (1e-8 * (1 + np.abs(yhat[yhat <= 0])))
  logL -= np.sum(np.log(yhat))
  logL -= np.sum(np.log(yhat))
  logL = -self.nobs / 2 * (np.log(2 * np.pi * np.mean(res ** 2)) + 1)
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  logL = -self.nobs / 2 * (np.log(2 * np.pi * np.mean(res ** 2)) + 1)
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Choose the model with the lowest AICc

In [28]:
# Convert the nested {component: {label: AICc}} mapping into a DataFrame
# with one row per configuration label and one column per component.
aicc_table = pd.DataFrame(aicc_table)
# Replace negative AICc values with NaN.
# NOTE(review): presumably negative values are treated as degenerate fits — confirm.
aicc_table = aicc_table.mask(aicc_table < 0)
aicc_table

Unnamed: 0,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
"A, A, A",374055.107609,210249.895059,197501.197945,185873.492681,228218.302954,145252.395601
"A, A, M",418095.540726,305186.357104,322221.12297,,,220944.840613
"A, A, N",380529.235452,207496.006582,195111.069199,196038.250661,216643.862071,151913.714848
"A, M, A",486123.237359,275566.372723,281338.357591,,,157493.387001
"A, M, M",364962.339749,306054.93586,,,,
"A, M, N",380717.802916,350458.910275,336131.276657,,,151988.46496
"A, N, A",374042.743901,210191.846897,197449.612363,180303.56139,223554.96977,149099.844118
"A, N, M",363507.586617,209713.974359,188064.910173,,,152238.122551
"A, N, N",380508.583372,207432.151429,195054.833996,196005.820262,245432.786704,151883.548587
"M, A, A",540893.546445,424075.131296,322845.18578,,,354195.980515


In [29]:
# Pick, for every component, the configuration label with the lowest AICc.
# The label index is read directly from `aicc_table` instead of rebinding
# `ets_params`, so the parameter-search cell can still be re-run afterwards.
best_params = {}
for comp in aqi_comps:
    valid_aicc = aicc_table[comp].dropna()
    if valid_aicc.empty:
        # A positional argmin on an all-NaN column yields -1, which would
        # silently select the last label; fail loudly instead.
        raise ValueError(f"No valid AICc value available for '{comp}'")
    best_params[comp] = valid_aicc.idxmin()

best_params

{'carbon_monoxide': 'M, N, M',
 'pm10': 'A, N, N',
 'pm2_5': 'A, N, M',
 'nitrogen_dioxide': 'A, N, A',
 'ozone': 'A, A, N',
 'sulphur_dioxide': 'A, A, A'}

In [30]:
# Map the one-letter codes used in the labels back to ETSModel arguments.
ets_name = {'A': 'add', 'N': None, 'M': 'mul'}

# Expand each "E, T, S" label into the list [error, trend, seasonal].
best_params = {
    comp: [ets_name[code] for code in best_params[comp].split(', ')]
    for comp in aqi_comps
}

best_params

{'carbon_monoxide': ['mul', None, 'mul'],
 'pm10': ['add', None, None],
 'pm2_5': ['add', None, 'mul'],
 'nitrogen_dioxide': ['add', None, 'add'],
 'ozone': ['add', 'add', None],
 'sulphur_dioxide': ['add', 'add', 'add']}

Model cross-validation

In [35]:
# Number of cross-validation folds.
splits = 5
# Number of observations held out for validation in each fold.
val_len = 24
tscv = TimeSeriesSplit(test_size=val_len, n_splits=splits)

In [None]:
# Seasonal cycle length; kept identical to the value used in the AICc search
# so the validated model is the same one that was selected.
SEASONAL_PERIODS = 24
# Folds whose MAPE exceeds this value are excluded from the averages.
# sklearn reports MAPE as a fraction, so 5 corresponds to 500 %.
MAPE_OUTLIER_THRESHOLD = 5


def _mean_or_nan(scores):
    """Return the mean of the finite values in `scores`, or NaN if there are none."""
    finite = [score for score in scores if np.isfinite(score)]
    return np.mean(finite) if finite else np.nan


mape_list = {}      # component -> mean MAPE over the retained folds
mase_list = {}      # component -> mean MASE over the retained folds
skipped_folds = {}  # component -> number of folds dropped by the MAPE filter

for comp in aqi_comps:
    mape_scores = []
    mase_scores = []
    skipped = 0
    e, t, s = best_params[comp]

    for train_index, val_index in tscv.split(aqi_df):
        train, val = aqi_df[comp].iloc[train_index], aqi_df[comp].iloc[val_index]
        model = ETSModel(train, error=e, trend=t, seasonal=s, damped_trend=False,
                         seasonal_periods=SEASONAL_PERIODS)
        fit = model.fit(disp=False)

        forecast = fit.forecast(steps=len(val))

        mape = mean_absolute_percentage_error(val, forecast)
        if mape > MAPE_OUTLIER_THRESHOLD:
            # NOTE(review): dropping the worst folds biases both averages
            # downwards; the count is recorded so the filter is at least visible.
            skipped += 1
            continue
        mae = mean_absolute_error(val, forecast)
        # Naive prediction: use value at t to guess value at t + len(val)
        naive_mae = np.mean(np.abs(train.iloc[-len(val):].values - val.values))
        # A zero naive error makes the ratio undefined; record NaN rather than inf.
        mase = mae / naive_mae if naive_mae > 0 else np.nan
        mape_scores.append(mape)
        mase_scores.append(mase)

    skipped_folds[comp] = skipped
    if skipped:
        print(f"{comp}: excluded {skipped} of {splits} folds with MAPE > {MAPE_OUTLIER_THRESHOLD}")

    # Guard against every fold having been excluded (np.mean([]) warns and returns NaN).
    mape_list[comp] = _mean_or_nan(mape_scores)
    mase_list[comp] = _mean_or_nan(mase_scores)

In [38]:
# Assemble the per-component scores with one column per metric, then
# transpose so that each metric is a row and each component a column.
metrics_by_comp = pd.DataFrame({'mape': mape_list, 'mase': mase_list})
results = metrics_by_comp.transpose()
results

Unnamed: 0,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
mape,1.030722,0.953096,0.752392,0.943897,3.175382,0.48839
mase,2.627369,1.136648,0.826997,0.808165,8.927829,0.809785


Fit model on full training data

In [34]:
# Make sure the output directory exists before the (slow) fits start, so a
# missing folder cannot make `fit.save` fail after the work is done.
Path("models/ets").mkdir(parents=True, exist_ok=True)

for comp in aqi_comps:
    e, t, s = best_params[comp]
    # Use the same seasonal_periods / damped_trend settings as the AICc search
    # so the saved model is the configuration that was actually selected.
    model = ETSModel(aqi_df[comp], error=e, trend=t, seasonal=s, damped_trend=False, seasonal_periods=24)
    fit = model.fit(disp=False)
    # Persist the fitted results for this component as a pickle file.
    fit.save(f"models/ets/{comp}.pickle")