In [18]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.statespace.sarimax import SARIMAX, SARIMAXResults

Load AQI data

In [19]:
# Read the cleaned air-quality table and index it by its parsed 'time' column.
aqi_df = (
    pd.read_csv('data/processed/cleaned/cleaned_air.csv')
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .set_index('time')
)
# Declare the index as hourly so downstream time-series code sees a regular frequency.
aqi_df.index.freq = "h"

Load extra weather data

In [20]:
# Read the cleaned weather table and index it by its parsed 'time' column.
weather_df = (
    pd.read_csv('data/processed/cleaned/cleaned_weather.csv')
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .set_index('time')
)
# Declare the index as hourly so downstream time-series code sees a regular frequency.
weather_df.index.freq = "h"

ARIMA order parameters (p, d, q), obtained from the EDA in data_exploration.ipynb

In [21]:
# ARIMA order passed to SARIMAX as order=(p, d, q):
# autoregressive lags, differencing degree, moving-average lags.
p, d, q = 2, 0, 0

Model cross-validation

In [22]:
# Every column of the AQI frame is a separate series to be modelled.
aqi_comps = aqi_df.columns

# Time-ordered cross-validation: 5 folds, each validated on the 24 rows
# that immediately follow its training window.
splits, val_len = 5, 24
tscv = TimeSeriesSplit(test_size=val_len, n_splits=splits)

In [23]:
# Folds whose MAPE exceeds this value are left out of the averages.
# sklearn reports MAPE as a fraction, so 5 corresponds to 500 %.
# NOTE(review): presumably this guards against MAPE blowing up when observed
# values are close to zero — confirm that dropping those folds is intended.
MAPE_OUTLIER_THRESHOLD = 5

# Per-component averages over the folds that were kept.
mape_list = {}
mase_list = {}
# How many folds actually contributed to each average, so that discarded
# folds are visible instead of being dropped silently.
kept_folds = {}

for comp in aqi_comps:
    mape_scores = []
    mase_scores = []

    for train_index, val_index in tscv.split(aqi_df):
        train, val = aqi_df[comp].iloc[train_index], aqi_df[comp].iloc[val_index]
        # Exogenous rows are selected by position, i.e. weather_df is expected
        # to be row-aligned with aqi_df.
        ex_train, ex_val = weather_df.iloc[train_index], weather_df.iloc[val_index]
        model = SARIMAX(train, exog=ex_train, order=(p, d, q))
        # disp=False keeps the optimizer's convergence log out of the cell output.
        fit = model.fit(disp=False)

        forecast = fit.forecast(steps=len(val), exog=ex_val)

        mape = mean_absolute_percentage_error(val, forecast)
        if mape > MAPE_OUTLIER_THRESHOLD:
            continue
        mae = mean_absolute_error(val, forecast)
        # Naive baseline: use the value at t to guess the value at t + len(val),
        # i.e. replay the last len(val) training observations as the forecast.
        naive_mae = np.mean(np.abs(train.iloc[-len(val):].values - val.values))
        mase = mae / naive_mae
        mape_scores.append(mape)
        mase_scores.append(mase)

    kept_folds[comp] = len(mape_scores)
    # np.mean([]) returns NaN with a RuntimeWarning; make the "no usable fold"
    # case an explicit NaN instead.
    mape_list[comp] = np.mean(mape_scores) if mape_scores else np.nan
    mase_list[comp] = np.mean(mase_scores) if mase_scores else np.nan



In [24]:
# One row per metric, one column per AQI component.
results = pd.DataFrame({'mape': mape_list, 'mase': mase_list}).transpose()
results

Unnamed: 0,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
mape,0.351561,0.808655,0.711701,0.822918,0.499016,0.377029
mase,1.275577,0.783797,0.825626,1.001408,1.603136,0.953055


Fit one model per pollutant on the full dataset and save it

In [25]:
# fit.save() does not create missing directories, so make sure the target
# folder exists before the first model is written (fresh checkout / clean run).
Path("models/arimax").mkdir(parents=True, exist_ok=True)

# Fit one model per AQI component on the whole series, with the weather frame
# as exogenous regressors, and pickle each fitted result under the component's name.
for comp in aqi_comps:
    model = SARIMAX(aqi_df[comp], exog=weather_df, order=(p, d, q))
    # disp=False keeps the optimizer's convergence log out of the cell output.
    fit = model.fit(disp=False)
    fit.save(f"models/arimax/{comp}.pickle")

