# VAR Model with Air Quality and Weather Data

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.vector_ar.var_model import VAR

## Load Data

In [2]:
def _read_hourly_csv(path):
    """Read a cleaned CSV whose first column is the timestamp index.

    The first column is parsed as dates and used as the index, and the index
    frequency is then set to hourly ("h").
    """
    frame = pd.read_csv(path, index_col=0, parse_dates=True)
    frame.index.freq = "h"
    return frame


aqi_df = _read_hourly_csv('data/processed/cleaned/cleaned_air.csv')
weather_df = _read_hourly_csv('data/processed/cleaned/cleaned_weather.csv')

# Place the pollutant and weather columns side by side on the shared time index.
combined_df = pd.concat([aqi_df, weather_df], axis=1)
combined_df.index.freq = "h"

print(f"Combined data shape: {combined_df.shape}")

Combined data shape: (29497, 14)


## Evaluation Metrics

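- sMAPE (Symmetric Mean Absolute Percentage Error): reported here as a fraction between 0 and 2, not as a percentage

- MASE (Mean Absolute Scaled Error): forecast MAE divided by the MAE of an h-step naive forecast on the training data; values below 1 mean a smaller error than that naive baseline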

In [3]:
def smape(actual, predicted):
    """Symmetric mean absolute percentage error, returned as a fraction.

    Each term is 2 * |actual - predicted| / (|actual| + |predicted|), and the
    result is the mean of those terms. Where both values are zero the
    denominator is replaced by 1, so that term contributes 0 instead of
    dividing by zero.

    Parameters
    ----------
    actual, predicted : array-like
        Observed and forecast values of matching shape.

    Returns
    -------
    float
        Mean of the per-element symmetric errors.
    """
    y_true = np.array(actual)
    y_pred = np.array(predicted)

    scale = np.abs(y_true) + np.abs(y_pred)
    # A zero scale implies a zero numerator too; any non-zero stand-in works.
    safe_scale = np.where(scale == 0, 1, scale)

    per_point = 2 * np.abs(y_true - y_pred) / safe_scale
    return np.mean(per_point)

def mase_h_step(actual, predicted, train, h):
    """Mean Absolute Scaled Error, scaled by the h-step naive error on `train`.

    The forecast MAE is divided by the mean absolute difference between
    training observations that are `h` steps apart, i.e. the in-sample error
    of the naive forecast "the value h steps ago".

    Parameters
    ----------
    actual, predicted : array-like
        Observed and forecast values for the evaluation window.
    train : array-like
        Training history used to compute the naive scaling error. pandas
        objects, ndarrays and plain sequences are all accepted.
    h : int
        Step size of the naive forecast; must be at least 1.

    Returns
    -------
    float
        MAE / naive MAE, or NaN when the naive MAE is not positive or the
        history has no pair of observations `h` steps apart.

    Raises
    ------
    ValueError
        If `h` is smaller than 1.
    """
    # h == 0 would difference mismatched slices and a negative h would silently
    # compare the wrong observations, so reject both up front.
    if h < 1:
        raise ValueError(f"h must be a positive integer, got {h}")

    mae = mean_absolute_error(actual, predicted)

    history = np.asarray(train)
    if len(history) <= h:
        # No pair of observations h steps apart: the scale is undefined.
        return np.nan

    naive_mae = np.mean(np.abs(history[h:] - history[:-h]))
    return mae / naive_mae if naive_mae > 0 else np.nan

## Parameters

In [4]:
# Lag order passed as `maxlags` to the Air+Weather VAR fits and to both saved models.
p = 2  # VAR lag order

## Model Cross-Validation

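Score each fold over three forecast windows: the first 1, 3, and 6 hours after the end of the training data. Each score averages the errors over every hour in the window, not just its final hour.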

In [5]:
# Forecast windows (in hours) to score; each validation fold holds the longest one.
horizons = [1, 3, 6]
max_horizon = max(horizons)

# `all_cols` labels the forecast columns; `aqi_cols` are the columns that get scored.
aqi_cols = aqi_df.columns
all_cols = combined_df.columns

# Each of the `splits` folds validates on `max_horizon` consecutive rows and
# trains on the rows that precede them.
splits = 5
tscv = TimeSeriesSplit(n_splits=splits, test_size=max_horizon)

In [6]:
# Score lists keyed by horizon, then pollutant, then metric; one value is
# appended per cross-validation fold.
results_by_horizon = {
    h: {comp: {'smape': [], 'mase': []} for comp in aqi_cols}
    for h in horizons
}

for train_index, val_index in tscv.split(combined_df):
    train = combined_df.iloc[train_index]
    val = combined_df.iloc[val_index]

    # Refit the VAR on this fold's training rows.
    fit = VAR(train).fit(maxlags=p)

    # Forecast max_horizon steps from the last p training rows and align the
    # result with the validation timestamps.
    forecast_df = pd.DataFrame(
        fit.forecast(train.values[-p:], steps=max_horizon),
        index=val.index,
        columns=all_cols,
    )

    for comp in aqi_cols:
        observed = val[comp].values
        predicted = forecast_df[comp].values
        history = train[comp]

        for h in horizons:
            # NOTE(review): each score covers the first h steps of the fold
            # (steps 1..h), not only the h-th step - confirm this is intended.
            fold_scores = results_by_horizon[h][comp]
            fold_scores['smape'].append(smape(observed[:h], predicted[:h]))
            fold_scores['mase'].append(mase_h_step(observed[:h], predicted[:h], history, h))

print("Cross-validation complete!")

Cross-validation complete!


In [7]:
print("Air+Weather VAR Results:")
for h in horizons:
    print(f"\nHorizon: {h} hour(s)")
    scores_at_h = results_by_horizon[h]
    # Average each metric over the folds; transposed so rows are metrics and
    # columns are pollutants.
    results = pd.DataFrame(
        {
            metric: {comp: np.mean(scores_at_h[comp][metric]) for comp in aqi_cols}
            for metric in ('smape', 'mase')
        }
    ).T
    display(results)

Air+Weather VAR Results:

Horizon: 1 hour(s)


Unnamed: 0,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
smape,0.094932,0.097734,0.100487,0.164075,0.318437,0.033563
mase,1.889906,0.679834,0.829164,0.402509,0.336511,0.624677



Horizon: 3 hour(s)


Unnamed: 0,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
smape,0.188901,0.125054,0.129117,0.250581,0.398089,0.075814
mase,1.374418,0.380683,0.439269,0.345722,0.233837,0.676862



Horizon: 6 hour(s)


Unnamed: 0,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
smape,0.316271,0.196638,0.187117,0.327473,0.621126,0.174595
mase,1.387973,0.485281,0.571713,0.507235,0.304412,0.964967


## Compare with Air-Only VAR

In [8]:
# Air-only baseline: same splitter, horizons and metrics as the Air+Weather
# run, but the VAR is fitted on the pollutant series alone.
results_air_only = {h: {comp: {'smape': [], 'mase': []} for comp in aqi_cols} for h in horizons}

for train_index, val_index in tscv.split(aqi_df):
    train, val = aqi_df.iloc[train_index], aqi_df.iloc[val_index]
    model = VAR(train)
    # Use the shared lag order `p` rather than a literal 2, so both models in
    # the comparison always use the same number of lags.
    fit = model.fit(maxlags=p)

    # The forecast is seeded with the last p training rows, matching the lag order.
    forecast = fit.forecast(train.values[-p:], steps=max_horizon)
    forecast_df = pd.DataFrame(forecast, index=val.index, columns=aqi_cols)

    for h in horizons:
        for comp in aqi_cols:
            # Score the first h steps of the validation window.
            val_h = val[comp].iloc[:h]
            forecast_h = forecast_df[comp].iloc[:h]

            smape_val = smape(val_h.values, forecast_h.values)
            mase_val = mase_h_step(val_h.values, forecast_h.values, train[comp], h)

            results_air_only[h][comp]['smape'].append(smape_val)
            results_air_only[h][comp]['mase'].append(mase_val)

In [9]:
print("Comparison: Air+Weather VAR vs Air-Only VAR")
print("(Positive improvement % = Air+Weather is better)\n")

for h in horizons:
    print(f"\n=== Horizon: {h} hour(s) ===")
    with_weather = results_by_horizon[h]
    air_only = results_air_only[h]

    # One row per pollutant; each cell is the metric averaged over the folds.
    comparison = pd.DataFrame({
        'sMAPE (Air+Weather)': {comp: np.mean(with_weather[comp]['smape']) for comp in aqi_cols},
        'sMAPE (Air Only)': {comp: np.mean(air_only[comp]['smape']) for comp in aqi_cols},
        'MASE (Air+Weather)': {comp: np.mean(with_weather[comp]['mase']) for comp in aqi_cols},
        'MASE (Air Only)': {comp: np.mean(air_only[comp]['mase']) for comp in aqi_cols},
    })

    # Relative drop in MASE versus the air-only baseline, in percent.
    baseline_mase = comparison['MASE (Air Only)']
    comparison['MASE Improvement %'] = (baseline_mase - comparison['MASE (Air+Weather)']) / baseline_mase * 100
    display(comparison)

Comparison: Air+Weather VAR vs Air-Only VAR
(Positive improvement % = Air+Weather is better)


=== Horizon: 1 hour(s) ===


Unnamed: 0,sMAPE (Air+Weather),sMAPE (Air Only),MASE (Air+Weather),MASE (Air Only),MASE Improvement %
carbon_monoxide,0.094932,0.077661,1.889906,1.835612,-2.957851
pm10,0.097734,0.086985,0.679834,0.564792,-20.368837
pm2_5,0.100487,0.089583,0.829164,0.705806,-17.477597
nitrogen_dioxide,0.164075,0.148713,0.402509,0.507786,20.732473
ozone,0.318437,0.428206,0.336511,0.43355,22.382357
sulphur_dioxide,0.033563,0.030517,0.624677,0.56988,-9.615546



=== Horizon: 3 hour(s) ===


Unnamed: 0,sMAPE (Air+Weather),sMAPE (Air Only),MASE (Air+Weather),MASE (Air Only),MASE Improvement %
carbon_monoxide,0.188901,0.19742,1.374418,1.503874,8.608182
pm10,0.125054,0.094089,0.380683,0.22723,-67.531743
pm2_5,0.129117,0.09927,0.439269,0.276137,-59.076506
nitrogen_dioxide,0.250581,0.212119,0.345722,0.248142,-39.324287
ozone,0.398089,0.454848,0.233837,0.256402,8.800626
sulphur_dioxide,0.075814,0.076576,0.676862,0.629329,-7.552927



=== Horizon: 6 hour(s) ===


Unnamed: 0,sMAPE (Air+Weather),sMAPE (Air Only),MASE (Air+Weather),MASE (Air Only),MASE Improvement %
carbon_monoxide,0.316271,0.370951,1.387973,1.632649,14.986446
pm10,0.196638,0.169844,0.485281,0.381916,-27.064588
pm2_5,0.187117,0.175499,0.571713,0.477521,-19.725159
nitrogen_dioxide,0.327473,0.349253,0.507235,0.513318,1.184993
ozone,0.621126,0.672805,0.304412,0.392603,22.463234
sulphur_dioxide,0.174595,0.174752,0.964967,0.912246,-5.779233


## Save Both Models

Different pollutants benefit from different models (by MASE, consistently across the 1-, 3-, and 6-hour horizons):
- **Air+Weather**: Best for ozone (weather-driven)
- **Air-Only**: Best for pm10, pm2_5 (weather adds noise)

In [10]:
# Create the output directory first so the saves below also work when it does
# not exist yet (e.g. on a fresh checkout).
Path("models/var").mkdir(parents=True, exist_ok=True)

# Save Air+Weather VAR, refitted on the full combined dataset.
# NOTE: the saved files are pickles; only load them from trusted sources.
model_combined = VAR(combined_df)
fit_combined = model_combined.fit(maxlags=p)
fit_combined.save("models/var/var_air_weather.pickle")
print(f"Saved VAR (Air+Weather) with lag order {p}")

# Save Air-Only VAR, refitted on the full air-quality dataset.
model_air = VAR(aqi_df)
fit_air = model_air.fit(maxlags=p)
fit_air.save("models/var/var_air_only.pickle")
print(f"Saved VAR (Air-Only) with lag order {p}")

Saved VAR (Air+Weather) with lag order 2
Saved VAR (Air-Only) with lag order 2
