In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from pandas.plotting import register_matplotlib_converters
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.vector_ar.var_model import VAR
from kneed import KneeLocator
import plotly.express as px
register_matplotlib_converters()

### Read CSV files

In [2]:
# Load both cleaned datasets; the first CSV column becomes a parsed datetime index.
cleaned_air_df = pd.read_csv(
    "data/processed/cleaned/cleaned_air.csv",
    parse_dates=True,
    index_col=0,
)
cleaned_weather_df = pd.read_csv(
    "data/processed/cleaned/cleaned_weather.csv",
    parse_dates=True,
    index_col=0,
)

### Data overview

In [3]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
cleaned_air_df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 29497 entries, 2022-08-04 00:00:00 to 2025-12-15 00:00:00
Columns: 6 entries, carbon_monoxide to sulphur_dioxide
dtypes: float64(6)
memory usage: 1.6 MB
None


In [4]:
# Summary statistics, one row per air-quality attribute.
print(cleaned_air_df.describe().T)

                    count        mean         std   min    25%    50%    75%  \
carbon_monoxide   29497.0  715.596773  476.466105  59.0  428.0  577.0  829.0   
pm10              29497.0   57.491555   33.745879   0.4   33.8   49.2   72.4   
pm2_5             29497.0   44.408394   27.159782   0.3   25.4   37.6   55.8   
nitrogen_dioxide  29497.0   26.987833   20.645734   0.0   12.3   21.4   35.3   
ozone             29497.0   73.473574   54.468030   0.0   35.0   58.0   99.0   
sulphur_dioxide   29497.0   25.576798   12.723920   0.2   16.6   23.3   31.8   

                     max  
carbon_monoxide   5115.0  
pm10               291.5  
pm2_5              232.8  
nitrogen_dioxide   161.0  
ozone              383.0  
sulphur_dioxide    104.6  


In [5]:
# Summary statistics, one row per weather attribute.
print(cleaned_weather_df.describe().T)

                        count         mean         std    min     25%     50%  \
temperature_2m        29497.0    24.349595    5.370000    7.2    20.8    25.2   
relative_humidity_2m  29497.0    79.202190   14.582084   21.0    70.0    82.0   
dew_point_2m          29497.0    20.233668    6.140256   -4.6    16.5    22.6   
precipitation         29497.0     0.272685    1.142549    0.0     0.0     0.0   
surface_pressure      29497.0  1009.563735    7.405214  982.2  1003.6  1009.2   
cloud_cover           29497.0    72.550056   38.175740    0.0    38.0    99.0   
wind_speed_10m        29497.0     9.173455    4.595286    0.0     5.8     8.7   
wind_direction_10m    29497.0   143.776418  100.225643    1.0    64.0   135.0   

                         75%     max  
temperature_2m          28.1    39.0  
relative_humidity_2m    92.0   100.0  
dew_point_2m            25.1    29.1  
precipitation            0.1    32.8  
surface_pressure      1015.2  1032.9  
cloud_cover            100.0   100.0

### Line plots for each attribute in air and weather data

In [6]:
def savePlotAttributes(df, dir, resample_mode=None):
    """Save one line plot per column of ``df`` as ``<dir>/<column>.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one figure per column.
    dir : str
        Output directory; created (with parents) if it does not exist.
    resample_mode : str, optional
        When given, each column is resampled with this rule and the mean of
        every bin is plotted instead of the raw values.

    Notes
    -----
    When the plotted series has a ``DatetimeIndex``, a dashed vertical line is
    drawn at every 1 January that lies inside the plotted time span. The
    markers are derived from the data rather than from a fixed list of years,
    so they stay correct for any date range and never fall outside the data
    (which would otherwise stretch the x-axis).
    """
    os.makedirs(dir, exist_ok=True)

    for attribute in df.columns:
        series = df[attribute]
        if resample_mode:
            series = series.resample(resample_mode).mean()

        # Create the figure only once the data is ready, so a failing
        # resample does not leave an open figure behind.
        fig, ax = plt.subplots(figsize=(16, 6))
        ax.plot(series)
        ax.set_title(f'{attribute} (Single Location)', fontsize=16)

        time_index = series.index
        if isinstance(time_index, pd.DatetimeIndex) and len(time_index) > 0:
            first, last = time_index.min(), time_index.max()
            if pd.notna(first) and pd.notna(last):
                # 'YS' = year start; normalising the start keeps the markers at midnight.
                for year_start in pd.date_range(first.normalize(), last, freq='YS'):
                    ax.axvline(year_start, linestyle='--', color='k', alpha=0.5)

        file_name = os.path.join(dir, f"{attribute}.png")
        # Operate on the explicit figure instead of pyplot's implicit "current" one.
        fig.tight_layout()
        fig.savefig(file_name, format="png", dpi=300, bbox_inches="tight")
        plt.close(fig)
        print(f"Saved plot: {file_name}")

In [7]:
# Line plots of the raw (non-resampled) series, air first, then weather.
savePlotAttributes(df=cleaned_air_df, dir='plots/air/cleaned/hourly_air_measurements')
savePlotAttributes(df=cleaned_weather_df, dir='plots/weather/cleaned/hourly_weather_measurements')

Saved plot: plots/air/cleaned/hourly_air_measurements/carbon_monoxide.png
Saved plot: plots/air/cleaned/hourly_air_measurements/pm10.png
Saved plot: plots/air/cleaned/hourly_air_measurements/pm2_5.png
Saved plot: plots/air/cleaned/hourly_air_measurements/nitrogen_dioxide.png
Saved plot: plots/air/cleaned/hourly_air_measurements/ozone.png
Saved plot: plots/air/cleaned/hourly_air_measurements/sulphur_dioxide.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/temperature_2m.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/relative_humidity_2m.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/dew_point_2m.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/precipitation.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/surface_pressure.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/cloud_cover.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/wind_speed_10m.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/wind_direction_10m.png

### Save ACF and PACF plots for each attribute in weather data and air data

In [8]:
def savePlotACFandPACF(df, dir, acf_lags=25, pacf_lags=25, resample_mode=None):
    """Save a side-by-side ACF/PACF figure for every column of ``df``.

    Each figure is written to ``<dir>/<column>_acf_and_pacf.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are analysed one at a time.
    dir : str
        Output directory; created (with parents) if it does not exist.
    acf_lags, pacf_lags : int, optional
        Number of lags passed to ``plot_acf`` and ``plot_pacf`` respectively.
    resample_mode : str, optional
        When given, each column is resampled with this rule and averaged per
        bin before the correlations are computed.

    Notes
    -----
    Missing values are dropped before plotting, and a column with no values
    left is skipped with a message. Exceptions raised by the plotting calls
    propagate to the caller, but the figure is always closed first.
    """
    os.makedirs(dir, exist_ok=True)

    for attribute in df.columns:
        series = df[attribute]
        if resample_mode:
            series = series.resample(resample_mode).mean()

        # Resampling can produce empty (NaN) bins; the skip message below
        # promises that NaNs were dropped, so actually drop them here.
        series = series.dropna()

        if series.empty:
            print(f"Skipping {attribute}: No valid data available after dropping NaNs.")
            continue

        # The figure is created only for columns that will really be plotted.
        fig, axes = plt.subplots(1, 2, figsize=(16, 6), dpi=80)
        try:
            fig.suptitle(f"ACF and PACF for {attribute}: ", fontsize=20, y=0.95)
            plot_acf(series, ax=axes[0], lags=acf_lags, title=f'{attribute} ACF')
            plot_pacf(series, ax=axes[1], lags=pacf_lags, title=f'{attribute} PACF')

            # Leave the top 5% of the canvas free for the suptitle.
            fig.tight_layout(rect=[0, 0, 1, 0.95], pad=2)

            file_name = os.path.join(dir, f"{attribute}_acf_and_pacf.png")
            fig.savefig(file_name, format="png", dpi=300, bbox_inches="tight")
        finally:
            # Close even when plotting fails so figures do not pile up in memory.
            plt.close(fig)
        print(f"Saved ACF/PACF plot for {attribute}")


In [9]:
# ACF/PACF figures for both datasets, 25 lags each.
savePlotACFandPACF(cleaned_air_df, 'plots/air/cleaned/acf_and_pacf', acf_lags=25, pacf_lags=25)
savePlotACFandPACF(cleaned_weather_df, 'plots/weather/cleaned/acf_and_pacf', acf_lags=25, pacf_lags=25)

Saved ACF/PACF plot for carbon_monoxide
Saved ACF/PACF plot for pm10
Saved ACF/PACF plot for pm2_5
Saved ACF/PACF plot for nitrogen_dioxide
Saved ACF/PACF plot for ozone
Saved ACF/PACF plot for sulphur_dioxide
Saved ACF/PACF plot for temperature_2m
Saved ACF/PACF plot for relative_humidity_2m
Saved ACF/PACF plot for dew_point_2m
Saved ACF/PACF plot for precipitation
Saved ACF/PACF plot for surface_pressure
Saved ACF/PACF plot for cloud_cover
Saved ACF/PACF plot for wind_speed_10m
Saved ACF/PACF plot for wind_direction_10m


### Augmented Dickey-Fuller unit root test for stationarity

In [10]:
def perform_adf_test(df, resample_mode=None):
    """Run the Augmented Dickey-Fuller test on every column and print a summary.

    A column is treated as stationary when the ADF p-value is below 0.05.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are tested one at a time.
    resample_mode : str, optional
        When given, each column is resampled with this rule and averaged per
        bin before testing.

    Notes
    -----
    Missing values are dropped before testing. Columns with fewer than 20
    remaining observations, or for which ``adfuller`` raises, are reported as
    untested. The final "all stationary" message is only printed for columns
    that were really tested, so a skipped column is never silently counted as
    stationary. Nothing is returned; results are printed.
    """
    non_stationary_list = []
    untested = []
    tested_count = 0

    for attribute in df.columns:
        series = df[attribute]
        if resample_mode:
            series = series.resample(resample_mode).mean()
        # Empty resample bins show up as NaN; remove them before testing.
        series = series.dropna()

        if len(series) < 20:
            print(f"{attribute:<10} | N/A        | Insufficient Data")
            untested.append(attribute)
            continue

        try:
            p_value = adfuller(series)[1]
        except Exception as e:
            # Best effort: report the failure and keep going with the other columns.
            print(f"{attribute:<10} | Error      | {e}")
            untested.append(attribute)
            continue

        tested_count += 1
        # Written as "not <" so that a NaN p-value counts as non-stationary.
        if not p_value < 0.05:
            non_stationary_list.append(attribute)

    if untested:
        print('\nAttribute(s) that could not be tested:')
        print(untested)

    if non_stationary_list:
        print('\nNon-stationary attribute(s) found:')
        print(non_stationary_list)
    elif not untested:
        print('\nAll attributes are stationary.')
    elif tested_count:
        print('\nAll tested attributes are stationary.')


In [11]:
# Stationarity check on the air-quality series at their native resolution.
perform_adf_test(df=cleaned_air_df)


All attributes are stationary.


In [12]:
# Stationarity check on the weather series at their native resolution.
perform_adf_test(df=cleaned_weather_df)


All attributes are stationary.


### Correlation matrix between weather and air attributes

In [13]:
def plotCorrelationMatrix(df1, df2, title):
    """Show a heatmap of the correlation between every column pair of two frames.

    Rows of the heatmap are the columns of ``df1`` and columns of the heatmap
    are the columns of ``df2``. Each cell is ``df1[a].corr(df2[b])``, which
    aligns the two series on their index before correlating.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare. Rows are matched on the index; if a frame carries
        a ``time`` column, that column is used as its index instead.
    title : str
        Figure title.
    """
    def _time_indexed(frame):
        # No longer requires the caller to have named the index 'time':
        # only re-index when 'time' is an ordinary column.
        if 'time' in frame.columns:
            return frame.set_index('time')
        return frame

    df1 = _time_indexed(df1)
    df2 = _time_indexed(df2)

    # Build the matrix as floats; filling an empty frame cell by cell would
    # leave it with object dtype.
    correlation_matrix = pd.DataFrame(
        [[df1[attr1].corr(df2[attr2]) for attr2 in df2.columns] for attr1 in df1.columns],
        index=df1.columns,
        columns=df2.columns,
        dtype=float,
    )

    fig = px.imshow(
        correlation_matrix,
        labels=dict(color="Correlation"),
        color_continuous_scale='PRGn',
        # Centre the diverging palette on 0 so colour encodes the sign of the correlation.
        color_continuous_midpoint=0,
        title=title,
    )
    fig.show()

In [14]:
# Give both frames a consistently named `time` index before correlating them.
cleaned_air_df.index.name = 'time'
cleaned_weather_df.index.name = 'time'

# Title typo fixed ("Clened" -> "Cleaned"); it is rendered on the figure.
plotCorrelationMatrix(cleaned_weather_df, cleaned_air_df, "Cleaned air and weather data correlation")

### STL Decomposition + Residuals plot for outlier detection

In [15]:
def saveResidualsAttributes(df, dir, resample_mode=None, period=24):
    """Save a plot of the STL residual component for every column of ``df``.

    Each figure is written to ``<dir>/<column>_residuals.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are decomposed one at a time.
    dir : str
        Output directory; created (with parents) if it does not exist.
    resample_mode : str, optional
        When given, each column is resampled with this rule and averaged per
        bin before decomposition.
    period : int, optional
        Seasonal period, in observations, handed to ``STL``. Defaults to 24,
        the value that used to be hard-coded. Pass a different value when
        ``resample_mode`` changes the sampling rate.

    Notes
    -----
    Columns with fewer than two full periods of data are skipped. Errors
    raised while decomposing or plotting a column are printed and the loop
    continues with the next column. Year-start markers are drawn only for a
    ``DatetimeIndex`` and only where they fall inside the plotted span.
    """
    os.makedirs(dir, exist_ok=True)

    for attribute in df.columns:
        series = df[attribute]
        if resample_mode:
            series = series.resample(resample_mode).mean()

        # Require at least two complete seasonal cycles.
        if len(series) < 2 * period:
            print(f"Skipping {attribute}: Insufficient data for STL decomposition.")
            continue

        # The figure is created only for columns that will really be decomposed.
        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            res = STL(series, period=period).fit()
            residuals = res.resid

            ax.plot(residuals.index, residuals.values, color='black', linestyle='--', linewidth=0.5)
            # One title only: the axes title used to repeat the same text.
            fig.suptitle(f'{attribute} Residuals', fontsize=20, y=0.95)

            time_index = residuals.index
            if isinstance(time_index, pd.DatetimeIndex) and len(time_index) > 0:
                first, last = time_index.min(), time_index.max()
                if pd.notna(first) and pd.notna(last):
                    # 'YS' = year start; markers come from the data, not a fixed year list.
                    for year_start in pd.date_range(first.normalize(), last, freq='YS'):
                        ax.axvline(year_start, linestyle='--', color='k', alpha=0.5)

            # Leave the top 5% of the canvas free for the suptitle.
            fig.tight_layout(rect=[0, 0, 1, 0.95])

            file_name = os.path.join(dir, f"{attribute}_residuals.png")
            fig.savefig(file_name, format="png", dpi=300, bbox_inches="tight")
            print(f"Saved residuals plot for {attribute}")
        except Exception as e:
            # Best effort: report the failure and continue with the next column.
            print(f"Error decomposing {attribute}: {e}")
        finally:
            plt.close(fig)

In [16]:
# STL residual plots for the air-quality attributes.
saveResidualsAttributes(df=cleaned_air_df, dir="plots/air/cleaned/residual")

Saved residuals plot for carbon_monoxide
Saved residuals plot for pm10
Saved residuals plot for pm2_5
Saved residuals plot for nitrogen_dioxide
Saved residuals plot for ozone
Saved residuals plot for sulphur_dioxide


In [17]:
# STL residual plots for the weather attributes.
saveResidualsAttributes(df=cleaned_weather_df, dir="plots/weather/cleaned/residual")

Saved residuals plot for temperature_2m
Saved residuals plot for relative_humidity_2m
Saved residuals plot for dew_point_2m
Saved residuals plot for precipitation
Saved residuals plot for surface_pressure
Saved residuals plot for cloud_cover
Saved residuals plot for wind_speed_10m
Saved residuals plot for wind_direction_10m


### Multivariate Granger causality test and optimal lag

In [18]:
def granger_causality_and_optimal_lag(weather_df, air_df, air_attributes, weather_attributes, max_lag):

    data = pd.concat([weather_df, air_df], axis=1)
    
    model = VAR(data)
    results = model.fit(maxlags=max_lag)
    
    causality = results.test_causality(air_attributes, weather_attributes)
    
    if causality.conclusion == 'fail to reject':
        print(f'Failed to reject H0 (Weather does NOT cause Air Quality)')
    else:
        print('Weather variables are Granger-causal for air variables')
        
    lag_order_results = model.select_order(maxlags=max_lag)
    hqic_values = lag_order_results.ics['hqic']
    lags = range(len(hqic_values))
    
    knee_locator = KneeLocator(lags, hqic_values, curve="convex", direction="decreasing")
    optimal_lag = knee_locator.knee
    
    if optimal_lag is None:
        optimal_lag = lag_order_results.selected_orders['hqic']

    print(f'Optimal lag: {optimal_lag}')

# Use every column of both frames. The timestamp is already the index, so the
# previous `columns[1:]` slice silently left the first real attribute of each
# frame (carbon_monoxide, temperature_2m) out of the causality test.
air_attributes = cleaned_air_df.columns.tolist()
weather_attributes = cleaned_weather_df.columns.tolist()
granger_causality_and_optimal_lag(cleaned_weather_df, cleaned_air_df, air_attributes, weather_attributes, max_lag=25)


No frequency information was provided, so inferred frequency h will be used.



Weather variables are Granger-causal for air variables
Optimal lag: 2
