In [24]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from pandas.plotting import register_matplotlib_converters
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.vector_ar.var_model import VAR
from kneed import KneeLocator
import plotly.express as px
register_matplotlib_converters()

### Read CSV files

In [25]:
cleaned_air_df = pd.read_csv("data/processed/cleaned/cleaned_air.csv", index_col=0, parse_dates=True)
interpolated_air_df = pd.read_csv("data/processed/interpolated/interpolated_air.csv", index_col=0, parse_dates=True)
cleaned_weather_df = pd.read_csv("data/processed/cleaned/cleaned_weather.csv", index_col=0, parse_dates=True)

Data overview

In [26]:
print(cleaned_air_df.info(verbose=False))

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 34581 entries, 2020-12-01 00:00:00 to 2024-12-01 00:00:00
Columns: 6 entries, co to pm10
dtypes: float64(6)
memory usage: 1.8 MB
None


In [27]:
print(interpolated_air_df.info(verbose=False))

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 35065 entries, 2020-12-01 00:00:00 to 2024-12-01 00:00:00
Columns: 6 entries, co to pm10
dtypes: float64(6)
memory usage: 1.9 MB
None


In [28]:
print(cleaned_air_df.describe().transpose())

         count         mean          std     min     25%      50%      75%  \
co     34581.0  1850.810597  1675.463381  327.11  867.84  1255.04  2082.82   
no2    34581.0    49.156359    25.992303    4.41   32.22    43.87    60.32   
o3     34581.0    32.711899    53.455816    0.00    0.10     8.23    43.99   
so2    34581.0    40.428523    21.966358    2.41   25.99    34.81    48.64   
pm2_5  34581.0   111.864542   101.973397    2.79   42.79    78.07   143.83   
pm10   34581.0   128.664462   114.101433    3.49   52.77    90.89   162.63   

            max  
co     14312.74  
no2      260.47  
o3       583.65  
so2      267.03  
pm2_5    821.58  
pm10     926.83  


In [29]:
print(cleaned_weather_df.describe().transpose())

                        count         mean        std    min     25%     50%  \
temperature_2m        35065.0    24.176618   5.611789    6.4    20.4    25.1   
relative_humidity_2m  35065.0    79.155996  14.548216   21.0    69.0    82.0   
dew_point_2m          35065.0    20.058605   6.307193   -7.9    16.2    22.3   
precipitation         35065.0     0.249306   1.054648    0.0     0.0     0.0   
surface_pressure      35065.0  1009.479512   7.366849  982.2  1003.4  1009.0   
cloud_cover           35065.0    73.038158  37.178421    0.0    42.0    99.0   
wind_speed_10m        35065.0     9.573344   4.661646    0.0     6.0     9.0   
wind_direction_10m    35065.0   142.892999  97.698006    1.0    67.0   137.0   

                         75%     max  
temperature_2m          28.1    39.0  
relative_humidity_2m    92.0   100.0  
dew_point_2m            25.1    29.1  
precipitation            0.1    32.8  
surface_pressure      1015.1  1032.9  
cloud_cover            100.0   100.0  
wind_s

In [30]:
print(interpolated_air_df.describe().transpose())

         count         mean          std     min     25%      50%      75%  \
co     35065.0  1849.898941  1667.904322  327.11  881.20  1268.39  2082.82   
no2    35065.0    49.097094    25.845598    4.41   32.22    43.87    59.63   
o3     35065.0    32.354613    53.191962    0.00    0.10     7.96    43.63   
so2    35065.0    40.457661    21.858741    2.41   26.23    34.81    48.64   
pm2_5  35065.0   112.014047   102.003172    2.79   42.92    78.33   143.89   
pm10   35065.0   128.807696   114.093949    3.49   52.98    91.21   162.70   

            max  
co     14312.74  
no2      260.47  
o3       583.65  
so2      267.03  
pm2_5    821.58  
pm10     926.83  


### Line plots for each attribute in air data

In [31]:
def savePlotAttributes(df, dir, resample_mode=None):
    os.makedirs(dir, exist_ok=True)
    
    for attribute in df.columns:
        fig, ax = plt.subplots(figsize=(16, 6))
        
        if resample_mode:
            data = df[attribute].resample(resample_mode).mean()
        else:
            data = df[attribute]

        ax.plot(data)
        ax.set_title(f'{attribute} (Single Location)', fontsize=16)

        for year in range(2021, 2025):
            ax.axvline(datetime(year, 1, 1), linestyle='--', color='k', alpha=0.5)

        file_name = os.path.join(dir, f"{attribute}.png")
        plt.tight_layout()
        plt.savefig(file_name, format="png", dpi=300, bbox_inches="tight")
        plt.close(fig)
        print(f"Saved plot: {file_name}")

In [32]:
savePlotAttributes(cleaned_air_df, 'plots/air/cleaned/hourly_air_measurements')
savePlotAttributes(cleaned_weather_df, 'plots/weather/cleaned/hourly_weather_measurements')

Saved plot: plots/air/cleaned/hourly_air_measurements/co.png
Saved plot: plots/air/cleaned/hourly_air_measurements/no2.png
Saved plot: plots/air/cleaned/hourly_air_measurements/o3.png
Saved plot: plots/air/cleaned/hourly_air_measurements/so2.png
Saved plot: plots/air/cleaned/hourly_air_measurements/pm2_5.png
Saved plot: plots/air/cleaned/hourly_air_measurements/pm10.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/temperature_2m.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/relative_humidity_2m.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/dew_point_2m.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/precipitation.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/surface_pressure.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/cloud_cover.png
Saved plot: plots/weather/cleaned/hourly_weather_measurements/wind_speed_10m.png
Saved plot: plots/weather/cleaned/hourly_weather_measureme

In [33]:
savePlotAttributes(interpolated_air_df, 'plots/air/interpolated/hourly_air_measurements')

Saved plot: plots/air/interpolated/hourly_air_measurements/co.png
Saved plot: plots/air/interpolated/hourly_air_measurements/no2.png
Saved plot: plots/air/interpolated/hourly_air_measurements/o3.png
Saved plot: plots/air/interpolated/hourly_air_measurements/so2.png
Saved plot: plots/air/interpolated/hourly_air_measurements/pm2_5.png
Saved plot: plots/air/interpolated/hourly_air_measurements/pm10.png


### Save ACF and PACF plots for each attribute in weather data and air data

In [34]:
def savePlotACFandPACF(df, dir, acf_lags=25, pacf_lags=25, resample_mode=None):
  
    os.makedirs(dir, exist_ok=True)
    
  
    for attribute in df.columns:

        fig, axes = plt.subplots(1, 2, figsize=(16, 6), dpi=80)
        fig.suptitle(f"ACF and PACF for {attribute}: ", fontsize=20, y=0.95)

        if resample_mode:

            data = df[attribute].resample(resample_mode).mean()
        else:
            data = df[attribute]


        if data.empty:
            print(f"Skipping {attribute}: No valid data available after dropping NaNs.")
            plt.close(fig)
            continue

        plot_acf(data, ax=axes[0], lags=acf_lags, title=f'{attribute} ACF')
        
        plot_pacf(data, ax=axes[1], lags=pacf_lags, title=f'{attribute} PACF')
            
        plt.tight_layout(rect=[0, 0, 1, 0.95], pad=2)
        
        file_name = os.path.join(dir, f"{attribute}_acf_and_pacf.png")
        plt.savefig(file_name, format="png", dpi=300, bbox_inches="tight")
        plt.close(fig)
        print(f"Saved ACF/PACF plot for {attribute}")


In [35]:
savePlotACFandPACF(cleaned_air_df, 'plots/air/cleaned/acf_and_pacf', 25, 25)
savePlotACFandPACF(cleaned_weather_df, 'plots/weather/cleaned/acf_and_pacf', 25, 25)

Saved ACF/PACF plot for co
Saved ACF/PACF plot for no2
Saved ACF/PACF plot for o3
Saved ACF/PACF plot for so2
Saved ACF/PACF plot for pm2_5
Saved ACF/PACF plot for pm10
Saved ACF/PACF plot for temperature_2m
Saved ACF/PACF plot for relative_humidity_2m
Saved ACF/PACF plot for dew_point_2m
Saved ACF/PACF plot for precipitation
Saved ACF/PACF plot for surface_pressure
Saved ACF/PACF plot for cloud_cover
Saved ACF/PACF plot for wind_speed_10m
Saved ACF/PACF plot for wind_direction_10m


In [36]:
savePlotACFandPACF(interpolated_air_df, 'plots/air/interpolated/acf_and_pacf', 25, 25)

Saved ACF/PACF plot for co
Saved ACF/PACF plot for no2
Saved ACF/PACF plot for o3
Saved ACF/PACF plot for so2
Saved ACF/PACF plot for pm2_5
Saved ACF/PACF plot for pm10


### Augmented Dickey-Fuller unit root test for stationarity

In [37]:
def perform_adf_test(df, resample_mode=None):
    non_stationary_list = []

    for attribute in df.columns:
        if resample_mode:
            data = df[attribute].resample(resample_mode).mean()
        else:
            data = df[attribute]
        
        if len(data) < 20: 
            print(f"{attribute:<10} | N/A        | Insufficient Data")
            continue

        try:
            result = adfuller(data)
            p_value = result[1]
            
            is_stationary = p_value < 0.05
            status = "Stationary" if is_stationary else "Non-Stationary"
            
            if not is_stationary:
                non_stationary_list.append(attribute)
                
        except Exception as e:
            print(f"{attribute:<10} | Error      | {e}")
    
    if non_stationary_list:
        print('\nNon-stationary attribute(s) found:')
        print(non_stationary_list)
    else:
        print('\nAll attributes are stationary.')


In [38]:
perform_adf_test(cleaned_air_df)


All attributes are stationary.


In [39]:
perform_adf_test(cleaned_weather_df)


All attributes are stationary.


Since ADF test for time series with gaps might be inaccurate, will also test on interpolated data

In [40]:
perform_adf_test(interpolated_air_df)


All attributes are stationary.


### Correlation matrix between air attributes

In [41]:
def plotCorrelationMatrix(df1, df2, title):
    df1 = df1.reset_index().set_index(['time'])
    df2 = df2.reset_index().set_index(['time'])
    correlation_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns)
    for attr1 in df1.columns:
        for attr2 in df2.columns:
            correlation_matrix.loc[attr1, attr2] = df1[attr1].corr(df2[attr2])
    fig = px.imshow(
    correlation_matrix,
    labels=dict(color="Correlation"),
    color_continuous_scale='PRGn',
    title=title
    )
    fig.show()

In [42]:
interpolated_air_df.index.name = 'time'
cleaned_weather_df.index.name = 'time'


plotCorrelationMatrix(cleaned_weather_df, interpolated_air_df, "Interpolated data correlation")

### STL Decomposition + Residuals plot for outlier detection

In [43]:
def saveResidualsAttributes(df, dir, resample_mode=None):
    os.makedirs(dir, exist_ok=True)

    for attribute in df.columns:
        fig, ax = plt.subplots(figsize=(12, 6))
        fig.suptitle(f'{attribute} Residuals', fontsize=20, y=0.95)
        
        if resample_mode:
            data = df[attribute].resample(resample_mode).mean()
        else:
            data = df[attribute]

        if len(data) < 24 * 2:
            print(f"Skipping {attribute}: Insufficient data for STL decomposition.")
            plt.close(fig)
            continue
            
        try:
            stl = STL(data, period=24) 
            res = stl.fit()
            
            ax.plot(res.resid.index, res.resid.values, color='black', linestyle='--', linewidth=0.5)
            ax.set_title(f'{attribute} Residuals', fontsize=16)
            
            for year in range(2022, 2026):
                ax.axvline(datetime(year, 1, 1), linestyle='--', color='k', alpha=0.5)
            
            plt.tight_layout(rect=[0, 0, 1, 0.95])
            
            file_name = os.path.join(dir, f"{attribute}_residuals.png")
            plt.savefig(file_name, format="png", dpi=300, bbox_inches="tight")
            plt.close(fig)
            print(f"Saved residuals plot for {attribute}")
            
        except Exception as e:
            print(f"Error decomposing {attribute}: {e}")
            plt.close(fig)

In [44]:
saveResidualsAttributes(interpolated_air_df, "plots/air/interpolated/residual")

Saved residuals plot for co
Saved residuals plot for no2
Saved residuals plot for o3
Saved residuals plot for so2
Saved residuals plot for pm2_5
Saved residuals plot for pm10


In [45]:
saveResidualsAttributes(cleaned_air_df, "plots/air/cleaned/residual")

Saved residuals plot for co
Saved residuals plot for no2
Saved residuals plot for o3
Saved residuals plot for so2
Saved residuals plot for pm2_5
Saved residuals plot for pm10


In [46]:
saveResidualsAttributes(cleaned_weather_df, "plots/weather/cleaned/residual")

Saved residuals plot for temperature_2m
Saved residuals plot for relative_humidity_2m
Saved residuals plot for dew_point_2m
Saved residuals plot for precipitation
Saved residuals plot for surface_pressure
Saved residuals plot for cloud_cover
Saved residuals plot for wind_speed_10m
Saved residuals plot for wind_direction_10m


Multivariate Granger Causality Test and Optimal Lag

In [47]:
def granger_causality_and_optimal_lag(weather_df, air_df, air_attributes, weather_attributes, max_lag):

    data = pd.concat([weather_df, air_df], axis=1)
    
    model = VAR(data)
    results = model.fit(maxlags=max_lag)
    
    causality = results.test_causality(air_attributes, weather_attributes)
    
    if causality.conclusion == 'fail to reject':
        print(f'Failed to reject H0 (Weather does NOT cause Air Quality)')
    else:
        print('Weather variables are Granger-causal for air variables')
        
    lag_order_results = model.select_order(maxlags=max_lag)
    hqic_values = lag_order_results.ics['hqic']
    lags = range(len(hqic_values))
    
    knee_locator = KneeLocator(lags, hqic_values, curve="convex", direction="decreasing")
    optimal_lag = knee_locator.knee
    
    if optimal_lag is None:
        optimal_lag = lag_order_results.selected_orders['hqic']

    print(f'Optimal lag: {optimal_lag}')

air_attributes = interpolated_air_df.columns[1:].values
weather_attributes = cleaned_weather_df.columns[1:].values
granger_causality_and_optimal_lag(cleaned_weather_df, interpolated_air_df, air_attributes, weather_attributes, max_lag=25)


No frequency information was provided, so inferred frequency h will be used.



Weather variables are Granger-causal for air variables
Optimal lag: 2
