In [1]:
import pandas as pd
import os
from datetime import datetime
from pandas.plotting import register_matplotlib_converters
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.stattools import adfuller
import plotly.express as px
register_matplotlib_converters()

### Read CSV files

In [2]:
cleaned_air_df = pd.read_csv("data/processed/cleaned_air.csv", index_col=0, parse_dates=True)
interpolated_air_df = pd.read_csv("data/processed/interpolated_air.csv", index_col=0, parse_dates=True)
dropped_air_df = pd.read_csv("data/processed/dropped_air.csv", index_col=0, parse_dates=True)

Data overview

In [3]:
print(cleaned_air_df.info(verbose=False))

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 60672 entries, 2019-01-01 00:00:00 to 2025-12-07 23:00:00
Columns: 6 entries, co to so2
dtypes: float64(6)
memory usage: 3.2 MB
None


In [4]:
print(interpolated_air_df.info(verbose=False))

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 60792 entries, 2019-01-01 00:00:00 to 2025-12-07 23:00:00
Columns: 6 entries, co to so2
dtypes: float64(6)
memory usage: 3.2 MB
None


In [5]:
print(dropped_air_df.info(verbose=False))

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Columns: 6 entries, co to so2
dtypes: object(6)
memory usage: 0.0+ bytes
None


In [6]:
print(cleaned_air_df.describe().transpose())

        count        mean         std  min    25%    50%    75%     max
co    42385.0  580.943994  183.581222  0.0  447.0  575.0  711.0  1440.0
no2   43588.0   40.249635   22.897084  0.0   25.2   36.3   50.2   229.0
o3    43587.0   46.909794   33.883672  0.0   20.6   40.2   67.6   414.0
pm10  42810.0   24.013203   17.087322  0.0   12.6   20.6   31.0   251.0
pm25  43732.0   15.744732   11.102358  0.0    8.1   13.3   20.4   126.0
so2   43567.0    5.106608    3.101263  0.0    2.4    4.9    7.2    61.4


In [7]:
print(interpolated_air_df.describe().transpose())

        count        mean         std  min         25%         50%  \
co    60792.0  566.964048  179.735523  0.0  445.664553  533.000000   
no2   60792.0   39.192091   20.368916  0.0   27.962500   35.793847   
o3    60792.0   39.695462   34.111367  0.0   10.200000   31.700000   
pm10  60792.0   22.978044   15.234303  0.0   13.031555   20.300000   
pm25  60792.0   14.019057   10.389411  0.0    7.000000   11.000000   
so2   60792.0    5.719123    3.131043  0.0    3.200000    5.800000   

             75%     max  
co    691.000000  1440.0  
no2    46.200000   229.0  
o3     60.300000   414.0  
pm10   28.800000   251.0  
pm25   18.100000   126.0  
so2     7.986842    61.4  


In [8]:
print(dropped_air_df.describe().transpose())

     count unique  top freq
co       0      0  NaN  NaN
no2      0      0  NaN  NaN
o3       0      0  NaN  NaN
pm10     0      0  NaN  NaN
pm25     0      0  NaN  NaN
so2      0      0  NaN  NaN


### Line plots for each attribute in air data

In [9]:
def savePlotAttributes(df, dir, resample_mode=None):
    os.makedirs(dir, exist_ok=True)
    
    for attribute in df.columns:
        fig, ax = plt.subplots(figsize=(16, 6))
        
        if resample_mode:
            data = df[attribute].resample(resample_mode).mean()
        else:
            data = df[attribute]

        ax.plot(data)
        ax.set_title(f'{attribute} (Single Location)', fontsize=16)

        for year in range(2022, 2026):
            ax.axvline(datetime(year, 1, 1), linestyle='--', color='k', alpha=0.5)

        file_name = os.path.join(dir, f"{attribute}.png")
        plt.tight_layout()
        plt.savefig(file_name, format="png", dpi=300, bbox_inches="tight")
        plt.close(fig)
        print(f"Saved plot: {file_name}")

In [10]:
# savePlotAttributes(cleaned_air_df, 'plots/air/cleaned')

In [11]:
# savePlotAttributes(interpolated_air_df, 'plots/air/interpolated')

In [12]:
# savePlotAttributes(dropped_air_df, 'plots/air/dropped')

### Save ACF and PACF plots for each attribute in weather data and air data

In [13]:
def savePlotACFandPACF(df, dir, acf_lags=25, pacf_lags=25, resample_mode=None):
  
    os.makedirs(dir, exist_ok=True)
    
  
    for attribute in df.columns:

        fig, axes = plt.subplots(1, 2, figsize=(16, 6), dpi=80)
        fig.suptitle(f"ACF and PACF for {attribute}: ", fontsize=20, y=0.95)

        if resample_mode:

            data = df[attribute].resample(resample_mode).mean()
        else:
            data = df[attribute]


        data = data.dropna()

        if data.empty:
            print(f"Skipping {attribute}: No valid data available after dropping NaNs.")
            plt.close(fig)
            continue

        plot_acf(data, ax=axes[0], lags=acf_lags, title=f'{attribute} ACF')
        
        plot_pacf(data, ax=axes[1], lags=pacf_lags, title=f'{attribute} PACF')
            
        plt.tight_layout(rect=[0, 0, 1, 0.95], pad=2)
        
        file_name = os.path.join(dir, f"{attribute}_acf_and_pacf.png")
        plt.savefig(file_name, format="png", dpi=300, bbox_inches="tight")
        plt.close(fig)
        print(f"Saved ACF/PACF plot for {attribute}")


In [14]:
# savePlotACFandPACF(cleaned_air_df, 'plots/air/cleaned', 25, 25)

In [15]:
# savePlotACFandPACF(interpolated_air_df, 'plots/air/interpolated', 25, 25)

In [16]:
# savePlotACFandPACF(dropped_air_df, 'plots/air/dropped', 50, 50)

### Augmented Dickey-Fuller unit root test for stationarity

In [17]:
def perform_adf_test(df, resample_mode=None):
    non_stationary_list = []

    for attribute in df.columns:
        if resample_mode:
            data = df[attribute].resample(resample_mode).mean()
        else:
            data = df[attribute]
        
        data = data.dropna()
        
        if len(data) < 20: 
            print(f"{attribute:<10} | N/A        | Insufficient Data")
            continue

        try:
            result = adfuller(data)
            p_value = result[1]
            
            is_stationary = p_value < 0.05
            status = "Stationary" if is_stationary else "Non-Stationary"
            
            if not is_stationary:
                non_stationary_list.append(attribute)
                
        except Exception as e:
            print(f"{attribute:<10} | Error      | {e}")
    
    if non_stationary_list:
        print('\nNon-stationary attribute(s) found:')
        print(non_stationary_list)
    else:
        print('\nAll attributes are stationary.')


In [18]:
perform_adf_test(cleaned_air_df)


All attributes are stationary.


Since ADF test for time series with gaps might be inaccurate, will also test on interpolated data

In [19]:
perform_adf_test(interpolated_air_df)


All attributes are stationary.


ADF test on dropped data to see if more gaps will cause non-stationrity

In [20]:
perform_adf_test(dropped_air_df)

co         | N/A        | Insufficient Data
no2        | N/A        | Insufficient Data
o3         | N/A        | Insufficient Data
pm10       | N/A        | Insufficient Data
pm25       | N/A        | Insufficient Data
so2        | N/A        | Insufficient Data

All attributes are stationary.


### Correlation matrix between air attributes

In [21]:
def plotCorrelationMatrix(df, title):
    # Select only numeric columns
    numeric_df = df.select_dtypes(include=['float64', 'int64'])
    
    correlation_matrix = numeric_df.corr()
    
    fig = px.imshow(
        correlation_matrix,
        text_auto=False,           
        labels=dict(color="Correlation"),
        color_continuous_scale='PRGn', 
        zmin=-1, zmax=1,
        title=title
        
    )
    
    fig.show()

In [22]:
plotCorrelationMatrix(cleaned_air_df, "Cleaned Air Data Correlation")

In [23]:
plotCorrelationMatrix(interpolated_air_df, "Interpolated Air Data Correlation")

In [24]:
plotCorrelationMatrix(dropped_air_df, "Dropped Air Data Correlation")

### STL Decomposition + Residuals plot for outlier detection

In [25]:
def saveResidualsAttributes(df, dir, resample_mode=None):
    os.makedirs(dir, exist_ok=True)

    for attribute in df.columns:
        fig, ax = plt.subplots(figsize=(12, 6))
        fig.suptitle(f'{attribute} Residuals', fontsize=20, y=0.95)
        
        if resample_mode:
            data = df[attribute].resample(resample_mode).mean()
        else:
            data = df[attribute]

        data = data.dropna()
        
        # Check if we have enough data left for decomposition
        if len(data) < 24 * 2: # heuristic: need at least 2 periods
            print(f"Skipping {attribute}: Insufficient data for STL decomposition.")
            plt.close(fig)
            continue
            
        try:
            stl = STL(data, period=24) 
            res = stl.fit()
            
            ax.plot(res.resid.index, res.resid.values, color='black', linestyle='--', linewidth=0.5)
            ax.set_title(f'{attribute} Residuals', fontsize=16)
            
            for year in range(2022, 2026):
                ax.axvline(datetime(year, 1, 1), linestyle='--', color='k', alpha=0.5)
            
            plt.tight_layout(rect=[0, 0, 1, 0.95])
            
            file_name = os.path.join(dir, f"{attribute}_residuals.png")
            plt.savefig(file_name, format="png", dpi=300, bbox_inches="tight")
            plt.close(fig)
            print(f"Saved residuals plot for {attribute}")
            
        except Exception as e:
            print(f"Error decomposing {attribute}: {e}")
            plt.close(fig)

In [26]:
# saveResidualsAttributes(interpolated_air_df, "data/plots/residuals")

In [27]:
# saveResidualsAttributes(cleaned_air_df, "data/plots/residuals")

In [28]:
# saveResidualsAttributes(dropped_air_df, "data/plots/residuals")

Multivariate Granger Causality Test and Optimal Lag

In [29]:
from statsmodels.tsa.vector_ar.var_model import VAR
from kneed import KneeLocator

In [30]:
from statsmodels.tsa.api import VAR
from kneed import KneeLocator
import pandas as pd
import matplotlib.pyplot as plt

def test_granger_causality_for_target(df, air_attributes, target_attribute='pm25', max_lag=24):
    if target_attribute not in air_attributes:
        print(f"Error: Target '{target_attribute}' must be in the air_attributes list.")
        return
        
    missing_cols = [col for col in air_attributes if col not in df.columns]
    if missing_cols:
        print(f"Error: The following attributes are not in the DataFrame: {missing_cols}")
        return
    
    df_subset = df[air_attributes].dropna()
    
    if len(df_subset) < max_lag + 10:
        print("Error: Not enough data points.")
        return

    model = VAR(df_subset)
    lag_order_results = model.select_order(maxlags=max_lag)
    
    hqic_values = lag_order_results.ics['hqic']
    lags = range(len(hqic_values))
    
    knee_locator = KneeLocator(lags, hqic_values, curve="convex", direction="decreasing")
    optimal_lag = knee_locator.knee
    
    if optimal_lag is None or optimal_lag == 0:
        optimal_lag = 1
        print("Warning: No clear knee found. Defaulting to lag 1.")
    else:
        print(f"Optimal Lag found (HQIC Knee): {optimal_lag}")
    
    print("-" * 60)

    results = model.fit(optimal_lag)
    
    print(f"Testing Causality: Do other attributes Granger-cause {target_attribute}?")
    print(f"{'Causing Attribute':<20} | {'P-Value':<10} | {'Result'}")
    print("-" * 60)
    
    causality_found = False
    
    for causing in air_attributes:
        if causing == target_attribute:
            continue
            
        # The test_causality function syntax is: test_causality(caused, causing)
        test_result = results.test_causality(target_attribute, causing)
        p_value = test_result.pvalue
        
        is_significant = p_value < 0.05
        status = "Significant" if is_significant else "Not Significant"
        
        print(f"{causing:<20} -> {target_attribute:<10} | {p_value:.4f}     | {status}")
        
        if is_significant:
            causality_found = True
    
    if not causality_found:
        print(f"\nNo attributes were found to Granger-cause {target_attribute}.")

In [31]:
air_attributes = interpolated_air_df.columns.values
test_granger_causality_for_target(interpolated_air_df, air_attributes, target_attribute='pm25', max_lag=24)


No frequency information was provided, so inferred frequency h will be used.



Optimal Lag found (HQIC Knee): 1
------------------------------------------------------------
Testing Causality: Do other attributes Granger-cause pm25?
Causing Attribute    | P-Value    | Result
------------------------------------------------------------
co                   -> pm25       | 0.0000     | Significant
no2                  -> pm25       | 0.0000     | Significant
o3                   -> pm25       | 0.0000     | Significant
pm10                 -> pm25       | 0.0000     | Significant
so2                  -> pm25       | 0.0000     | Significant
