## Correlation analysis between all parameters: PM2.5, temperature and humidity (measured and from open sources), pressure, wind, and visibility (from open sources)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
import os

# Load the prepared dataset; the "date" column is parsed into datetimes on read
df_interpolated = pd.read_csv(
    "df_data_prepared.csv",
    encoding="utf-8",
    parse_dates=["date"],
)
# Work on a copy so df_interpolated stays untouched for the cells that reuse it
df_corr = df_interpolated.copy()

# Only numeric columns can take part in the correlation analysis
numeric_columns = df_corr.select_dtypes(include=[np.number]).columns

# Listwise deletion: a row is dropped as soon as ANY numeric column is NaN, so
# every column pair below is evaluated on exactly the same rows
df_clean = df_corr.loc[:, numeric_columns].dropna()

# Pairwise correlation matrix (DataFrame.corr defaults to Pearson)
corr_matrix = df_clean.corr()

# Calculate p-values for statistical significance.
# The frame is created as float (not object) and pre-filled with 1.0, which serves
# as a "not significant" placeholder on the diagonal. Pearson's test is symmetric,
# so each unordered pair is tested once and the result is mirrored.
p_matrix = pd.DataFrame(1.0, index=numeric_columns, columns=numeric_columns, dtype=float)
for pos, col1 in enumerate(numeric_columns):
    for col2 in numeric_columns[pos + 1:]:
        corr, p_val = pearsonr(df_clean[col1], df_clean[col2])
        p_matrix.loc[col1, col2] = p_val
        p_matrix.loc[col2, col1] = p_val

# Short display labels for the heatmap axes; columns not listed keep their names
new_labels = dict(pm2_5="PM2.5", air_temperature="T'", air_humidity="U'")
corr_matrix_renamed = corr_matrix.rename(index=new_labels, columns=new_labels)

# Draw the heatmap without seaborn's own annotations; two-line custom
# annotations (coefficient on top, significance stars below) are added next
plt.figure(figsize=(12, 8))
ax = sns.heatmap(
    corr_matrix_renamed,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    annot=False,
)

# (upper p-value bound, marker), checked from the strictest level down
star_levels = ((0.001, "***"), (0.01, "**"), (0.05, "*"))

for i, row in enumerate(corr_matrix_renamed.index):
    for j, col in enumerate(corr_matrix_renamed.columns):
        corr_val = corr_matrix_renamed.loc[row, col]
        if pd.isna(corr_val):
            continue  # nothing to annotate in this cell
        # Coefficient, slightly above the vertical middle of the cell
        ax.text(j + 0.5, i + 0.35, f"{corr_val:.2f}", ha="center", va="center", fontsize=10)
        if row == col:
            continue  # the diagonal gets no significance marker
        # p_matrix is indexed by the original column names, hence the positional lookup
        p_val = p_matrix.loc[numeric_columns[i], numeric_columns[j]]
        stars = next((marker for bound, marker in star_levels if p_val < bound), "")
        # Stars, slightly below the vertical middle of the cell
        ax.text(j + 0.5, i + 0.65, stars, ha="center", va="center", fontsize=10)

# Keep both sets of tick labels horizontal
plt.xticks(rotation=0)
plt.yticks(rotation=0)

# Make sure the figure folder exists, then save and display the heatmap
output_dir = "output_diagrams"
os.makedirs(output_dir, exist_ok=True)
heatmap_path = os.path.join(output_dir, "correlation_matrix_params.png")
plt.savefig(heatmap_path, dpi=600, bbox_inches="tight")
plt.show()

# Display the stronger correlations (|corr| > 0.3); weaker entries show up as NaN.
# The marker in the heading had been corrupted by an encoding round-trip and is restored.
print("\n📌 Most significant correlations (|corr| > 0.3):")
strong_correlations = corr_matrix[corr_matrix.abs() > 0.3]
print(strong_correlations)

## Checking for non-linear dependencies of PM2.5 with weather parameters using logarithmic regression

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: data preparation
# Predictors used to explain PM2.5
features = ["air_temperature", "air_humidity", "P", "DD", "Ff", "VV"]

# Restrict a copy of the data to target + predictors and drop incomplete rows
df_regression = df_interpolated.copy()[["pm2_5"] + features].dropna()

# Log-transform the target; log1p = log(1 + x) stays finite when PM2.5 is 0
df_regression["log_pm2_5"] = np.log1p(df_regression["pm2_5"])

# Step 2: log-transform predictors, but only those that are strictly positive in
# every row; all other predictors enter the model untransformed
for col in features:
    column_values = df_regression[col]
    if not (column_values > 0).all():
        continue
    df_regression[f"log_{col}"] = np.log1p(column_values)

# Step 3: fit the regression, preferring the log version of a predictor when it exists
available_columns = set(df_regression.columns)
regressor_names = [
    f"log_{col}" if f"log_{col}" in available_columns else col
    for col in features
]
y = df_regression["log_pm2_5"]
X = sm.add_constant(df_regression[regressor_names])  # adds the intercept column

model = sm.OLS(y, X).fit()

# Step 4: print the fitted model summary
print(model.summary())

# Step 5: actual vs. predicted values (both on the log scale).
# NOTE: `os` and `output_dir` are not defined in this cell; they come from an
# earlier cell that must have been run first.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y, model.predict(X), alpha=0.5)
ax.set_xlabel("Actual log(PM2.5)")
ax.set_ylabel("Predicted log(PM2.5)")
ax.grid()
fig.savefig(os.path.join(output_dir, "Log_reg_predictions.png"), dpi=600, bbox_inches="tight")
plt.show()

# Step 6: histogram (with KDE curve) of the model residuals
residuals = y - model.predict(X)
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(residuals, bins=30, kde=True, ax=ax)
ax.set_xlabel("Residuals")
ax.grid()
fig.savefig(os.path.join(output_dir, "Log_reg_residuals.png"), dpi=600, bbox_inches="tight")
plt.show()


In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Step 1: data preparation
features = ["air_temperature", "air_humidity", "P", "DD", "Ff", "VV"]

# Restrict a copy of the data to target + predictors and drop incomplete rows
df_regression = df_interpolated.copy()[["pm2_5"] + features].dropna()

# Log-transform the target; log1p = log(1 + x) stays finite when PM2.5 is 0
df_regression["log_pm2_5"] = np.log1p(df_regression["pm2_5"])

# Step 2: log-transform the predictors.
# Wind speed is always transformed: 0.1 is added first so that a wind speed of
# zero does not produce log(0)
df_regression["log_Ff"] = np.log(df_regression["Ff"] + 0.1)

# Every other predictor is transformed only if it is strictly positive in all rows
for col in features:
    if col == "Ff":
        continue
    column_values = df_regression[col]
    if (column_values > 0).all():
        df_regression[f"log_{col}"] = np.log1p(column_values)

# Step 3: assemble the design matrix - log_Ff replaces Ff, and any predictor
# that has a log_ column uses it; the rest are used as they are
modified_features = [
    "log_Ff" if col == "Ff"
    else (f"log_{col}" if f"log_{col}" in df_regression.columns else col)
    for col in features
]

y = df_regression["log_pm2_5"]
X = sm.add_constant(df_regression[modified_features])  # adds the intercept column

model = sm.OLS(y, X).fit()

# Step 4: print the fitted model summary
print(model.summary())

# Figures belong in the same folder as every other figure in this notebook.
# The earlier version passed the literal string "output_dir" to os.makedirs and
# os.path.join, which created and wrote into a stray folder named "output_dir".
# NOTE: the file names match the previous regression cell, so these figures
# replace the ones saved there.
output_dir = "output_diagrams"
os.makedirs(output_dir, exist_ok=True)

# Predict once and reuse the values for both figures
predicted = model.predict(X)

# Step 5: actual vs. predicted values (both on the log scale)
plt.figure(figsize=(8, 5))
plt.scatter(y, predicted, alpha=0.5)
plt.xlabel("Actual log(PM2.5)")
plt.ylabel("Predicted log(PM2.5)")
plt.grid()
plt.savefig(os.path.join(output_dir, "Log_reg_predictions.png"), dpi=600, bbox_inches="tight")
plt.show()

# Step 6: histogram (with KDE curve) of the model residuals
residuals = y - predicted
plt.figure(figsize=(8, 5))
sns.histplot(residuals, bins=30, kde=True)
plt.xlabel("Residuals")
plt.grid()
plt.savefig(os.path.join(output_dir, "Log_reg_residuals.png"), dpi=600, bbox_inches="tight")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

import os  # this cell builds an output path but did not import os itself

# Load prepared data
df = pd.read_csv('df_data_prepared.csv', parse_dates=['date'])

# Defined here so the cell no longer depends on an earlier cell having set it
output_dir = 'output_diagrams'
os.makedirs(output_dir, exist_ok=True)

# One entry per panel: (predictor column, x-axis label, panel title).
# The degree sign in the temperature label and the unit in the PM2.5 label were
# corrupted by an encoding round-trip; they are restored here.
panels = [
    ('air_temperature', 'Air Temperature (°C)', '(a) PM2.5 vs Air Temperature'),
    ('air_humidity', 'Relative Humidity (%)', '(b) PM2.5 vs Relative Humidity'),
    ('Ff', 'Wind Speed (m/s)', '(c) PM2.5 vs Wind Speed'),
    ('VV', 'Visibility (km)', '(d) PM2.5 vs Visibility'),
]

# 2 x 2 grid; axs.flat walks it row by row, matching the (a)-(d) order above
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
# fig.suptitle('Relationships between PM2.5 and Meteorological Variables', fontsize=16)

y = df['pm2_5'].values
for ax, (column, x_label, title) in zip(axs.flat, panels):
    x = df[column].values
    # Keep only the rows where both the predictor and PM2.5 are present
    mask = ~np.isnan(x) & ~np.isnan(y)
    ax.scatter(x[mask], y[mask], alpha=0.5, color='blue', s=10)

    # Least-squares trend line drawn across the observed range of the predictor
    slope, intercept, r_value, p_value, std_err = stats.linregress(x[mask], y[mask])
    x_line = np.linspace(x[mask].min(), x[mask].max(), 100)
    ax.plot(x_line, slope * x_line + intercept, color='red', lw=2)

    ax.set_xlabel(x_label)
    ax.set_ylabel('PM2.5 (µg/m³)')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

# The rect leaves a strip at the top, sized for the (currently disabled) suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.savefig(os.path.join(output_dir, 'pm25_meteo_relationships.png'), dpi=600, bbox_inches='tight')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os

# Load data
df = pd.read_csv('df_data_prepared.csv', parse_dates=['date'])

# Load weather data for wind direction mapping
df_weather = pd.read_csv('weather_archive_utf8.csv', delimiter=";", encoding="utf-8")

# Create categorical variables.
# pd.cut builds right-closed intervals (a, b], and by default the first interval
# is open on the left. A value exactly on the lowest edge - most importantly a
# wind speed of 0 (calm) - was therefore turned into NaN and dropped from the
# box plots. include_lowest=True closes the first interval so it is kept.
# Values outside the overall bin range still become NaN.

# Temperature: bins from -25 to 40 with a step of 5
temp_bins = np.arange(-25, 41, 5)
temp_labels = [f"{low} to {high}" for low, high in zip(temp_bins[:-1], temp_bins[1:])]
df['temp_category'] = pd.cut(
    df['air_temperature'], bins=temp_bins, labels=temp_labels, include_lowest=True
)

# Humidity: bins from 20 to 100 with a step of 10
humidity_bins = np.arange(20, 101, 10)
humidity_labels = [f"{low} to {high}" for low, high in zip(humidity_bins[:-1], humidity_bins[1:])]
df['humidity_category'] = pd.cut(
    df['air_humidity'], bins=humidity_bins, labels=humidity_labels, include_lowest=True
)

# Wind speed: bins from 0 to 18 with a step of 2
wind_bins = np.arange(0, 19, 2)
wind_labels = [f"{low} to {high}" for low, high in zip(wind_bins[:-1], wind_bins[1:])]
df['wind_category'] = pd.cut(
    df['Ff'], bins=wind_bins, labels=wind_labels, include_lowest=True
)

# Text description used in the weather archive -> compass abbreviation
wind_compass_mapping = {
    'Wind from the North': 'N',
    'Wind from the North-Northeast': 'NNE',
    'Wind from the Northeast': 'NE',
    'Wind from the East-Northeast': 'ENE',
    'Wind from the East': 'E',
    'Wind from the East-Southeast': 'ESE',
    'Wind from the Southeast': 'SE',
    'Wind from the South-Southeast': 'SSE',
    'Wind from the South': 'S',
    'Wind from the South-Southwest': 'SSW',
    'Wind from the Southwest': 'SW',
    'Wind from the West-Southwest': 'WSW',
    'Wind from the West': 'W',
    'Wind from the West-Northwest': 'WNW',
    'Wind from the Northwest': 'NW',
    'Wind from the North-Northwest': 'NNW',
    'Variable direction': 'Variable',
    'Calm, no wind': 'Calm',
}

# Integer code -> compass abbreviation. The code of a direction is taken to be its
# position among the sorted unique archive descriptions.
# NOTE(review): this assumes df['DD'] was encoded with exactly that enumeration
# when the prepared file was built - confirm against the preparation step.
wind_reverse_mapping = {}
for idx, direction in enumerate(sorted(df_weather["DD"].unique())):
    # Descriptions missing from the table above are labelled 'Unknown'
    wind_reverse_mapping[idx] = wind_compass_mapping.get(direction, 'Unknown')

# Translate the numeric codes in the data into compass abbreviations
df['wind_direction_compass'] = df['DD'].map(wind_reverse_mapping)

# The 16 compass points in clockwise order, starting at north; the special
# states ('Variable', 'Calm', 'Unknown') are deliberately absent
direction_order = "N NNE NE ENE E ESE SE SSE S SSW SW WSW W WNW NW NNW".split()

# Keep only the rows whose direction is one of the 16 compass points
has_compass_direction = df['wind_direction_compass'].isin(direction_order)
df_filtered = df[has_compass_direction]

# Figure setup: 2 x 2 grid of box plots
fig, axs = plt.subplots(2, 2, figsize=(14, 12))
# fig.suptitle('Distribution of PM2.5 across Meteorological Parameters', fontsize=16)

# Shared y-axis limit: the 99th percentile of PM2.5 plus a 10% margin, so the
# most extreme values do not compress the boxes (points above it are cut off)
y_max = df['pm2_5'].quantile(0.99)
y_max_with_margin = y_max * 1.1

# One entry per panel:
# (axes, data, category column, box colour, x-axis label, title, category order).
# The degree sign and the PM2.5 unit had been corrupted by an encoding
# round-trip; they are restored here.
panels = [
    (axs[0, 0], df, 'temp_category', 'skyblue',
     'Air Temperature Range (°C)', '(a) PM2.5 Distribution by Air Temperature', None),
    (axs[0, 1], df, 'humidity_category', 'lightgreen',
     'Relative Humidity Range (%)', '(b) PM2.5 Distribution by Relative Humidity', None),
    (axs[1, 0], df, 'wind_category', 'lightsalmon',
     'Wind Speed Range (m/s)', '(c) PM2.5 Distribution by Wind Speed', None),
    # Wind direction uses the filtered data and the clockwise compass order
    (axs[1, 1], df_filtered, 'wind_direction_compass', 'lightblue',
     'Wind Direction', '(d) PM2.5 Distribution by Wind Direction', direction_order),
]

for ax, data, column, color, x_label, title, order in panels:
    sns.boxplot(x=column, y='pm2_5', data=data, order=order, ax=ax, color=color)
    ax.set_xlabel(x_label)
    ax.set_ylabel('PM2.5 (µg/m³)')
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.set_ylim(0, y_max_with_margin)

# The rect leaves a strip at the top, sized for the (currently disabled) suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Create output directory if it doesn't exist
output_dir = 'output_diagrams'
os.makedirs(output_dir, exist_ok=True)

plt.savefig(os.path.join(output_dir, 'pm25_distribution_by_meteo_extended.png'), dpi=600, bbox_inches='tight')
plt.show()

# Statistical follow-up: does mean PM2.5 differ between wind directions?
from scipy import stats

# One PM2.5 sample per wind direction; directions with no valid PM2.5 are skipped
wind_groups = []
for name, group in df_filtered.groupby('wind_direction_compass'):
    pm_values = group['pm2_5'].dropna()
    if len(pm_values) > 0:
        wind_groups.append(pm_values)

# One-way ANOVA needs at least two groups; with fewer, nothing is printed
if len(wind_groups) >= 2:
    f_statistic, p_value = stats.f_oneway(*wind_groups)
    print("\nWind Direction ANOVA Results:")
    print(f"F-statistic: {f_statistic}")
    print(f"p-value: {p_value}")

# Descriptive PM2.5 statistics for every wind direction
print("\nWind Direction PM2.5 Statistics:")
summary_functions = ['count', 'mean', 'std', 'min', 'max']
wind_stats = df_filtered.groupby('wind_direction_compass')['pm2_5'].agg(summary_functions)
print(wind_stats)

In [None]:
import pandas as pd
import numpy as np

# Weather archive to inspect
file_path = "weather_archive_utf8.csv"

# Only these columns are read from the archive
columns_to_import = ["time", "T", "P0", "P", "U", "DD", "Ff", "VV"]

df_weather = pd.read_csv(
    file_path,
    delimiter=";",
    usecols=columns_to_import,
    encoding="utf-8",
)

# Show the raw wind-direction descriptions as they appear in the file
print("Unique wind directions in the source file:")
print(df_weather["DD"].unique())

# Number the descriptions by their position in sorted order, in both directions:
# description -> code and code -> description
sorted_directions = sorted(df_weather["DD"].unique())
wind_mapping = dict(zip(sorted_directions, range(len(sorted_directions))))
reverse_wind_mapping = dict(enumerate(sorted_directions))

print("\nWind direction mapping:")
for idx in reverse_wind_mapping:
    print(f"{idx}: {reverse_wind_mapping[idx]}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
from scipy import stats

# Prepared measurements (dates parsed on read)
df = pd.read_csv('df_data_prepared.csv', parse_dates=['date'])

# Raw weather archive; only its DD column is used here, to rebuild the code mapping
df_weather = pd.read_csv('weather_archive_utf8.csv', delimiter=";", encoding="utf-8")

# Text description used in the weather archive -> compass abbreviation
wind_compass_mapping = {
    'Wind from the North': 'N',
    'Wind from the North-Northeast': 'NNE',
    'Wind from the Northeast': 'NE',
    'Wind from the East-Northeast': 'ENE',
    'Wind from the East': 'E',
    'Wind from the East-Southeast': 'ESE',
    'Wind from the Southeast': 'SE',
    'Wind from the South-Southeast': 'SSE',
    'Wind from the South': 'S',
    'Wind from the South-Southwest': 'SSW',
    'Wind from the Southwest': 'SW',
    'Wind from the West-Southwest': 'WSW',
    'Wind from the West': 'W',
    'Wind from the West-Northwest': 'WNW',
    'Wind from the Northwest': 'NW',
    'Wind from the North-Northwest': 'NNW',
    'Variable direction': 'Variable',
    'Calm, no wind': 'Calm',
}

# Integer code -> compass abbreviation, where the code of a direction is its
# position among the sorted unique archive descriptions.
# NOTE(review): assumes df['DD'] was encoded with that same enumeration - confirm.
wind_reverse_mapping = {}
for idx, direction in enumerate(sorted(df_weather["DD"].unique())):
    # Descriptions missing from the table above are labelled 'Unknown'
    wind_reverse_mapping[idx] = wind_compass_mapping.get(direction, 'Unknown')

# Translate the numeric codes in the data into compass abbreviations
df['wind_direction_compass'] = df['DD'].map(wind_reverse_mapping)

# The 16 compass points in clockwise order, starting at north; the special
# states ('Variable', 'Calm', 'Unknown') are deliberately absent
direction_order = "N NNE NE ENE E ESE SE SSE S SSW SW WSW W WNW NW NNW".split()

# Keep only the rows whose direction is one of the 16 compass points
has_compass_direction = df['wind_direction_compass'].isin(direction_order)
df_filtered = df[has_compass_direction]

# Save next to the other figures of this notebook instead of the working directory
output_dir = 'output_diagrams'
os.makedirs(output_dir, exist_ok=True)

# Per-direction summary: sample count, mean and standard deviation of PM2.5
wind_pm25_summary = df_filtered.groupby('wind_direction_compass')['pm2_5'].agg(['count', 'mean', 'std'])
# Half-width of a normal-approximation 95% interval: 1.96 * standard error of the mean
wind_pm25_summary['ci'] = 1.96 * (wind_pm25_summary['std'] / np.sqrt(wind_pm25_summary['count']))
# Clockwise compass order; directions absent from the data become NaN rows
wind_pm25_summary = wind_pm25_summary.reindex(direction_order)

# Two stacked panels on one figure. The PM2.5 unit in the axis labels had been
# corrupted by an encoding round-trip and is restored here.
fig, (ax_box, ax_bar) = plt.subplots(2, 1, figsize=(16, 10))

# Upper panel: distribution of PM2.5 per direction
sns.boxplot(x='wind_direction_compass', y='pm2_5', data=df_filtered,
            order=direction_order, ax=ax_box)
ax_box.set_title('PM2.5 Distribution by Wind Direction (Boxplot)')
ax_box.set_xlabel('Wind Direction')
ax_box.set_ylabel('PM2.5 (µg/m³)')
ax_box.tick_params(axis='x', rotation=45)
ax_box.grid(True, linestyle='--', alpha=0.7)

# Lower panel: mean PM2.5 per direction with the interval as error bars
ax_bar.bar(direction_order, wind_pm25_summary['mean'],
           yerr=wind_pm25_summary['ci'],
           capsize=5,
           color='skyblue',
           edgecolor='navy')
ax_bar.set_title('Average PM2.5 by Wind Direction with 95% Confidence Intervals')
ax_bar.set_xlabel('Wind Direction')
ax_bar.set_ylabel('Average PM2.5 (µg/m³)')
ax_bar.tick_params(axis='x', rotation=45)
ax_bar.grid(True, axis='y', linestyle='--', alpha=0.7)

fig.tight_layout()
fig.savefig(os.path.join(output_dir, 'wind_direction_pm25_analysis.png'), dpi=600)

# Print the per-direction summary table
print("PM2.5 Statistics by Wind Direction:")
print(wind_pm25_summary)

# One PM2.5 sample per wind direction; directions with no valid PM2.5 are skipped
wind_groups = []
for name, group in df_filtered.groupby('wind_direction_compass'):
    pm_values = group['pm2_5'].dropna()
    if len(pm_values) > 0:
        wind_groups.append(pm_values)

# One-way ANOVA compares group means and needs at least two groups
if len(wind_groups) < 2:
    print("\nNot enough groups for ANOVA analysis")
else:
    f_statistic, p_value = stats.f_oneway(*wind_groups)

    print("\nOne-way ANOVA results:")
    print(f"F-statistic: {f_statistic}")
    print(f"p-value: {p_value}")

# Save next to the other figures of this notebook instead of the working directory
output_dir = 'output_diagrams'
os.makedirs(output_dir, exist_ok=True)

# Visual check: mean PM2.5 (left axis) against sample count (right axis).
# DataFrame.plot opens its own figure unless it is given an Axes, so the earlier
# plt.figure(figsize=...) call only produced an empty figure and its size was
# ignored. The Axes is now created explicitly and passed in.
fig, ax = plt.subplots(figsize=(10, 6))

# Renaming the columns lets pandas build the combined legend with readable
# labels; mark_right=False suppresses the " (right)" suffix it would append
plot_data = wind_pm25_summary[['mean', 'count']].rename(
    columns={'mean': 'Mean PM2.5', 'count': 'Sample Count'}
)
plot_data.plot(kind='bar', secondary_y='Sample Count', mark_right=False, ax=ax)

ax.set_title('Mean PM2.5 and Sample Count by Wind Direction')
ax.set_xlabel('Wind Direction')
# The unit had been corrupted by an encoding round-trip and is restored here
ax.set_ylabel('Mean PM2.5 (µg/m³)')
ax.right_ax.set_ylabel('Sample Count')

fig.tight_layout()
fig.savefig(os.path.join(output_dir, 'wind_direction_sample_count.png'), dpi=600)

# Additional information on direction distribution (all rows, including
# 'Variable', 'Calm' and 'Unknown')
print("\nWind Direction Distribution:")
print(df['wind_direction_compass'].value_counts())

## Analysis of non-linear dependencies using a Decision Tree

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Work on a copy of the data loaded by an earlier cell
df_model_data = df_interpolated.copy()

# Predictors and target for the analysis
features = ["air_temperature", "air_humidity", "T", "P0", "P", "U", "DD", "Ff", "VV"]
target = "pm2_5"

# Drop rows with a missing value in the target or in any predictor
df_model_data = df_model_data.dropna(subset=[target] + features)

# 80% / 20% train/test split; random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df_model_data[features], df_model_data[target], test_size=0.2, random_state=42
)

# Grid search with 5-fold cross-validation on the training set, scored by R²
param_grid = {"max_depth": [3, 5, 10, 15], "min_samples_split": [2, 5, 10]}
grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid, cv=5, scoring="r2")
grid_search.fit(X_train, y_train)

# Best tree found by the search
best_tree = grid_search.best_estimator_

# Predict on the held-out test set
y_pred = best_tree.predict(X_test)

# Test-set quality metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# The bullet marker and the superscript in "R²" had been corrupted by an
# encoding round-trip; the printed text is restored here
print(f"🔹 MAE: {mae:.2f}")
print(f"🔹 RMSE: {rmse:.2f}")
print(f"🔹 R²: {r2:.3f}")

# Predicted vs. actual PM2.5 on the held-out test set
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
value_range = [y_test.min(), y_test.max()]
ax.plot(value_range, value_range, "--", color="red")  # y = x: the ideal prediction line
ax.set_xlabel("Actual PM2.5")
ax.set_ylabel("Predicted PM2.5")
ax.set_title("Actual vs Predicted values (Decision Tree)")
ax.grid()
plt.show()

# Feature importances of the selected tree, most important first
feature_importance = pd.Series(best_tree.feature_importances_, index=features)
feature_importance = feature_importance.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 5))
feature_importance.plot(kind="bar", color="royalblue", ax=ax)
ax.set_title("Feature Importance (Decision Tree)")
ax.set_xlabel("Features")
ax.set_ylabel("Importance")
ax.grid()
plt.show()


## Checking Gradient Boosting for finding correlations with PM2.5

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predictors and target (df_interpolated must already be loaded by an earlier cell)
features = ['T', 'P0', 'P', 'U', 'DD', 'Ff', 'VV', 'air_temperature', 'air_humidity']
target = 'pm2_5'

# Drop rows with a missing value in the target or in any predictor
filtered_pm25_data = df_interpolated.dropna(subset=[target] + features)

# Feature matrix and target vector
X = filtered_pm25_data[features]
y = filtered_pm25_data[target]

# 80% / 20% train/test split; random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Gradient Boosting model
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = model.predict(X_test)

# Test-set quality metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# The bullet marker and the superscript in "R²" had been corrupted by an
# encoding round-trip; the printed text is restored here
print(f'🔹 MAE: {mae:.2f}')
print(f'🔹 RMSE: {rmse:.2f}')
print(f'🔹 R²: {r2:.3f}')

# Feature importances, shown in the order of the feature columns
feature_importance = model.feature_importances_
feature_names = X.columns

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(feature_names, feature_importance, color='blue', alpha=0.7)
ax.set_xlabel("Features")
ax.set_ylabel("Importance")
ax.set_title("Feature Importance (Gradient Boosting)")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# Predicted vs. actual PM2.5 on the test set
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_test, y_pred, alpha=0.5)
# y = x reference line (ideal prediction), spanning the range of the full target y
value_range = [y.min(), y.max()]
ax.plot(value_range, value_range, 'r--')
ax.set_xlabel("Actual PM2.5")
ax.set_ylabel("Predicted PM2.5")
ax.set_title("Actual vs Predicted values (Gradient Boosting)")
plt.show()


## Checking Random Forest for finding correlations with PM2.5

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predictors and target (df_interpolated must already be loaded by an earlier cell)
features = ['T', 'P0', 'P', 'U', 'DD', 'Ff', 'VV', 'air_temperature', 'air_humidity']
target = 'pm2_5'

# Drop rows with a missing value in the target or in any predictor
filtered_pm25_data = df_interpolated.dropna(subset=[target] + features)

# Feature matrix and target vector
X = filtered_pm25_data[features]
y = filtered_pm25_data[target]

# 80% / 20% train/test split; random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model (n_jobs=-1 uses all available CPU cores)
model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = model.predict(X_test)

# Test-set quality metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# The bullet marker and the superscript in "R²" had been corrupted by an
# encoding round-trip; the printed text is restored here
print(f'🔹 MAE: {mae:.2f}')
print(f'🔹 RMSE: {rmse:.2f}')
print(f'🔹 R²: {r2:.3f}')

# Feature importances, shown in the order of the feature columns
feature_importance = model.feature_importances_
feature_names = X.columns

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(feature_names, feature_importance, color='blue', alpha=0.7)
ax.set_xlabel("Features")
ax.set_ylabel("Importance")
ax.set_title("Feature Importance (Random Forest)")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# Predicted vs. actual PM2.5 on the test set
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_test, y_pred, alpha=0.5)
# y = x reference line (ideal prediction), spanning the range of the full target y
value_range = [y.min(), y.max()]
ax.plot(value_range, value_range, 'r--')
ax.set_xlabel("Actual PM2.5")
ax.set_ylabel("Predicted PM2.5")
ax.set_title("Actual vs Predicted values (Random Forest)")
plt.show()


## More complex approximation of non-linear dependencies: a multilayer perceptron (MLP) neural network built with TensorFlow/Keras

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

# Predictors and target (df_interpolated must already be loaded by an earlier cell)
features = ['T', 'P0', 'P', 'U', 'DD', 'Ff', 'VV', 'air_temperature', 'air_humidity']
target = 'pm2_5'

# Drop rows with a missing value in the target or in any predictor
filtered_pm25_data = df_interpolated.dropna(subset=[target] + features)

# Feature matrix and target vector
X = filtered_pm25_data[features]
y = filtered_pm25_data[target]

# Split BEFORE scaling. Fitting the scaler on all rows (as was done previously)
# lets the test rows influence the mean/std used for training, i.e. information
# leaks from the test set into the model inputs.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so that any code still referring to X_scaled continues to work
X_scaled = scaler.transform(X)

# Fully connected network: three hidden ReLU layers and one linear output
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1)  # Output layer for PM2.5 prediction
])

# MSE is the training loss; MAE is tracked as an additional metric
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model.
# NOTE(review): the test set doubles as validation data, so the "validation"
# curves are test-set curves; harmless while nothing is tuned on them, but a
# separate validation split would be cleaner.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=32, verbose=1)

# Evaluate on the test set; the loss is the MSE, so its square root is the RMSE.
# The bullet marker had been corrupted by an encoding round-trip and is restored.
loss, mae = model.evaluate(X_test, y_test)
print(f'🔹 MAE: {mae:.2f}')
print(f'🔹 RMSE: {np.sqrt(loss):.2f}')

# Training history: MAE per epoch on the training and validation data
fig, ax = plt.subplots(figsize=(10, 5))
for history_key, curve_label in (('mae', 'MAE (Train)'), ('val_mae', 'MAE (Validation)')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_xlabel("Epochs")
ax.set_ylabel("MAE")
ax.set_title("Model Training Process")
ax.legend()
plt.show()

# Predicted vs. actual PM2.5 on the test set
y_pred = model.predict(X_test)

fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_test, y_pred, alpha=0.5)
# y = x reference line (ideal prediction), spanning the range of the full target y
value_range = [y.min(), y.max()]
ax.plot(value_range, value_range, 'r--')
ax.set_xlabel("Actual PM2.5")
ax.set_ylabel("Predicted PM2.5")
ax.set_title("Actual vs Predicted values (Neural Network)")
plt.show()


### Using GPU in TensorFlow

This code checks for the availability of physical GPU devices for use in TensorFlow.

In [None]:
import tensorflow as tf
# List the physical GPU devices TensorFlow can see and print the result
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)


## Correlation analysis between PM2.5 and temporal parameters

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# df_interpolated must already be loaded and contain "date" and "pm2_5" columns
df_corr_an_pm_time = df_interpolated.copy()

# Make sure 'date' is datetime; values that cannot be parsed become NaT
df_corr_an_pm_time['date'] = pd.to_datetime(df_corr_an_pm_time['date'], errors='coerce')

# Calendar features read straight from the timestamp
for calendar_field in ('month', 'day', 'hour', 'dayofweek'):
    df_corr_an_pm_time[calendar_field] = getattr(df_corr_an_pm_time['date'].dt, calendar_field)

# Simple seasonal feature: 1 – winter, 2 – spring, 3 – summer, 4 – autumn.
# month % 12 maps December to 0 so that Dec/Jan/Feb fall into the same group.
df_corr_an_pm_time['season'] = df_corr_an_pm_time['month'] % 12 // 3 + 1

# Pearson correlation between PM2.5 and the temporal features
cols = ['pm2_5', 'month', 'day', 'hour', 'dayofweek', 'season']
corr_matrix = df_corr_an_pm_time[cols].corr(method='pearson')

print("Pearson Correlation Matrix:")
print(corr_matrix)

# Annotated heatmap of the same matrix
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt=".2f")
plt.title("Pearson Correlation Matrix: PM2.5 and Temporal Features")
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats

# Larger fonts for every figure created from here on
font_settings = {
    'font.size': 14,        # base font size
    'axes.titlesize': 16,   # plot titles
    'axes.labelsize': 14,   # axis labels
    'xtick.labelsize': 14,  # x-axis tick labels
    'ytick.labelsize': 14,  # y-axis tick labels
}
plt.rcParams.update(font_settings)

# Folder for saved figures; created only when it is missing
output_dir = 'output_diagrams'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Load the data and convert the date column to datetimes
df = pd.read_csv("df_data_prepared.csv")
df['date'] = pd.to_datetime(df['date'])

# Temporal features derived from the timestamp
timestamp = df['date'].dt
df['hour'] = timestamp.hour
df['day_of_week'] = timestamp.dayofweek  # 0 = Monday, 6 = Sunday
df['month'] = timestamp.month
# 1-Winter, 2-Spring, 3-Summer, 4-Autumn (month % 12 puts December next to Jan/Feb)
df['season'] = (timestamp.month % 12 // 3 + 1)

# Columns entering the correlation analysis; incomplete rows are dropped
features = ['pm2_5', 'hour', 'day_of_week', 'month', 'season']
df_corr = df[features].dropna()

# Pearson and Spearman correlation matrices
pearson_corr, spearman_corr = (
    df_corr.corr(method=method) for method in ('pearson', 'spearman')
)

def get_pearson_p_value(x, y):
    """Return the p-value of the Pearson correlation test between x and y."""
    _, p_value = stats.pearsonr(x, y)
    return p_value

def get_spearman_p_value(x, y):
    """Return the p-value of the Spearman rank-correlation test between x and y."""
    _, p_value = stats.spearmanr(x, y)
    return p_value

# Calculate p-values for correlations.
# Both matrices start filled with 0.0, so the diagonal (a feature against
# itself) keeps the value 0.0; only off-diagonal cells are written below.
pearson_p_values = pd.DataFrame(np.zeros((len(features), len(features))),
                                index=features, columns=features)
spearman_p_values = pd.DataFrame(np.zeros((len(features), len(features))),
                                 index=features, columns=features)

# Pearson and Spearman tests are symmetric in their two arguments, so every
# unordered pair is tested once and the result is mirrored into (i, j) and
# (j, i) instead of running each test twice.
for i, feat1 in enumerate(features):
    data1 = df_corr[feat1].values  # extracted once per row, not once per pair
    for j, feat2 in enumerate(features[i + 1:], start=i + 1):
        data2 = df_corr[feat2].values
        pearson_p = get_pearson_p_value(data1, data2)
        spearman_p = get_spearman_p_value(data1, data2)
        pearson_p_values.iloc[i, j] = pearson_p
        pearson_p_values.iloc[j, i] = pearson_p
        spearman_p_values.iloc[i, j] = spearman_p
        spearman_p_values.iloc[j, i] = spearman_p

# Map a p-value to the conventional significance marker
def get_stars(p_val):
    """Return "***", "**", "*" or "" for a p-value.

    The marker is chosen by the first strict upper bound the value falls
    under (0.001, 0.01, 0.05). NaN and values of 0.05 or more give "".
    """
    if np.isnan(p_val):
        return ""
    # Thresholds are ordered from strictest to loosest; the first match wins
    for upper_bound, marker in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p_val < upper_bound:
            return marker
    return ""

# Build the cell labels: the coefficient (two decimals) on the first line and,
# for off-diagonal cells, the significance stars on a second line
pearson_annot = np.empty_like(pearson_corr.values, dtype=object)
spearman_annot = np.empty_like(spearman_corr.values, dtype=object)

for i, j in np.ndindex(len(features), len(features)):
    pearson_text = f"{pearson_corr.iloc[i, j]:.2f}"
    spearman_text = f"{spearman_corr.iloc[i, j]:.2f}"
    if i != j:
        # Only correlations between two different features get stars
        stars_p = get_stars(pearson_p_values.iloc[i, j])
        stars_s = get_stars(spearman_p_values.iloc[i, j])
        pearson_text = f"{pearson_text}\n{stars_p}"
        spearman_text = f"{spearman_text}\n{stars_s}"
    pearson_annot[i, j] = pearson_text
    spearman_annot[i, j] = spearman_text

# Combined display: start from the Pearson values and labels ...
combined_corr = pearson_corr.copy()
combined_annot = pearson_annot.copy()

# ... then overwrite every cell strictly above the diagonal with the Spearman
# value and label, leaving Pearson on the diagonal and in the lower triangle
mask_upper = np.triu_indices(len(features), k=1)
upper_rows, upper_cols = mask_upper
for i, j in zip(upper_rows, upper_cols):
    combined_annot[i, j] = spearman_annot[i, j]
    combined_corr.iloc[i, j] = spearman_corr.iloc[i, j]

# Create the figure
plt.figure(figsize=(10, 8))

# Create a mask - show the entire matrix
# (all entries are False, so no cell of the heatmap is hidden)
mask = np.zeros_like(combined_corr, dtype=bool)

# Build a heatmap with multi-line annotations.
# fmt='' because combined_annot already holds fully formatted strings;
# the colour scale is fixed to the full correlation range [-1, 1].
sns.heatmap(combined_corr, annot=combined_annot, fmt='', cmap='coolwarm',
            vmin=-1, vmax=1, mask=mask, cbar_kws={'label': 'Correlation coefficient'})

# Create more readable labels
# (ticks are placed at index + 0.5 - presumably the cell centres; verify against seaborn's heatmap layout)
feature_labels = ['PM2.5', 'Hour', 'Day of Week', 'Month', 'Season']
plt.xticks(np.arange(len(feature_labels))+0.5, feature_labels, rotation=45)
plt.yticks(np.arange(len(feature_labels))+0.5, feature_labels)

# plt.title('Correlation between PM2.5 and Temporal Features', fontsize=14)

# Add explanatory captions along the bottom edge of the figure:
# which triangle holds which coefficient, and the meaning of the stars
plt.figtext(0.05, 0.00, 'Pearson (lower triangle) / Spearman (upper triangle)', fontsize=12)
plt.figtext(0.65, 0.00, '* p<0.05, ** p<0.01, *** p<0.001', fontsize=12)

# Save the figure into output_dir at 600 dpi, then display it
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'temporal_correlation_heatmap.png'), dpi=600, bbox_inches='tight')
plt.show()

## Analysis of PM2.5 trends by temporal features with confidence intervals

- Function to calculate the 95% confidence interval for a data series.
- Grouping data by hour, month, and day of the week, calculating the mean, standard error, and confidence intervals.
- Visualization of PM2.5 trends by hour, month, and day of the week with confidence intervals displayed.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

# Directory that receives the saved figures.
# exist_ok=True makes the call a no-op when the directory is already there and
# avoids the check-then-create race of a separate os.path.exists() test.
output_dir = 'output_diagrams'
os.makedirs(output_dir, exist_ok=True)

# Read the prepared dataset from disk (the file is re-read every time this cell runs)
df = pd.read_csv("df_data_prepared.csv")
df['date'] = pd.to_datetime(df['date'])

# Derive the temporal features used for grouping
date_parts = df['date'].dt
df['hour'] = date_parts.hour
df['month'] = date_parts.month
df['dayofweek'] = date_parts.dayofweek  # 0 = Monday, 6 = Sunday

# Analysis frame: PM2.5 plus the temporal features, without rows that have a missing value
analysis_columns = ['pm2_5', 'hour', 'month', 'dayofweek']
df_corr_an_pm_time = df[analysis_columns].dropna()

# Confidence interval of the mean of a data series (95% by default)
def compute_ci(series, confidence=0.95):
    """Return ``(mean, lower, upper)`` for the mean of *series*.

    The half-width is the standard error of the mean (``stats.sem``)
    multiplied by the Student-t quantile at ``(1 + confidence) / 2`` with
    ``len(series) - 1`` degrees of freedom.
    """
    center = np.mean(series)
    degrees_of_freedom = len(series) - 1
    t_quantile = stats.t.ppf((1 + confidence) / 2, degrees_of_freedom)
    half_width = stats.sem(series) * t_quantile
    return center, center - half_width, center + half_width

def _group_mean_ci(frame, group_col, value_col='pm2_5', confidence=0.95):
    """Per-group mean of *value_col* with a Student-t confidence interval.

    Returns one row per distinct value of *group_col* with the columns
    ``group_col, mean, count, std, sem, ci_lower, ci_upper``. The interval is
    ``mean -/+ sem * t``, where ``t`` is the t quantile at
    ``(1 + confidence) / 2`` with ``count - 1`` degrees of freedom. The
    quantile is evaluated for the whole ``count`` column at once rather than
    row by row through ``DataFrame.apply``.
    """
    grouped = frame.groupby(group_col)[value_col].agg(['mean', 'count', 'std']).reset_index()
    grouped['sem'] = grouped['std'] / np.sqrt(grouped['count'])
    t_quantile = stats.t.ppf((1 + confidence) / 2, grouped['count'] - 1)
    half_width = grouped['sem'] * t_quantile
    grouped['ci_lower'] = grouped['mean'] - half_width
    grouped['ci_upper'] = grouped['mean'] + half_width
    return grouped


# Group by hour
hour_stats = _group_mean_ci(df_corr_an_pm_time, 'hour')

# Group by month
month_stats = _group_mean_ci(df_corr_an_pm_time, 'month')

# Group by day of week
day_stats = _group_mean_ci(df_corr_an_pm_time, 'dayofweek')

# Improve plot style
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11
})

# Y-axis label shared by the three panels. The unit was stored as mojibake
# ("¬µg/m¬≥"); it is written with escapes here so that "µg/m³" survives any
# further re-encoding of this file.
pm25_label = "Mean PM2.5 (\u00b5g/m\u00b3)"

# Visualization of trend by hour, month, and day of week
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Trend by hour (0-23); error bars span from ci_lower to ci_upper
axes[0].errorbar(hour_stats['hour'], hour_stats['mean'],
                yerr=[hour_stats['mean'] - hour_stats['ci_lower'], hour_stats['ci_upper'] - hour_stats['mean']],
                fmt='-o', capsize=5, color='#1f77b4')
axes[0].set_xlabel("Hour of Day")
axes[0].set_ylabel(pm25_label)
axes[0].set_title("PM2.5 Trend by Hour of Day")
axes[0].set_xticks(range(0, 24, 2))

# Trend by month (1-12)
axes[1].errorbar(month_stats['month'], month_stats['mean'],
                yerr=[month_stats['mean'] - month_stats['ci_lower'], month_stats['ci_upper'] - month_stats['mean']],
                fmt='-o', capsize=5, color='#1f77b4')
axes[1].set_xlabel("Month")
axes[1].set_ylabel(pm25_label)
axes[1].set_title("PM2.5 Trend by Month")
axes[1].set_xticks(range(1, 13))
axes[1].set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

# Trend by day of week (0 = Monday, 6 = Sunday)
axes[2].errorbar(day_stats['dayofweek'], day_stats['mean'],
                yerr=[day_stats['mean'] - day_stats['ci_lower'], day_stats['ci_upper'] - day_stats['mean']],
                fmt='-o', capsize=5, color='#1f77b4')
axes[2].set_xlabel("Day of Week")
axes[2].set_ylabel(pm25_label)
axes[2].set_title("PM2.5 Trend by Day of Week")
axes[2].set_xticks(range(0, 7))
axes[2].set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])

# Add a figure-level caption
# fig.text(0.5, 0.01, 'Figure 5. Temporal trends in PM2.5 concentrations with 95% confidence intervals', ha='center', fontsize=14)

# The bottom 3% of the figure is left free (room for the disabled caption above)
plt.tight_layout(rect=[0, 0.03, 1, 1])
plt.savefig(os.path.join(output_dir, 'temporal_trends_PM25.png'), dpi=600, bbox_inches='tight')
plt.show()

print(f"Figure saved to {os.path.join(output_dir, 'temporal_trends_PM25.png')}")



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os
from matplotlib.ticker import MultipleLocator

# Directory that receives the saved figures.
# exist_ok=True makes the call a no-op when the directory is already there and
# avoids the check-then-create race of a separate os.path.exists() test.
output_dir = 'output_diagrams'
os.makedirs(output_dir, exist_ok=True)

# Read the prepared dataset from disk (the file is re-read every time this cell runs)
df = pd.read_csv("df_data_prepared.csv")
df['date'] = pd.to_datetime(df['date'])

# Derive the temporal features used for grouping
date_parts = df['date'].dt
df['hour'] = date_parts.hour
df['month'] = date_parts.month
df['dayofweek'] = date_parts.dayofweek  # 0 = Monday, 6 = Sunday

# Analysis frame: PM2.5 plus the temporal features, without rows that have a missing value
analysis_columns = ['pm2_5', 'hour', 'month', 'dayofweek']
df_corr_an_pm_time = df[analysis_columns].dropna()

# Confidence interval of the mean of a data series (95% by default)
def compute_ci(series, confidence=0.95):
    """Return ``(mean, lower, upper)`` for the mean of *series*.

    The half-width is the standard error of the mean (``stats.sem``)
    multiplied by the Student-t quantile at ``(1 + confidence) / 2`` with
    ``len(series) - 1`` degrees of freedom.
    """
    center = np.mean(series)
    degrees_of_freedom = len(series) - 1
    t_quantile = stats.t.ppf((1 + confidence) / 2, degrees_of_freedom)
    half_width = stats.sem(series) * t_quantile
    return center, center - half_width, center + half_width

def _group_mean_ci(frame, group_col, value_col='pm2_5', confidence=0.95):
    """Per-group mean of *value_col* with a Student-t confidence interval.

    Returns one row per distinct value of *group_col* with the columns
    ``group_col, mean, count, std, sem, ci_lower, ci_upper``. The interval is
    ``mean -/+ sem * t``, where ``t`` is the t quantile at
    ``(1 + confidence) / 2`` with ``count - 1`` degrees of freedom. The
    quantile is evaluated for the whole ``count`` column at once rather than
    row by row through ``DataFrame.apply``.
    """
    grouped = frame.groupby(group_col)[value_col].agg(['mean', 'count', 'std']).reset_index()
    grouped['sem'] = grouped['std'] / np.sqrt(grouped['count'])
    t_quantile = stats.t.ppf((1 + confidence) / 2, grouped['count'] - 1)
    half_width = grouped['sem'] * t_quantile
    grouped['ci_lower'] = grouped['mean'] - half_width
    grouped['ci_upper'] = grouped['mean'] + half_width
    return grouped


# Group by hour
hour_stats = _group_mean_ci(df_corr_an_pm_time, 'hour')

# Group by month
month_stats = _group_mean_ci(df_corr_an_pm_time, 'month')

# Group by day of week
day_stats = _group_mean_ci(df_corr_an_pm_time, 'dayofweek')

# Plot style and font sizes for this figure
plt.style.use('seaborn-v0_8-darkgrid')
font_settings = {
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
}
plt.rcParams.update(font_settings)

# One row of three panels: hour of day, month, day of week
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

# Colours: markers and lines / shaded confidence band
main_color, fill_color = '#2077b4', '#75b3e0'

# 1. Trend by hour of the day: markers with error bars, a connecting line,
#    and a shaded band between ci_lower and ci_upper
axes[0].errorbar(hour_stats['hour'], hour_stats['mean'],
                yerr=[hour_stats['mean'] - hour_stats['ci_lower'], hour_stats['ci_upper'] - hour_stats['mean']],
                fmt='o', capsize=5, color=main_color, markersize=8, ecolor='gray', elinewidth=1)
axes[0].plot(hour_stats['hour'], hour_stats['mean'], color=main_color, alpha=0.8)
axes[0].fill_between(hour_stats['hour'], hour_stats['ci_lower'], hour_stats['ci_upper'], color=fill_color, alpha=0.3)
axes[0].set_xlabel("Hour of Day")
# The unit was stored as mojibake ("¬µg/m¬≥"); escapes keep "µg/m³" intact
axes[0].set_ylabel("Mean PM2.5 (\u00b5g/m\u00b3)")
axes[0].set_title("(a) PM2.5 Trend by Hour of Day")
axes[0].set_xticks(range(0, 24, 3))  # show every 3 hours
axes[0].xaxis.set_minor_locator(MultipleLocator(1))  # add minor ticks
axes[0].grid(which='minor', alpha=0.2)
axes[0].grid(which='major', alpha=0.5)

# 2. Trend by month, accounting for months without data.
# The runs of consecutive months and the list of empty months are derived from
# month_stats instead of being hard-coded, so the panel stays correct when the
# dataset covers different months.
available_months = sorted(int(m) for m in month_stats['month'])
month_groups = []
for m in available_months:
    if month_groups and m == month_groups[-1][-1] + 1:
        month_groups[-1].append(m)   # extends the current run of consecutive months
    else:
        month_groups.append([m])     # starts a new run
missing_months = [m for m in range(1, 13) if m not in available_months]

# Draw points for all months
axes[1].errorbar(month_stats['month'], month_stats['mean'],
                yerr=[month_stats['mean'] - month_stats['ci_lower'], month_stats['ci_upper'] - month_stats['mean']],
                fmt='o', capsize=5, color=main_color, markersize=8, ecolor='gray', elinewidth=1, zorder=5)

# Connect with lines and fill areas only within runs of consecutive months,
# so no line is drawn across a gap in the data
for group in month_groups:
    group_months = month_stats[month_stats['month'].isin(group)]
    if len(group) > 1:  # a single isolated month is shown as a point only
        axes[1].plot(group_months['month'], group_months['mean'], color=main_color, alpha=0.8, zorder=4)
        axes[1].fill_between(group_months['month'], group_months['ci_lower'], group_months['ci_upper'],
                            color=fill_color, alpha=0.3, zorder=3)

axes[1].set_xlabel("Month")
# The unit was stored as mojibake ("¬µg/m¬≥"); escapes keep "µg/m³" intact
axes[1].set_ylabel("Mean PM2.5 (\u00b5g/m\u00b3)")
axes[1].set_title("(b) PM2.5 Trend by Month")
axes[1].set_xticks(range(1, 13))
axes[1].set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

# Mark every month without data with "n.d." just above the lower y-limit
for month in missing_months:
    axes[1].annotate('n.d.', xy=(month, axes[1].get_ylim()[0] + 5),
                   xytext=(0, 5), textcoords='offset points',
                   ha='center', fontsize=12, color='gray')

# 3. Trend by day of week: markers with error bars, a connecting line,
#    and a shaded band between ci_lower and ci_upper
axes[2].errorbar(day_stats['dayofweek'], day_stats['mean'],
                yerr=[day_stats['mean'] - day_stats['ci_lower'], day_stats['ci_upper'] - day_stats['mean']],
                fmt='o', capsize=5, color=main_color, markersize=8, ecolor='gray', elinewidth=1)
axes[2].plot(day_stats['dayofweek'], day_stats['mean'], color=main_color, alpha=0.8)
axes[2].fill_between(day_stats['dayofweek'], day_stats['ci_lower'], day_stats['ci_upper'], color=fill_color, alpha=0.3)
axes[2].set_xlabel("Day of Week")
# The unit was stored as mojibake ("¬µg/m¬≥"); escapes keep "µg/m³" intact
axes[2].set_ylabel("Mean PM2.5 (\u00b5g/m\u00b3)")
axes[2].set_title("(c) PM2.5 Trend by Day of Week")
axes[2].set_xticks(range(0, 7))
axes[2].set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])

# Add a general title/caption (currently disabled)
# fig.suptitle('Temporal Patterns in PM2.5 Concentrations', fontsize=16, y=1.02)
# fig.text(0.5, 0.01, 'Figure 5. Mean PM2.5 concentrations with 95% confidence intervals by hour of day, month, and day of week.', 
        #  ha='center', fontsize=12)

# Fit the panels into the rectangle [left, bottom, right, top] = [0, 0.03, 1, 0.98]
plt.tight_layout(rect=[0, 0.03, 1, 0.98])
# Write the figure to output_dir/temporal_trends_PM25.png at 600 dpi, then display it
plt.savefig(os.path.join(output_dir, 'temporal_trends_PM25.png'), dpi=600, bbox_inches='tight')
plt.show()

# Report the path the figure was written to
print(f"Figure saved to {os.path.join(output_dir, 'temporal_trends_PM25.png')}")

## Correlation analysis between PM2.5 and temporal features

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# df_interpolated is expected to exist already and to hold "date" and "pm2_5" columns.
# Work on a separate copy so the source frame is left untouched.
df_corr_an_pm_time = df_interpolated.copy()

# Make sure 'date' is datetime; values that cannot be parsed become NaT
df_corr_an_pm_time['date'] = pd.to_datetime(df_corr_an_pm_time['date'], errors='coerce')

# Derive the temporal features from the timestamp
date_parts = df_corr_an_pm_time['date'].dt
for part in ('month', 'day', 'hour', 'dayofweek'):
    df_corr_an_pm_time[part] = getattr(date_parts, part)
# Season code: 1 - winter (Dec-Feb), 2 - spring, 3 - summer, 4 - autumn
df_corr_an_pm_time['season'] = df_corr_an_pm_time['month'] % 12 // 3 + 1

# Columns entering the analysis
cols = ['pm2_5', 'month', 'day', 'hour', 'dayofweek', 'season']

# Spearman (rank) correlation matrix, printed as text
spearman_corr = df_corr_an_pm_time[cols].corr(method='spearman')
print("Spearman Correlation Matrix:")
print(spearman_corr)

# The same matrix as an annotated heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(spearman_corr, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Spearman Correlation Matrix: PM2.5 and Temporal Features")
plt.show()


## Logarithmic regression for predicting PM2.5 based on temporal features and analysis of model residuals

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: data preparation from df_corr_an_pm_time
# (the frame is expected to exist already and to hold "date" and "pm2_5" columns)
df_corr_an_pm_time['date'] = pd.to_datetime(df_corr_an_pm_time['date'], errors='coerce')

# Derive the temporal features from the timestamp
date_parts = df_corr_an_pm_time['date'].dt
for part in ('year', 'month', 'day', 'hour', 'dayofweek'):
    df_corr_an_pm_time[part] = getattr(date_parts, part)
# Season code: 1 - winter (Dec-Feb), 2 - spring, 3 - summer, 4 - autumn
df_corr_an_pm_time['season'] = df_corr_an_pm_time['month'] % 12 // 3 + 1

# Keep only PM2.5 and the temporal features; drop rows with missing values
features = ['year', 'month', 'day', 'hour', 'dayofweek', 'season']
df_regression = df_corr_an_pm_time[['pm2_5'] + features].dropna()

# Steps 2-3: log(1 + x) transform of the target and of every temporal feature
# (log1p is only defined for values > -1; all features here are non-negative)
# NOTE(review): if the data cover a single year, log_year is constant - confirm
# how sm.add_constant / OLS treat that column.
log_columns = {f"log_{col}": np.log1p(df_regression[col]) for col in features}
df_regression = df_regression.assign(
    log_pm2_5=np.log1p(df_regression["pm2_5"]),  # log(1 + PM2.5)
    **log_columns,
)

# Step 4: ordinary least squares on the log-transformed features
y = df_regression["log_pm2_5"]
X = sm.add_constant(df_regression[list(log_columns)])  # constant column for the intercept
model = sm.OLS(y, X).fit()

# Step 5: print the regression summary
print(model.summary())

# In-sample predictions, reused by both plots below
predicted = model.predict(X)

# Step 6: actual vs. predicted values (both on the log scale)
plt.figure(figsize=(8, 5))
plt.scatter(y, predicted, alpha=0.5)
plt.xlabel("Actual log(PM2.5)")
plt.ylabel("Predicted log(PM2.5)")
plt.title("Actual vs Predicted values (temporal features)")
plt.grid()
plt.show()

# Step 7: histogram of the residuals (actual minus predicted) with a KDE curve
residuals = y - predicted
plt.figure(figsize=(8, 5))
sns.histplot(residuals, bins=30, kde=True)
plt.xlabel("Model Residuals")
plt.title("Distribution of residuals (temporal features)")
plt.grid()
plt.show()



## Analysis of temporal data and PM2.5 prediction using a tuned Decision Tree Regressor model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import seaborn as sns

# Step 1: work on a separate copy of df_interpolated
df_model_data = df_interpolated.copy()

# Make sure 'date' is datetime; values that cannot be parsed become NaT
df_model_data['date'] = pd.to_datetime(df_model_data['date'], errors='coerce')

# Step 2: derive the temporal features from the timestamp
date_parts = df_model_data['date'].dt
for part in ('year', 'month', 'day', 'hour', 'dayofweek'):
    df_model_data[part] = getattr(date_parts, part)
# Season code: 1 - winter (Dec-Feb), 2 - spring, 3 - summer, 4 - autumn
df_model_data['season'] = df_model_data['month'] % 12 // 3 + 1

# Step 3: target and predictors
target = 'pm2_5'
features = ['year', 'month', 'day', 'hour', 'dayofweek', 'season']

# Drop rows with a missing value in the target or in any predictor
df_model_data = df_model_data.dropna(subset=[target] + features)

# Step 4: 80% / 20% train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df_model_data[features],
    df_model_data[target],
    test_size=0.2,
    random_state=42,
)

# Step 5: search max_depth and min_samples_split with 5-fold cross-validation,
# scored by R^2 on the training set
param_grid = {"max_depth": [3, 5, 10, 15], "min_samples_split": [2, 5, 10]}
grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42),
                           param_grid,
                           cv=5,
                           scoring="r2")
grid_search.fit(X_train, y_train)

# Step 6: best decision tree found by the search
best_tree = grid_search.best_estimator_

# Step 7: predict on the held-out test set
y_pred = best_tree.predict(X_test)

# Step 8: evaluate the model on the test set
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# The bullet and the superscript two were stored as mojibake ("üîπ", "R¬≤");
# escapes restore U+1F539 and U+00B2 and keep them safe from re-encoding.
print(f"\U0001F539 MAE: {mae:.2f}")
print(f"\U0001F539 RMSE: {rmse:.2f}")
print(f"\U0001F539 R\u00b2: {r2:.3f}")

# Step 9: predicted vs. actual values on the test set
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
# Dashed red diagonal = ideal prediction (predicted equals actual)
value_range = [y_test.min(), y_test.max()]
plt.plot(value_range, value_range, "--", color="red")
plt.xlabel("Actual PM2.5")
plt.ylabel("Predicted PM2.5")
plt.title("Actual vs Predicted values (Decision Tree on temporal features)")
plt.grid()
plt.show()

# Step 10: feature importances of the tuned tree, largest first
feature_importance = pd.Series(best_tree.feature_importances_, index=features)
feature_importance = feature_importance.sort_values(ascending=False)
plt.figure(figsize=(10, 5))
feature_importance.plot(kind="bar", color="royalblue")
plt.title("Feature Importance (Decision Tree on temporal features)")
plt.xlabel("Features")
plt.ylabel("Importance")
plt.grid()
plt.show()


## Analysis of temporal features and PM2.5 level prediction using Gradient Boosting Regressor

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import seaborn as sns

# Step 1: work on a separate copy of df_interpolated
df_model_time = df_interpolated.copy()

# Step 2: make sure 'date' is datetime; values that cannot be parsed become NaT
df_model_time['date'] = pd.to_datetime(df_model_time['date'], errors='coerce')

# Step 3: derive the temporal features from the timestamp
date_parts = df_model_time['date'].dt
for part in ('year', 'month', 'day', 'hour', 'dayofweek'):
    df_model_time[part] = getattr(date_parts, part)
# Season code: 1 - winter (Dec-Feb), 2 - spring, 3 - summer, 4 - autumn
df_model_time['season'] = df_model_time['month'] % 12 // 3 + 1

# Step 4: target and predictors
target = 'pm2_5'
features = ['year', 'month', 'day', 'hour', 'dayofweek', 'season']

# Step 5: drop rows with a missing value in the target or in any predictor
df_model_time = df_model_time.dropna(subset=[target] + features)

# Step 6: 80% / 20% train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df_model_time[features],
    df_model_time[target],
    test_size=0.2,
    random_state=42,
)

# Step 7: train the Gradient Boosting model with fixed hyperparameters
# (no hyperparameter tuning, for simplicity)
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)
model.fit(X_train, y_train)

# Step 8: predict on the held-out test set
y_pred = model.predict(X_test)

# Step 9: evaluate the model on the test set
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# The bullet and the superscript two were stored as mojibake ("üîπ", "R¬≤");
# escapes restore U+1F539 and U+00B2 and keep them safe from re-encoding.
print(f'\U0001F539 MAE: {mae:.2f}')
print(f'\U0001F539 RMSE: {rmse:.2f}')
print(f'\U0001F539 R\u00b2: {r2:.3f}')

# Step 10: predicted vs. actual values on the test set
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
# Dashed red diagonal = ideal prediction (predicted equals actual)
value_range = [y_test.min(), y_test.max()]
plt.plot(value_range, value_range, 'r--')
plt.xlabel("Actual PM2.5")
plt.ylabel("Predicted PM2.5")
plt.title("Actual vs Predicted values (temporal features)")
plt.grid()
plt.show()

# Step 11: feature importances of the fitted model, largest first
feature_importance = pd.Series(model.feature_importances_, index=features)
feature_importance = feature_importance.sort_values(ascending=False)
plt.figure(figsize=(10, 5))
feature_importance.plot(kind="bar", color="royalblue")
plt.title("Importance of temporal features (Gradient Boosting)")
plt.xlabel("Features")
plt.ylabel("Importance")
plt.grid()
plt.show()
