# In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import time
import os
import joblib

# 1. Data Preparation
def load_and_preprocess_data(file_path="df_data_prepared.csv", time_column="date", target_column="pm2_5", feature_cols=None):
    """
    Load the CSV, place it on a gap-free hourly grid and standard-scale the model columns.

    Parameters:
    - file_path: Path of the CSV file to read
    - time_column: Name of the timestamp column
    - target_column: Name of the target column; it is always the first scaled column
    - feature_cols: Optional list of extra column names scaled after the target

    Returns:
    - df_full: DataFrame reindexed to every hour between the first and last timestamp
    - data_scaled: Scaled array built from the rows that have no NaN in the selected columns
    - scaler: The fitted StandardScaler
    - numeric_cols: Names of the scaled columns, target first
    """
    df = pd.read_csv(file_path)
    print(f"Loaded {len(df)} rows from {file_path}")
    df[time_column] = pd.to_datetime(df[time_column])

    # Create full hourly range
    full_time_range = pd.date_range(start=df[time_column].min(), end=df[time_column].max(), freq="h")
    df_full = df.set_index(time_column).reindex(full_time_range).reset_index().rename(columns={"index": time_column})

    # Derive the calendar features AFTER reindexing: they depend only on the
    # timestamp, so rows inserted by the reindex get real values instead of NaN
    # and the integer dtype is preserved.
    df_full['hour'] = df_full[time_column].dt.hour
    df_full['season'] = (df_full[time_column].dt.month % 12 // 3).astype(int)

    if feature_cols is None:
        numeric_cols = [target_column]
    else:
        numeric_cols = [target_column] + list(feature_cols)

    # Silently ignore requested columns that the file does not contain.
    numeric_cols = [col for col in numeric_cols if col in df_full.columns]
    print(f"Using features: {numeric_cols}")

    # NOTE: dropping incomplete rows means data_scaled has fewer rows than
    # df_full, so row i of data_scaled is generally NOT row i of df_full.
    df_cleaned = df_full[numeric_cols].dropna()
    print(f"Removed {len(df_full) - len(df_cleaned)} rows with NaN in any numeric column for training")

    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(df_cleaned)

    return df_full, data_scaled, scaler, numeric_cols

# 2. Sequence Creation
def create_seq2seq_data(data_scaled, pre_context_length, gap_length, post_context_length, feature_cols):
    """
    Create sliding-window sequences for Seq2Seq model training.

    Every window is laid out as [pre-context | gap | post-context] over
    consecutive rows of data_scaled; one window is produced per valid start row.

    Parameters:
    - data_scaled: Scaled 2-D data array (rows are time steps, column 0 is the target)
    - pre_context_length: Length of pre-gap context
    - gap_length: Length of the gap to predict
    - post_context_length: Length of post-gap context
    - feature_cols: Indices of the columns copied into the context windows

    Returns:
    - X_left: Pre-gap sequences
    - X_right: Post-gap sequences
    - y: Gap sequences (column 0 only), each shaped (gap_length, 1)
    """
    window_length = pre_context_length + gap_length + post_context_length
    # A window starting at row i covers rows i .. i + window_length - 1, so the
    # last valid start is len(data_scaled) - window_length *inclusive*.
    n_windows = max(len(data_scaled) - window_length + 1, 0)

    X_left, X_right, y = [], [], []
    for i in range(n_windows):
        gap_start = i + pre_context_length
        gap_end = gap_start + gap_length
        X_left.append(data_scaled[i:gap_start, feature_cols])
        X_right.append(data_scaled[gap_end:gap_end + post_context_length, feature_cols])
        y.append(data_scaled[gap_start:gap_end, 0].reshape(-1, 1))  # Target (pm2_5 only)
    return np.array(X_left), np.array(X_right), np.array(y)

# 3. Time-Based Split
def time_based_split_3(X_left, X_right, y, train_size=0.8):
    """
    Cut the three aligned sequences at one chronological position.

    The first `train_size` share of the samples (rounded down, measured on y)
    becomes the training part; everything after it becomes the test part.

    Returns:
    - (X_left_train, X_left_test, X_right_train, X_right_test, y_train, y_test)
    """
    cut = int(len(y) * train_size)
    parts = []
    for sequence in (X_left, X_right, y):
        # Earlier samples first (train), later samples second (test).
        parts.append(sequence[:cut])
        parts.append(sequence[cut:])
    return tuple(parts)

# 4. Synthetic Gaps
def introduce_synthetic_gaps(df, target_col, missing_fraction, gap_length, random_state=None, max_index=None):
    """
    Blank out randomly placed runs of the target column in a copy of df.

    Parameters:
    - df: Source DataFrame (left untouched); rows are addressed by integer labels via .loc
    - target_col: Column whose values are replaced with NaN
    - missing_fraction: Share of the considered rows used to derive the number of gaps
    - gap_length: Number of consecutive rows in every gap
    - random_state: Optional seed; the same seed always yields the same gap starts
    - max_index: Optional upper bound on the number of leading rows considered

    Returns:
    - df_copy: Copy of df with the gaps set to NaN
    - gap_indices: Row labels covered by the gaps (with repeats if gaps overlap)
    - gap_starts: Array with the first row of every gap
    """
    df_copy = df.copy()
    n_samples = len(df_copy) if max_index is None else min(len(df_copy), max_index)
    n_gaps = int(n_samples * missing_fraction / gap_length)

    # Use a private generator when a seed is given so the process-wide NumPy
    # RNG is not reseeded as a side effect. RandomState(seed) draws the same
    # values that np.random.seed(seed) followed by np.random.choice would.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Only the start rows are unique; the gaps themselves may still overlap.
    gap_starts = rng.choice(n_samples - gap_length, n_gaps, replace=False)

    gap_indices = []
    for start in gap_starts:
        # .loc slicing is inclusive on both ends, hence the "- 1".
        df_copy.loc[start:start + gap_length - 1, target_col] = np.nan
        gap_indices.extend(range(start, start + gap_length))
    return df_copy, gap_indices, gap_starts

# 5. Model Evaluation
def evaluate_model(y_true, y_pred):
    """
    Evaluate model performance using multiple metrics.

    Parameters:
    - y_true: Observed values (array-like)
    - y_pred: Predicted values (array-like, same length as y_true)

    Returns:
    - MAE, RMSE, R², MAPE (MAPE in percent)
    """
    # Coerce to plain float arrays so lists work and pandas objects are
    # compared element by element instead of being aligned on their index.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    # The small constant keeps zero targets from dividing by zero. Taking the
    # magnitude first keeps the denominator strictly positive for negative
    # targets too; for non-negative targets the value is unchanged.
    mape = np.mean(np.abs(y_true - y_pred) / (np.abs(y_true) + 1e-10)) * 100
    return mae, rmse, r2, mape

# 6. Training the Model
def create_and_train_dynamic_seq2seq_xgb(X_left_train, y_train, X_right_train, pre_context_length, gap_lengths, feature_cols):
    """
    Train a dynamic Seq2Seq model using XGBRegressor.

    Samples of all gap lengths are stacked into one training set. Each sample
    is the flattened, zero-padded left and right context followed by three
    metadata values; each target row is the gap zero-padded to the longest gap.

    Parameters:
    - X_left_train, X_right_train, y_train: Training data per gap length (dicts keyed by gap length)
    - pre_context_length: Pre-gap context length
    - gap_lengths: List of gap lengths to train on
    - feature_cols: Feature indices applied to the columns of every context window

    Returns:
    - Trained MultiOutputRegressor model

    Raises:
    - ValueError: If a gap length has no sample with a NaN-free target
    """
    C_max = 32  # fixed number of context rows per side in the feature vector
    max_gap_length = max(gap_lengths)

    def prepare_data(X_left, X_right, y, gap_length):
        # Short gaps use a proportionally short context, long gaps a fixed one.
        C_dynamic = min(gap_length * 3 if gap_length <= 10 else 32, pre_context_length)
        # The third slot is fixed at 0 because the forecasting function always
        # sends 0 there; training it on the running sample index would feed the
        # model a feature that never takes those values at prediction time.
        metadata = np.array([gap_length, C_dynamic, 0])

        X_combined = []
        y_padded_list = []
        for left_window, right_window, target in zip(X_left, X_right, y):
            y_current = target.ravel()
            # Skip unusable targets before doing any feature work.
            if np.any(np.isnan(y_current)):
                continue
            left = left_window[-C_dynamic:, feature_cols]
            right = right_window[:C_dynamic, feature_cols]
            # Zeros go on the outer side so the rows closest to the gap always
            # sit at the same position of the feature vector.
            left_padded = np.pad(left, ((C_max - len(left), 0), (0, 0)), mode='constant', constant_values=0)
            right_padded = np.pad(right, ((0, C_max - len(right)), (0, 0)), mode='constant', constant_values=0)
            X_combined.append(np.concatenate([left_padded.flatten(), right_padded.flatten(), metadata]))
            y_padded_list.append(
                np.pad(y_current, (0, max_gap_length - len(y_current)), mode='constant', constant_values=0)
            )
        if not X_combined:
            raise ValueError("No valid data after filtering NaN values")
        return np.array(X_combined), np.array(y_padded_list)

    X_train_combined = []
    y_train_padded = []
    for gap_length in gap_lengths:
        X_temp, y_temp = prepare_data(X_left_train[gap_length], X_right_train[gap_length], y_train[gap_length], gap_length)
        X_train_combined.append(X_temp)
        y_train_padded.append(y_temp)

    X_train_combined = np.vstack(X_train_combined)
    y_train_padded = np.vstack(y_train_padded)

    model = MultiOutputRegressor(XGBRegressor(n_estimators=50, random_state=42))
    model.fit(X_train_combined, y_train_padded)
    return model

# 7. Forecasting
def direct_dynamic_seq2seq_forecast_xgb(model, initial_context, gap_length, pre_context_length, post_context_length, feature_cols):
    """
    Forecast a gap using the trained model.

    Parameters:
    - model: Trained model exposing predict()
    - initial_context: Context data around the gap (scaled); the first
      pre_context_length rows are read as the left context and the last
      post_context_length rows as the right context
    - gap_length: Length of the gap to predict
    - pre_context_length: Length of context before the gap
    - post_context_length: Length of context after the gap
    - feature_cols: List of feature indices to use (None to use every column)

    Returns:
    - Predicted gap values (scaled), shaped (gap_length, 1)

    Raises:
    - ValueError: If gap_length is not positive
    """
    if gap_length < 1:
        raise ValueError("gap_length must be at least 1")

    C_max = 32  # fixed number of context rows per side in the feature vector
    C_dynamic = min(gap_length * 3 if gap_length <= 10 else 32, pre_context_length)

    context = np.asarray(initial_context)
    if feature_cols is not None:
        context = context[:, feature_cols]

    # Keep only the rows nearest to the gap on either side.
    left_context = context[:pre_context_length][-C_dynamic:]
    right_context = context[-post_context_length:][:C_dynamic]

    # Pad by the number of rows actually present, so a context shorter than
    # expected still produces the fixed-width vector the model was fitted on.
    left_padded = np.pad(left_context, ((C_max - len(left_context), 0), (0, 0)), mode='constant', constant_values=0)
    right_padded = np.pad(right_context, ((0, C_max - len(right_context)), (0, 0)), mode='constant', constant_values=0)

    metadata = np.array([gap_length, C_dynamic, 0])
    input_flat = np.concatenate([left_padded.flatten(), right_padded.flatten(), metadata]).reshape(1, -1)

    # Make prediction; the outputs beyond gap_length are discarded.
    prediction = np.asarray(model.predict(input_flat))
    return prediction[0, :gap_length].reshape(-1, 1)

# 8. Testing Synthetic Gaps
def test_synthetic_gaps(model, df, data_scaled, gap_indices, gap_starts, gap_length, scaler, pre_context_length, post_context_length, feature_cols):
    """
    Predict every synthetic gap from its surrounding context and collect the results.

    Parameters:
    - model: Trained model passed on to direct_dynamic_seq2seq_forecast_xgb
    - df: Unused here; kept for interface compatibility
    - data_scaled: Scaled data array; column 0 supplies the true gap values
    - gap_indices: Unused here; kept for interface compatibility
    - gap_starts: First row (in data_scaled) of every gap
    - gap_length: Length of every gap
    - scaler: Fitted scaler used to map results back to original units
    - pre_context_length: Length of context before the gap
    - post_context_length: Length of context after the gap
    - feature_cols: Column indices of data_scaled used as context

    Returns:
    - (true values, predicted values) as flat arrays in original units

    Raises:
    - ValueError: If no gap could be evaluated
    """
    all_true = []
    all_pred = []
    max_index = data_scaled.shape[0]

    for start in gap_starts:
        end = start + gap_length
        # Skip gaps that are out of bounds for data_scaled
        if end > max_index:
            continue

        # Skip gaps without a usable truth before spending a prediction on them.
        true = data_scaled[start:end, 0].reshape(-1, 1)
        if np.any(np.isnan(true)):
            continue

        context_start = max(0, start - pre_context_length)
        context_end = min(max_index, end + post_context_length)
        context = data_scaled[context_start:context_end, feature_cols]

        # Pad on the side where data is actually missing. The forecaster reads
        # the first pre_context_length rows as left context; padding only at
        # the end would shift the true gap values into that slice for gaps
        # close to the start of the series.
        pad_before = pre_context_length - (start - context_start)
        pad_after = post_context_length - (context_end - end)
        if pad_before or pad_after:
            context = np.pad(context, ((pad_before, pad_after), (0, 0)), mode='constant', constant_values=0)

        pred = direct_dynamic_seq2seq_forecast_xgb(model, context, gap_length, pre_context_length, post_context_length, feature_cols)

        all_true.append(true)
        all_pred.append(pred)

    if not all_true or not all_pred:
        raise ValueError("No valid gaps to evaluate after filtering.")

    all_true = np.concatenate(all_true)
    all_pred = np.concatenate(all_pred)

    # The scaler works on whole rows, so the target is placed in column 0 of a
    # NaN-filled frame and only that column is read back after the inverse.
    full_true_array = np.full((len(all_true), len(feature_cols)), np.nan)
    full_pred_array = np.full((len(all_pred), len(feature_cols)), np.nan)
    full_true_array[:, 0] = all_true.ravel()
    full_pred_array[:, 0] = all_pred.ravel()

    all_true_unscaled = scaler.inverse_transform(full_true_array)
    all_pred_unscaled = scaler.inverse_transform(full_pred_array)

    return all_true_unscaled[:, 0].ravel(), all_pred_unscaled[:, 0].ravel()

# 9. Tester Class
class DynamicSeq2SeqXGBTester:
    """
    Train one gap-filling model and score it on synthetic gaps of several lengths.

    Metric summaries are stored in `results` and the pooled true/predicted
    values in `predictions`; both are dicts keyed by gap length.
    """

    def __init__(self, df, data_scaled, scaler, feature_cols, pre_context_length=32, post_context_length=32):
        """Initialize the tester with data and parameters."""
        self.df = df
        self.data_scaled = data_scaled
        self.scaler = scaler
        self.feature_cols = feature_cols
        self.pre_context_length = pre_context_length
        self.post_context_length = post_context_length
        self.results = {}
        self.predictions = {}

    def prepare_data(self, gap_lengths):
        """
        Prepare data for each gap length.

        Returns:
        - Dict mapping gap length to
          (X_left_train, X_right_train, y_train, X_left_test, X_right_test, y_test)
        """
        data_per_gap = {}
        for gap_length in gap_lengths:
            X_left, X_right, y = create_seq2seq_data(
                self.data_scaled, self.pre_context_length, gap_length, self.post_context_length, self.feature_cols
            )
            X_left_train, X_left_test, X_right_train, X_right_test, y_train, y_test = time_based_split_3(X_left, X_right, y)
            data_per_gap[gap_length] = (X_left_train, X_right_train, y_train, X_left_test, X_right_test, y_test)
        return data_per_gap

    def run_tests(self, gap_lengths=(5, 12, 24, 48, 72), n_runs=10, missing_fraction=0.05, model_save_path="dynamic_seq2seq_xgb_model.joblib"):
        """
        Train the model on all gap lengths, save it and evaluate it on synthetic gaps.

        Parameters:
        - gap_lengths: Gap lengths to train and test on
        - n_runs: Number of synthetic-gap runs per gap length (run number is the seed)
        - missing_fraction: Share of rows used to derive the number of gaps per run
        - model_save_path: File the trained model is written to

        Returns:
        - The trained model

        Raises:
        - ValueError: If n_runs is smaller than 1
        """
        if n_runs < 1:
            raise ValueError("n_runs must be at least 1")
        # The default is an immutable tuple; work on a private list copy.
        gap_lengths = list(gap_lengths)
        data_per_gap = self.prepare_data(gap_lengths)

        print("Training DynamicSeq2SeqXGB on all gap lengths...")
        start_time = time.time()
        X_left_train = {gl: data_per_gap[gl][0] for gl in gap_lengths}
        X_right_train = {gl: data_per_gap[gl][1] for gl in gap_lengths}
        y_train = {gl: data_per_gap[gl][2] for gl in gap_lengths}
        model = create_and_train_dynamic_seq2seq_xgb(X_left_train, y_train, X_right_train, self.pre_context_length, gap_lengths, self.feature_cols)
        training_time = time.time() - start_time
        print(f"Training completed in {training_time:.2f} seconds")

        joblib.dump(model, model_save_path)
        print(f"Model saved to {model_save_path}")

        # NOTE(review): the synthetic gaps below are drawn over the whole of
        # data_scaled, including the rows the model was trained on; the
        # held-out split produced by prepare_data is not used for scoring.
        metric_names = ("MAE", "RMSE", "R2", "MAPE")
        for gap_length in gap_lengths:
            print(f"Testing on gap length {gap_length}...")
            metrics_runs = []
            all_true_runs = []
            all_pred_runs = []
            run_times = []

            for run in range(n_runs):
                run_start_time = time.time()
                df_missing, gap_indices, gap_starts = introduce_synthetic_gaps(
                    self.df, "pm2_5", missing_fraction, gap_length, random_state=run, max_index=len(self.data_scaled)
                )
                all_true, all_pred = test_synthetic_gaps(
                    model, self.df, self.data_scaled, gap_indices, gap_starts, gap_length, self.scaler,
                    self.pre_context_length, self.post_context_length, self.feature_cols
                )
                metrics_runs.append(evaluate_model(all_true, all_pred))
                all_true_runs.append(all_true)
                all_pred_runs.append(all_pred)
                run_times.append(time.time() - run_start_time)

            # One row per run, one column per metric in metric_names order.
            metrics_array = np.array(metrics_runs)
            summary = {
                name: {"mean": np.mean(metrics_array[:, col]), "std": np.std(metrics_array[:, col])}
                for col, name in enumerate(metric_names)
            }
            summary["run_times"] = run_times
            summary["total_time"] = sum(run_times)
            self.results[gap_length] = summary
            self.predictions[gap_length] = {
                "all_true": np.concatenate(all_true_runs),
                "all_pred": np.concatenate(all_pred_runs)
            }
            print(f"Finished testing on gap length {gap_length} in {sum(run_times):.2f} seconds")

        return model

    def summarize_results(self, visualize=True, output_dir="plots_dynamic"):
        """
        Summarize and optionally visualize results.

        Parameters:
        - visualize: When True, save one true-vs-predicted scatter plot per gap length
        - output_dir: Directory for the plots (created if missing)
        """
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_dir, exist_ok=True)

        for gap_length in self.results:
            print(f"\nResults for gap length {gap_length}:")
            for metric in ["MAE", "RMSE", "R2", "MAPE"]:
                mean = self.results[gap_length][metric]["mean"]
                std = self.results[gap_length][metric]["std"]
                print(f"{metric}: {mean:.4f} ± {std:.4f}")
            print(f"Total time: {self.results[gap_length]['total_time']:.2f} seconds")

            if visualize:
                all_true = self.predictions[gap_length]["all_true"]
                all_pred = self.predictions[gap_length]["all_pred"]
                plt.figure(figsize=(10, 6))
                plt.scatter(all_true, all_pred, alpha=0.5)
                # Diagonal reference line: points on it are perfect predictions.
                min_val, max_val = min(all_true.min(), all_pred.min()), max(all_true.max(), all_pred.max())
                plt.plot([min_val, max_val], [min_val, max_val], 'r--')
                plt.xlabel("True Values (µg/m³)")
                plt.ylabel("Predicted Values (µg/m³)")
                plt.title(f"DynamicSeq2SeqXGB: True vs Predicted (Gap Length = {gap_length})")
                plt.savefig(os.path.join(output_dir, f"scatter_gap_{gap_length}.png"), dpi=600)
                plt.close()

# 10. Filling Real Gaps
def _predict_gap_segment(model, data_scaled, scaler, segment_start, segment_length, pre_context_length, post_context_length, feature_indices, n_features):
    """Predict one contiguous run of missing target rows and return it in original units."""
    segment_end = segment_start + segment_length
    context_start = max(0, segment_start - pre_context_length)
    context_end = min(len(data_scaled), segment_end + post_context_length)
    context = data_scaled[context_start:context_end]
    if feature_indices is not None:
        context = context[:, feature_indices]

    # Pad on the side where rows are actually missing so the forecaster's
    # "first pre rows / last post rows" slicing stays aligned with the gap.
    pad_before = pre_context_length - (segment_start - context_start)
    pad_after = post_context_length - (context_end - segment_end)
    if pad_before or pad_after:
        context = np.pad(context, ((pad_before, pad_after), (0, 0)), mode='constant', constant_values=0)

    pred_scaled = direct_dynamic_seq2seq_forecast_xgb(model, context, segment_length, pre_context_length, post_context_length, feature_indices)

    # The scaler works on whole rows: put the prediction in column 0 of a
    # NaN-filled frame and read only that column back after the inverse.
    full_pred_array = np.full((len(pred_scaled), n_features), np.nan)
    full_pred_array[:, 0] = pred_scaled.ravel()
    return scaler.inverse_transform(full_pred_array)[:, 0].ravel()


def fill_real_gaps(df, model_path="dynamic_seq2seq_xgb_model.joblib", scaler=None, pre_context_length=32, post_context_length=32, column="pm2_5", max_gap_length=72, feature_names=None, feature_indices=None):
    """
    Fill real gaps in the data using the trained model.

    Parameters:
    - df: DataFrame with missing values; must contain a datetime 'date' column
    - model_path: Path to the trained model
    - scaler: Fitted StandardScaler object (required)
    - pre_context_length: Length of context before the gap
    - post_context_length: Length of context after the gap
    - column: Target column to fill (e.g., 'pm2_5')
    - max_gap_length: Maximum gap length the model can handle; longer gaps are
      predicted in consecutive chunks of at most this length
    - feature_names: List of feature column names for df (None for univariate)
    - feature_indices: List of feature indices for data_scaled (None for univariate)

    Returns:
    - DataFrame with filled gaps (a copy of df that also carries 'hour' and 'season')

    Raises:
    - ValueError: If scaler is None
    """
    if scaler is None:
        raise ValueError("scaler must be a fitted scaler, got None")

    # joblib files are pickle based: only load model files from a trusted source.
    model = joblib.load(model_path)
    print(f"Model loaded from {model_path}")

    df_filled = df.copy()
    df_filled['hour'] = df_filled['date'].dt.hour
    df_filled['season'] = (df_filled['date'].dt.month % 12 // 3).astype(int)
    # Honour the `column` argument instead of a hard-coded target name.
    numeric_cols = [column] if feature_names is None else [column] + list(feature_names)
    # NOTE: scaled once, before any filling, so rows that are missing stay NaN
    # in every context window, including the neighbours of a long gap's chunks.
    data_scaled = scaler.transform(df_filled[numeric_cols])

    # Row positions of the missing targets, grouped into runs of consecutive rows.
    missing_positions = np.flatnonzero(df[column].isna().to_numpy())
    if missing_positions.size == 0:
        print("No missing values found in the data.")
        return df_filled
    run_breaks = np.flatnonzero(np.diff(missing_positions) != 1) + 1
    gap_runs = np.split(missing_positions, run_breaks)

    # Determine the number of features based on numeric_cols
    n_features = len(numeric_cols)

    for run in gap_runs:
        position_start = int(run[0])
        gap_length = len(run)
        print(f"Processing gap at index {df.index[position_start]} with length {gap_length}")

        # A gap longer than max_gap_length is covered by full-size chunks plus
        # one shorter remainder; a shorter gap is a single chunk.
        chunk_predictions = [
            _predict_gap_segment(
                model, data_scaled, scaler,
                position_start + offset, min(max_gap_length, gap_length - offset),
                pre_context_length, post_context_length, feature_indices, n_features
            )
            for offset in range(0, gap_length, max_gap_length)
        ]
        pred_unscaled_full = np.concatenate(chunk_predictions)

        gap_indices = df.index[position_start:position_start + gap_length]
        df_filled.loc[gap_indices, column] = pred_unscaled_full

    print(f"Filled {missing_positions.size} missing values in {len(gap_runs)} gaps.")
    return df_filled



# In [None]:
# 11. Main Execution
if __name__ == "__main__":
    # Extra input columns of the multivariate model (the target is added by the loader).
    multi_feature_cols = ['Ff', 'DD', 'air_temperature', 'air_humidity', 'hour', 'season']

    # ---- Train and evaluate: univariate (target column only) ----
    print("\n=== Univariate Model ===")
    df_full_uni, data_scaled_uni, scaler_uni, numeric_cols_uni = load_and_preprocess_data(
        "df_data_prepared.csv", feature_cols=None
    )
    # feature_cols are column indices into data_scaled, not column names.
    tester_uni = DynamicSeq2SeqXGBTester(df_full_uni, data_scaled_uni, scaler_uni, feature_cols=[0])
    model_uni = tester_uni.run_tests(
        gap_lengths=[5, 12, 24, 48, 72],
        n_runs=10,
        model_save_path="dynamic_uniseq2seq_xgb_model.joblib",
    )
    tester_uni.summarize_results(visualize=True, output_dir="plots_dynamic_uni")

    # ---- Train and evaluate: multivariate (target plus extra features) ----
    print("\n=== Multivariate Model ===")
    df_full_multi, data_scaled_multi, scaler_multi, numeric_cols_multi = load_and_preprocess_data(
        "df_data_prepared.csv", feature_cols=multi_feature_cols
    )
    # Every scaled column is used, so the indices are simply 0 .. n-1.
    feature_indices = list(range(len(numeric_cols_multi)))
    tester_multi = DynamicSeq2SeqXGBTester(
        df_full_multi, data_scaled_multi, scaler_multi, feature_cols=feature_indices
    )
    model_multi = tester_multi.run_tests(
        gap_lengths=[5, 12, 24, 48, 72],
        n_runs=10,
        model_save_path="dynamic_multiseq2seq_xgb_model.joblib",
    )
    tester_multi.summarize_results(visualize=True, output_dir="plots_dynamic_multi")

    # ---- Rebuild the raw hourly frame whose real gaps are to be filled ----
    df_with_nans = pd.read_csv("df_data_prepared.csv")
    df_with_nans["date"] = pd.to_datetime(df_with_nans["date"])
    full_time_range = pd.date_range(
        start=df_with_nans["date"].min(),
        end=df_with_nans["date"].max(),
        freq="h",
    )
    df_with_nans_full = df_with_nans.set_index("date").reindex(full_time_range)
    df_with_nans_full = df_with_nans_full.reset_index().rename(columns={"index": "date"})

    # ---- Fill with the univariate model (no extra names, no indices) ----
    print("\n=== Filling Real Gaps (Univariate) ===")
    df_filled_uni = fill_real_gaps(
        df_with_nans_full,
        model_path="dynamic_uniseq2seq_xgb_model.joblib",
        scaler=scaler_uni,
        feature_names=None,
        feature_indices=None,
    )
    df_filled_uni.to_csv("df_data_filled_uni.csv", index=False)
    print("Univariate filled data saved to 'df_data_filled_uni.csv'")

    # ---- Fill with the multivariate model (names for df, indices for data_scaled) ----
    print("\n=== Filling Real Gaps (Multivariate) ===")
    df_filled_multi = fill_real_gaps(
        df_with_nans_full,
        model_path="dynamic_multiseq2seq_xgb_model.joblib",
        scaler=scaler_multi,
        feature_names=multi_feature_cols,
        feature_indices=feature_indices,
    )
    df_filled_multi.to_csv("df_data_filled_multi.csv", index=False)
    print("Multivariate filled data saved to 'df_data_filled_multi.csv'")

# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os

# Grouped bar chart comparing the MAE of the univariate and multivariate
# models per gap length, annotated with the relative MAE reduction.

# MAE data from the results table
gap_lengths = [5, 12, 24, 48, 72]
uni_mae = [8.287, 7.978, 8.790, 9.676, 11.651]
multi_mae = [7.665, 7.253, 7.868, 8.025, 10.772]
uni_mae_std = [1.549, 2.015, 1.844, 4.557, 4.611]
multi_mae_std = [1.521, 1.800, 2.677, 3.708, 6.214]

# Calculate percentage reduction
reduction = [(uni - multi) / uni * 100 for uni, multi in zip(uni_mae, multi_mae)]

# Create the plot
fig, ax = plt.subplots(figsize=(12, 6))
bar_width = 0.2
index = np.arange(len(gap_lengths)) * 1.1  # Increased interval between groups

# Create bar groups (one bar left and one bar right of every group centre)
bars1 = ax.bar(index - 0.15, uni_mae, bar_width, label='Dynamic Univariate',
               color='steelblue', yerr=uni_mae_std, capsize=3, ecolor="gray")
bars2 = ax.bar(index + 0.15, multi_mae, bar_width, label='Dynamic Multivariate',
               color='forestgreen', yerr=multi_mae_std, capsize=3, ecolor="gray")

# Add MAE reduction percentages, centred on each group. The x positions come
# from `index` so they cannot drift from the bars if the spacing is changed.
for x_pos, uni, multi, pct in zip(index, uni_mae, multi_mae, reduction):
    ax.text(x_pos, max(uni, multi) + 1, f'▼ {pct:.1f}%',
            ha='center', va='bottom')

# Plot formatting
ax.set_xlabel('Gap Length (hours)', fontsize=12)
ax.set_ylabel('MAE (µg/m³)', fontsize=12)
ax.set_xticks(index)
ax.set_xticklabels(gap_lengths)
ax.legend(loc='upper left')

# Grid for better readability
ax.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("output_diagrams", exist_ok=True)
plt.savefig(os.path.join("output_diagrams", 'dynamic_models_comparison.png'), dpi=600)
plt.show()