In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import kagglehub

# Download the latest version of the dataset through kagglehub.
# `path` receives the value returned by dataset_download, which is printed below.
path = kagglehub.dataset_download("greninja2006/boujdour")

# Show where the dataset files are located.
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/boujdour


In [3]:
import pandas as pd
# Read the raw CSV; fields are separated by semicolons (sep=";").
df = pd.read_csv('/kaggle/input/boujdour/Boujdour 10T.csv', sep=";")
# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,DateTime,zone1,zone2,zone3
0,14/09/2022 17:10,5981,1488,6077
1,14/09/2022 17:20,5968,1508,6052
2,14/09/2022 17:30,6045,1525,6063
3,14/09/2022 17:40,5972,1515,5929
4,14/09/2022 17:50,6075,1560,6043


In [4]:
# Every column after the first holds numbers written with a decimal comma;
# swap the comma for a dot so the text can later be parsed as float.
for column_name in df.columns[1:]:
    as_text = df[column_name]
    df[column_name] = as_text.str.replace(",", ".", regex=False)


In [5]:
# Convert every column after the first to float in a single astype call.
df = df.astype({column_name: float for column_name in df.columns[1:]})

In [6]:
# Parse 'DateTime' (day-first; unparseable values become NaT), use it as the
# index and sort chronologically so that time-based resampling works.
df = (
    df.assign(DateTime=pd.to_datetime(df['DateTime'], dayfirst=True, errors='coerce'))
      .set_index('DateTime')
      .sort_index()
)

# Hourly aggregates (sum and mean) from one shared resampler, then daily means
# computed from the hourly means.
hourly_bins = df.resample('1h')
data_hourly = hourly_bins.sum()
data_hourly_mean = hourly_bins.mean()
data_daily_mean = data_hourly_mean.resample('1D').mean()


In [7]:
!pip install optuna



In [8]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [9]:
!pip install tensorflow

Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 (from tensorflow)
  Downloading protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m319.9/319.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 6.33.0
    Uninstalling protobuf-6.33.0:
      Successfully uninstalled protobuf-6.33.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
google-cloud-translate 3.12.1

In [10]:
!pip install prophet lightgbm optuna tensorflow -q


In [11]:
# ================================================================
# üìò Robust Hybrid Feature Engineering Pipeline
# Prophet (Daily) + LSTM (Hourly) + Symbolic & Programmatic Features
# ================================================================

import numpy as np
import pandas as pd
from datetime import datetime

# ================================================================
# 1️⃣ LOAD AND PREPARE DATA
# ================================================================
# Assumes the earlier cells have already produced:
#  🔹 df → original 10-min resolution dataframe (indexed by DateTime)
#  🔹 data_hourly_mean → hourly mean dataframe (aggregated from df)
# Example: data_hourly_mean = df.resample('1h').mean()

# Work on an independent copy of the hourly means and name its index "DateTime".
df_hourly = data_hourly_mean.copy().rename_axis("DateTime")

print(f"Raw hourly data shape: {df_hourly.shape}")
print(df_hourly.head())

# ================================================================
# 2️⃣ FEATURE ENGINEERING UTILITIES
# ================================================================

def add_time_features(df):
    """Return a copy of `df` extended with calendar and cyclical hour columns.

    Reads `.hour`, `.dayofweek` and `.month` from `df.index`, so the index must
    be datetime-like. Added columns, in order: hour, dayofweek, month,
    is_weekend (1 when dayofweek is 5 or 6), hour_sin, hour_cos.
    The input frame is not modified.
    """
    stamps = df.index
    return df.assign(
        hour=stamps.hour,
        dayofweek=stamps.dayofweek,
        month=stamps.month,
        # The derived columns are computed from the columns assigned just above.
        is_weekend=lambda frame: frame["dayofweek"].isin([5, 6]).astype(int),
        hour_sin=lambda frame: np.sin(2 * np.pi * frame["hour"] / 24),
        hour_cos=lambda frame: np.cos(2 * np.pi * frame["hour"] / 24),
    )

def add_lag_and_rolling(df, zones, lags=[1,3,6,12,24], rolls=[3,6,12,24]):
    """Return a copy of `df` with lagged values and rolling statistics per zone.

    For every name in `zones` this adds `<zone>_lag_<l>` (values shifted by `l`
    rows) for each `l` in `lags`, and `<zone>_roll_mean_<w>` /
    `<zone>_roll_std_<w>` for each window `w` in `rolls`. Rolling windows use
    `min_periods=1`; undefined standard deviations are replaced by 0.
    The input frame is not modified.
    """
    out = df.copy()
    for zone in zones:
        values = out[zone]
        for lag in lags:
            out[f"{zone}_lag_{lag}"] = values.shift(lag)
        for window in rolls:
            # One rolling object serves both statistics of this window size.
            rolling_view = values.rolling(window=window, min_periods=1)
            out[f"{zone}_roll_mean_{window}"] = rolling_view.mean()
            out[f"{zone}_roll_std_{window}"] = rolling_view.std().fillna(0)
    return out

def add_derivatives(df, zones):
    """Return a copy of `df` with difference and percentage-change columns per zone.

    For every name in `zones` this adds:
      * `<zone>_diff_1`: difference to the previous row,
      * `<zone>_diff_2`: difference to the row two steps back,
      * `<zone>_pct_change_1`: relative change to the previous row, with
        infinite and undefined results replaced by 0.

    Missing values are forward-filled explicitly before the percentage change
    is computed and `fill_method=None` is passed to `pct_change`. This yields
    the same numbers as the implicit pad-filling that `pct_change` used to
    apply by default, without depending on that deprecated default.
    The input frame is not modified.
    """
    out = df.copy()
    for z in zones:
        out[f"{z}_diff_1"] = out[z].diff(1)
        out[f"{z}_diff_2"] = out[z].diff(2)
        # Explicit forward fill + fill_method=None == the former implicit 'pad'.
        pct = out[z].ffill().pct_change(1, fill_method=None)
        # Division by a zero predecessor gives +/-inf; treat it like "no change info".
        out[f"{z}_pct_change_1"] = pct.replace([np.inf, -np.inf], np.nan).fillna(0)
    return out

def add_fourier_terms(df, period_hours=24, K=3):
    """Return a copy of `df` with `K` pairs of sine/cosine seasonal columns.

    The phase is driven by the row position (0, 1, 2, ...), not by the index
    values. For harmonic k in 1..K the columns `fourier_sin_<k>` and
    `fourier_cos_<k>` have period `period_hours / k` rows.
    The input frame is not modified.
    """
    out = df.copy()
    t = np.arange(len(out))
    harmonic = 1
    while harmonic <= K:
        out[f"fourier_sin_{harmonic}"] = np.sin(2*np.pi*harmonic*t/period_hours)
        out[f"fourier_cos_{harmonic}"] = np.cos(2*np.pi*harmonic*t/period_hours)
        harmonic += 1
    return out

def add_symbolic_like_features(df, zones):
    """Return a copy of `df` with three nonlinear feature combinations per zone.

    Requires the columns `<zone>_lag_1`, `<zone>_lag_3`, `<zone>_lag_24`,
    `<zone>_roll_mean_6`, `<zone>_diff_1` and `<zone>_diff_2` to exist already.
    Added per zone:
      * `<zone>_sym_sinlag3_logroll6`: sin(lag_3) * log1p(roll_mean_6), NaN read as 0,
      * `<zone>_sym_lag1_over_lag24`: lag_1 / lag_24, NaN where lag_24 is 0,
      * `<zone>_sym_prod_diff1_diff2`: diff_1 * diff_2, NaN read as 0.
    The input frame is not modified.
    """
    out = df.copy()
    for zone in zones:
        lag3 = out[f"{zone}_lag_3"].fillna(0)
        roll6 = out[f"{zone}_roll_mean_6"].fillna(0)
        out[f"{zone}_sym_sinlag3_logroll6"] = np.sin(lag3) * np.log1p(roll6)

        # A zero denominator is turned into NaN so the ratio becomes NaN, not inf.
        denominator = out[f"{zone}_lag_24"].replace(0, np.nan)
        out[f"{zone}_sym_lag1_over_lag24"] = out[f"{zone}_lag_1"] / denominator

        first_diff = out[f"{zone}_diff_1"].fillna(0)
        second_diff = out[f"{zone}_diff_2"].fillna(0)
        out[f"{zone}_sym_prod_diff1_diff2"] = first_diff * second_diff
    return out

# ================================================================
# 3️⃣ APPLY PROGRAMMATIC + SYMBOLIC FEATURE ENGINEERING
# ================================================================
# Zone columns are the ones whose name starts with "zone".
zones = [name for name in df_hourly.columns if name.startswith("zone")]

# Run the feature builders in sequence; the symbolic features come last because
# they read the lag, rolling and difference columns created before them.
df = (
    df_hourly.copy()
    .pipe(add_time_features)
    .pipe(add_lag_and_rolling, zones)
    .pipe(add_derivatives, zones)
    .pipe(add_fourier_terms, period_hours=24, K=2)
    .pipe(add_symbolic_like_features, zones)
)

print(f"‚úÖ After feature engineering: {df.shape[1]} columns")

# ================================================================
# 4️⃣ PROPHET-DERIVED DAILY FEATURES (TREND + WEEKLY + YEARLY)
# ================================================================
# Prefer the `prophet` package and fall back to the legacy `fbprophet` name.
# Catching `Exception` (instead of a bare `except`) keeps the best-effort
# import without also swallowing KeyboardInterrupt / SystemExit.
try:
    from prophet import Prophet
    prophet_available = True
except Exception:
    try:
        from fbprophet import Prophet
        prophet_available = True
    except Exception:
        prophet_available = False

if prophet_available:
    print("üß≠ Prophet detected ‚Äî extracting daily components...")
    # Daily target: row-wise sum over all df_hourly columns, averaged per day.
    daily = df_hourly.sum(axis=1).resample("D").mean().reset_index()
    daily.columns = ["ds", "y"]

    m = Prophet(daily_seasonality=False, weekly_seasonality=True, yearly_seasonality=True)
    m.fit(daily)
    # periods=0: predict only on the dates the model was fitted on.
    forecast = m.predict(m.make_future_dataframe(periods=0, freq="D"))
    comp = forecast[["ds", "trend", "weekly", "yearly", "yhat"]].set_index("ds")
    comp["residual"] = daily.set_index("ds")["y"] - comp["yhat"]

    # Upsample to hourly and align with df (each hour inherits its day's values).
    # "h" is the lowercase hourly alias already used by the resampling cell.
    comp_hourly = comp.reindex(pd.date_range(comp.index.min(), comp.index.max(), freq="h")).ffill()
    comp_hourly = comp_hourly.reindex(df.index, method="ffill")
    for col in comp_hourly.columns:
        df[f"prophet_{col}"] = comp_hourly[col].values
else:
    print("‚öôÔ∏è Prophet not available ‚Äî using STL decomposition fallback.")
    from statsmodels.tsa.seasonal import STL
    daily = df_hourly.sum(axis=1).resample("D").mean()
    # period=7: weekly seasonality on the daily series.
    stl = STL(daily.interpolate(), period=7)
    res = stl.fit()
    comp = pd.DataFrame({
        "trend": res.trend,
        "seasonal": res.seasonal,
        "resid": res.resid
    })
    comp_hourly = comp.reindex(pd.date_range(comp.index.min(), comp.index.max(), freq="h")).ffill()
    comp_hourly = comp_hourly.reindex(df.index, method="ffill")
    # Stored under the prophet_* names so downstream columns are the same in both branches.
    df["prophet_trend"] = comp_hourly["trend"].values
    df["prophet_weekly"] = comp_hourly["seasonal"].values
    df["prophet_residual"] = comp_hourly["resid"].values

# ================================================================
# 5️⃣ LSTM-DERIVED TEMPORAL EMBEDDINGS (OPTIONAL)
# ================================================================
# TensorFlow is optional. Catching `Exception` (instead of a bare `except`)
# keeps the best-effort import - including non-ImportError failures raised
# while TensorFlow initialises - without swallowing KeyboardInterrupt / SystemExit.
try:
    import tensorflow as tf
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import LSTM, Dense, Input
    from tensorflow.keras.callbacks import EarlyStopping
    tf_available = True
except Exception:
    tf_available = False

if tf_available:
    print("üî∂ TensorFlow available ‚Äî training LSTM encoder...")
    # Inputs: every column not starting with "zone", plus each zone's 1-step lag.
    feature_cols = [c for c in df.columns if not c.startswith("zone")] + [f"{z}_lag_1" for z in zones]
    feature_cols = [c for c in feature_cols if c in df.columns]
    # .ffill() replaces the deprecated fillna(method="ffill"); leading gaps become 0.
    df_train = df[feature_cols].ffill().fillna(0)
    seq_len = 24  # one-day lookback window

    # Window i covers rows i .. i+seq_len-1; its target is the zone total at row i+seq_len.
    X, y = [], []
    total = df_hourly.sum(axis=1)
    for i in range(len(df_train)-seq_len):
        X.append(df_train.iloc[i:i+seq_len].values)
        y.append(total.iloc[i+seq_len])
    X, y = np.array(X), np.array(y)

    if len(X) > 0:
        inp = Input(shape=(X.shape[1], X.shape[2]))
        lstm_layer = LSTM(32, return_sequences=False, name="encoder_lstm")(inp)
        out = Dense(1, activation="linear")(lstm_layer)
        model = Model(inputs=inp, outputs=out)
        model.compile(optimizer="adam", loss="mse")
        es = EarlyStopping(monitor="loss", patience=5, restore_best_weights=True)
        model.fit(X, y, epochs=30, batch_size=32, callbacks=[es], verbose=0)

        # Reuse the trained LSTM layer as an encoder: its 32-dim output is the embedding.
        encoder = Model(inputs=inp, outputs=model.get_layer("encoder_lstm").output)
        embeddings = encoder.predict(X, verbose=0)
        # Embedding i belongs to the row right after its window, i.e. row seq_len + i;
        # the first seq_len rows have no embedding and stay NaN.
        emb_df = pd.DataFrame(embeddings, index=df.index[seq_len:seq_len+len(embeddings)])
        for i_col in range(emb_df.shape[1]):
            df[f"lstm_emb_{i_col}"] = np.nan
            df.loc[emb_df.index, f"lstm_emb_{i_col}"] = emb_df.iloc[:, i_col].values
    else:
        print("Not enough samples for LSTM embedding.")
else:
    print("‚ùå TensorFlow not available ‚Äî skipping LSTM embedding features.")

# ================================================================
# 6️⃣ SAVE & DISPLAY FINAL FEATURE DATASET
# ================================================================
# Report the size of the engineered frame and preview its first rows.
print(f"\n‚úÖ Final engineered DataFrame shape: {df.shape}")
print(f"‚úÖ Total columns: {len(df.columns)}")
print(df.head())

# Write the engineered frame (index included) to a CSV in the working directory.
df.to_csv("final_engineered_df.csv")
print("üíæ Saved as final_engineered_df.csv")


Raw hourly data shape: (14816, 3)
                         zone1      zone2      zone3
DateTime                                            
2022-09-14 17:00:00  60.082000  15.192000  60.328000
2022-09-14 18:00:00  64.758333  16.280000  58.718333
2022-09-14 19:00:00  66.251667  17.761667  54.316667
2022-09-14 20:00:00  79.946667  24.691667  64.728333
2022-09-14 21:00:00  86.553333  25.910000  70.788333
‚úÖ After feature engineering: 70 columns


  df[f"{z}_pct_change_1"] = df[z].pct_change(1).replace([np.inf,-np.inf], np.nan).fillna(0)
  df[f"{z}_pct_change_1"] = df[z].pct_change(1).replace([np.inf,-np.inf], np.nan).fillna(0)
  df[f"{z}_pct_change_1"] = df[z].pct_change(1).replace([np.inf,-np.inf], np.nan).fillna(0)


üß≠ Prophet detected ‚Äî extracting daily components...


18:00:55 - cmdstanpy - INFO - Chain [1] start processing
18:00:55 - cmdstanpy - INFO - Chain [1] done processing
  comp_hourly = comp.reindex(pd.date_range(comp.index.min(), comp.index.max(), freq="H")).ffill()
2026-01-11 18:00:57.052400: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768154457.226625      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768154457.276763      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


üî∂ TensorFlow available ‚Äî training LSTM encoder...


  df_train = df[feature_cols].fillna(method="ffill").fillna(0)
I0000 00:00:1768154472.473290      47 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
I0000 00:00:1768154475.567676     175 cuda_dnn.cc:529] Loaded cuDNN version 90300



‚úÖ Final engineered DataFrame shape: (14816, 107)
‚úÖ Total columns: 107
                         zone1      zone2      zone3  hour  dayofweek  month  \
DateTime                                                                       
2022-09-14 17:00:00  60.082000  15.192000  60.328000    17          2      9   
2022-09-14 18:00:00  64.758333  16.280000  58.718333    18          2      9   
2022-09-14 19:00:00  66.251667  17.761667  54.316667    19          2      9   
2022-09-14 20:00:00  79.946667  24.691667  64.728333    20          2      9   
2022-09-14 21:00:00  86.553333  25.910000  70.788333    21          2      9   

                     is_weekend  hour_sin      hour_cos  zone1_lag_1  ...  \
DateTime                                                              ...   
2022-09-14 17:00:00           0 -0.965926 -2.588190e-01          NaN  ...   
2022-09-14 18:00:00           0 -1.000000 -1.836970e-16    60.082000  ...   
2022-09-14 19:00:00           0 -0.965926  2.588190e-01 

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


üíæ Saved as final_engineered_df.csv


In [None]:
# Robust Hybrid: Prophet + LSTM Weighted Ensemble (fixed sensitivity + improvements)
import os, random, time, warnings
import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from scipy.optimize import minimize_scalar
import shap
from scipy.stats import entropy

# -------------------- Reproducibility --------------------
# NOTE(review): this is set after the interpreter has started - confirm it has
# the intended effect on hash randomisation for the current process.
os.environ['PYTHONHASHSEED'] = '42'
# Seed the Python, NumPy and TensorFlow random generators with the same value.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)
# Only let TensorFlow log messages at ERROR level.
tf.get_logger().setLevel('ERROR')
# If your TF supports mixed precision and you want it, keep it. Otherwise comment out.
# tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Suppresses every Python warning from here on, including deprecation notices.
warnings.filterwarnings("ignore")

# -------------------- Data (user should provide `data_hourly_mean`) --------------------
# df = data_hourly_mean.copy()  # <- expected provided by user
# Example: ensure df index is datetime and columns are zones
# df.index = pd.to_datetime(df.index)
# for col in df.columns: df[col] = df[col].interpolate().bfill().ffill().fillna(df[col].mean())

# -------------------- Parameters --------------------
# Fraction of each series (in time order) used for training; the rest is validation.
train_ratio = 0.8
# Number of lagged values passed to create_lag_features.
n_lags = 48
# Maximum number of epochs and the mini-batch size for LSTM training.
epochs = 100
batch_size = 32
# Early-stopping patience (epochs without val_loss improvement).
patience = 15
alpha_cpi = 0.05  # 95% conformal interval
tscv_splits = 3   # (unused in this script, kept for later CV extension)
target_scale_for_lstm = False  # If True: scale target y_train with StandardScaler (can help numeric stability)

# -------------------- Helpers --------------------
def train_prophet(series, train_idx):
    """
    Fit Prophet on the daily-aggregated training portion of `series` and return
    hourly-aligned predictions for the whole series range.

    Parameters
    ----------
    series : pandas.Series
        Values indexed by a DatetimeIndex (it is resampled by day).
    train_idx : int
        Positional end of the training portion; only `series.iloc[:train_idx]`
        is used for fitting.

    Returns
    -------
    pandas.Series
        Prophet's daily `yhat`, linearly interpolated and reindexed to exactly
        `series.index`.
    """
    train_series = series.iloc[:train_idx]
    daily = train_series.resample('D').mean().reset_index()
    daily.columns = ['ds', 'y']
    # Fill gaps forward, then backward; the mean only matters if something is still missing.
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
    daily['y'] = daily['y'].ffill().bfill().fillna(daily['y'].mean())

    m = Prophet(
        daily_seasonality=True,
        weekly_seasonality=True,
        yearly_seasonality=False,
        seasonality_mode='multiplicative',
        changepoint_prior_scale=0.9
    )
    m.fit(daily)

    # Predict from the first training day to the last day of the full series,
    # so the result covers both the training and the validation period.
    start = daily['ds'].min()
    end = series.index.max().normalize()  # include until max index date
    fut = pd.DataFrame({'ds': pd.date_range(start, end, freq='D')})
    forecast = m.predict(fut)
    # use yhat, then resample to hourly and interpolate to match series index
    # ('h' is the non-deprecated lowercase hourly alias)
    hourly = forecast.set_index('ds')['yhat'].resample('h').interpolate()
    # reindex to exact series index with interpolation
    hourly = hourly.reindex(pd.DatetimeIndex(series.index.union(hourly.index))).interpolate().reindex(series.index)
    return hourly

def create_lag_features(series, n_lags):
    """Build the feature frame used to predict `series` at each timestamp.

    Every feature at time t is computed from information available before t,
    so the frame can be paired with `series` itself as the target without
    leaking it. The rolling statistics are therefore taken over the series
    shifted by one step (window ends at t-1) instead of a window that includes
    the value being predicted.

    Parameters
    ----------
    series : pandas.Series
        Values indexed by a DatetimeIndex.
    n_lags : int
        Number of lag columns (`lag_1` .. `lag_<n_lags>`).

    Returns
    -------
    pandas.DataFrame
        Indexed like `series`, with columns lag_1..lag_n, hour_sin, hour_cos,
        dow, month, is_weekend, trend_24h, roll_mean_12, roll_std_12.
        The first rows contain NaN where no history exists.
    """
    feat = pd.DataFrame(index=series.index)
    for lag in range(1, n_lags + 1):
        feat[f'lag_{lag}'] = series.shift(lag)
    # cyclical hour encoding
    feat['hour_sin'] = np.sin(2*np.pi*series.index.hour/24)
    feat['hour_cos'] = np.cos(2*np.pi*series.index.hour/24)
    feat['dow'] = series.index.dayofweek
    feat['month'] = series.index.month
    feat['is_weekend'] = (series.index.weekday >= 5).astype(int)
    # Rolling statistics over past values only: shifting first excludes the
    # current observation, which is the prediction target.
    history = series.shift(1)
    feat['trend_24h'] = history.rolling(24, min_periods=1).mean()
    feat['roll_mean_12'] = history.rolling(12, min_periods=1).mean()
    feat['roll_std_12'] = history.rolling(12, min_periods=1).std().fillna(0)
    return feat

def build_lstm(input_shape):
    """Build and compile the stacked-LSTM regressor.

    Architecture: LSTM(256, returning sequences) -> Dropout(0.2) -> LSTM(128)
    -> Dense(64, relu) -> Dropout(0.1) -> Dense(32, relu) -> Dense(1, float32).
    Compiled with Adam (learning rate 3e-4) and mean-squared-error loss.

    `input_shape` is forwarded unchanged to the first LSTM layer.
    """
    network = Sequential()
    network.add(LSTM(256, return_sequences=True, input_shape=input_shape))
    network.add(Dropout(0.2))
    network.add(LSTM(128))
    network.add(Dense(64, activation='relu'))
    network.add(Dropout(0.1))
    network.add(Dense(32, activation='relu'))
    network.add(Dense(1, dtype='float32'))

    optimizer = tf.keras.optimizers.Adam(3e-4)
    network.compile(optimizer=optimizer, loss='mse')
    return network

def make_predict_fn(model):
    """Wrap `model` so it can be called on a 2-D feature matrix.

    The returned function converts its argument to a float32 array, promotes a
    single 1-D sample to one row, reshapes (n_samples, n_features) to
    (n_samples, n_features, 1), calls `model.predict(..., verbose=0)` and
    returns the flattened predictions.
    """
    def predict_fn(X_2d):
        batch = np.array(X_2d, dtype=np.float32)
        if batch.ndim == 1:
            # A single sample: make it a one-row matrix.
            batch = batch.reshape(1, -1)
        n_samples, n_features = batch.shape[0], batch.shape[1]
        cube = batch.reshape((n_samples, n_features, 1))
        return model.predict(cube, verbose=0).flatten()
    return predict_fn

def compute_residual_entropy(residuals, bins=50):
    """Return the entropy of the histogram of `residuals`.

    The residuals are binned into `bins` density-normalised bins; 1e-12 is
    added to every bin before the values are passed to `entropy`, so empty
    bins do not produce log(0).
    """
    density = np.histogram(residuals, bins=bins, density=True)[0]
    return entropy(density + 1e-12)  # avoid log(0)

def compute_cpi(y_true, y_pred, alpha=0.05):
    """Build a symmetric interval around `y_pred` from absolute residuals.

    The half-width `q` is the (1 - alpha) quantile of |y_true - y_pred|.

    Returns
    -------
    tuple
        (coverage, width, q): the share of `y_true` values inside
        [y_pred - q, y_pred + q], the mean interval width, and `q` itself.
    """
    q = np.quantile(np.abs(y_true - y_pred), 1 - alpha)
    lower, upper = y_pred - q, y_pred + q
    inside = (y_true >= lower) & (y_true <= upper)
    return np.mean(inside), np.mean(upper - lower), q

# -------------------- Main Loop --------------------
def run_hybrid(df):
    """
    Train and evaluate a Prophet + LSTM weighted ensemble for every column of `df`.

    For each column ("zone") the function:
      1. fits Prophet on the training part and predicts the whole range,
      2. builds lag/calendar features, scales them and trains an LSTM with
         early stopping on the validation part,
      3. finds the blend weight `w` in [0, 1] minimising validation RMSE of
         `w * prophet + (1 - w) * lstm`,
      4. computes an interval from validation residuals, a feature-importance
         series (SHAP, else permutation importance, else correlation), the
         residual entropy and a perturbation sensitivity score,
      5. prints and stores RMSE / R2 for Prophet, LSTM and the blend.

    Reads the module-level settings `train_ratio`, `n_lags`, `epochs`,
    `batch_size`, `patience`, `alpha_cpi` and `target_scale_for_lstm`.

    Note: the blend weight and the interval are fitted and scored on the same
    validation split, so the reported blend metrics are optimistic.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per zone, indexed by a DatetimeIndex.

    Returns
    -------
    pandas.DataFrame
        One row per zone with the keys: zone, w, rmse_prophet, r2_prophet,
        rmse_lstm, r2_lstm, rmse_final, r2_final, cpi_cov, cpi_width, cpi_q,
        res_entropy, sensitivity, shap_series.
    """
    zones = df.columns.tolist()
    print("Zones:", zones, " NaNs after cleaning:", df.isna().sum().sum())
    results = []
    start_time = time.time()
    shap_sample_size = 50  # validation rows explained by SHAP (bounds the runtime)

    for zone in zones:
        print(f"\n--- Zone: {zone} ---")
        series = df[zone].astype(float)
        n = len(series)
        split_idx = int(n * train_ratio)

        # Prophet: train on train portion and predict for full period
        prophet_pred_series = train_prophet(series, split_idx)

        # LSTM features
        feat = create_lag_features(series, n_lags)
        feat['y_true'] = series
        supervised = feat.dropna()
        # ensure we still have enough samples
        if supervised.shape[0] < 100:
            print("Warning: too few supervised samples after lagging for zone", zone)
        train_mask = supervised.index < series.index[split_idx]
        train_df, val_df = supervised.loc[train_mask], supervised.loc[~train_mask]
        feature_names = train_df.drop(columns=['y_true']).columns

        X_train = train_df.drop(columns=['y_true']).values.astype(np.float32)
        y_train = train_df['y_true'].values.astype(np.float32)
        X_val = val_df.drop(columns=['y_true']).values.astype(np.float32)
        y_val = val_df['y_true'].values.astype(np.float32)

        # scale features (statistics come from the training part only)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        # optional: scale target for better numeric stability in LSTM
        if target_scale_for_lstm:
            y_scaler = StandardScaler().fit(y_train.reshape(-1,1))
            y_train_scaled = y_scaler.transform(y_train.reshape(-1,1)).reshape(-1)
            # The early-stopping target must live on the same scale as the
            # training target, otherwise val_loss is not comparable.
            y_val_scaled = y_scaler.transform(y_val.reshape(-1,1)).reshape(-1)
        else:
            y_train_scaled = y_train.copy()
            y_val_scaled = y_val

        # Each feature becomes one "time step" with a single channel.
        X_train_3d = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
        X_val_3d = X_val_scaled.reshape((X_val_scaled.shape[0], X_val_scaled.shape[1], 1))

        # LSTM model training
        lstm_model = build_lstm((X_train_3d.shape[1], 1))
        es = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True, verbose=0)
        lstm_model.fit(X_train_3d, y_train_scaled, validation_data=(X_val_3d, y_val_scaled),
                       epochs=epochs, batch_size=batch_size, verbose=0, callbacks=[es])

        # Predictions from LSTM
        lstm_val_pred = lstm_model.predict(X_val_3d, verbose=0).flatten()
        if target_scale_for_lstm:
            # inverse transform
            lstm_val_pred = y_scaler.inverse_transform(lstm_val_pred.reshape(-1,1)).reshape(-1)

        # Prophet val predictions aligned to val_df index
        prophet_val_pred = prophet_pred_series.loc[val_df.index].values

        # Optimize weight w on validation set
        def obj_w(w):
            blended = w * prophet_val_pred + (1.0 - w) * lstm_val_pred
            return np.sqrt(mean_squared_error(y_val, blended))
        res = minimize_scalar(obj_w, bounds=(0.0, 1.0), method='bounded')
        w_opt = float(res.x) if res.success else 0.5
        hybrid_val_pred = w_opt * prophet_val_pred + (1.0 - w_opt) * lstm_val_pred

        # Conformal Prediction Interval (CPI)
        cpi_cov, cpi_width, cpi_q = compute_cpi(y_val, hybrid_val_pred, alpha=alpha_cpi)

        # SHAP feature importance (try real SHAP, fallback to permutation importance)
        shap_series = None
        try:
            predict_fn = make_predict_fn(lstm_model)
            bg_idx = np.random.choice(X_train_scaled.shape[0], min(100, X_train_scaled.shape[0]), replace=False)
            X_bg = X_train_scaled[bg_idx]
            explainer = shap.Explainer(predict_fn, X_bg)
            # limit to first N val rows to speed up
            X_val_sample = X_val_scaled[:shap_sample_size]
            # The permutation explainer needs at least 2 * n_features + 1 evaluations.
            min_evals = 2 * X_val_scaled.shape[1] + 1
            shap_vals = explainer(X_val_sample, max_evals=max(100, min_evals))
            shap_series = pd.Series(np.mean(np.abs(shap_vals.values), axis=0),
                                    index=feature_names).sort_values(ascending=False)
        except Exception as shap_error:
            # Report why SHAP was skipped instead of failing silently.
            print(f"SHAP failed ({type(shap_error).__name__}: {shap_error}); using permutation importance.")
            # fallback: permutation importance (RMSE increase when one feature is shuffled)
            try:
                perm_imp = {}
                base_rmse = np.sqrt(mean_squared_error(y_val, lstm_val_pred))
                for i, col in enumerate(feature_names):
                    X_val_perm = X_val_scaled.copy()
                    np.random.shuffle(X_val_perm[:, i])
                    pred_perm = lstm_model.predict(X_val_perm.reshape((X_val_perm.shape[0], X_val_perm.shape[1], 1)), verbose=0).flatten()
                    if target_scale_for_lstm:
                        pred_perm = y_scaler.inverse_transform(pred_perm.reshape(-1,1)).reshape(-1)
                    rmse_perm = np.sqrt(mean_squared_error(y_val, pred_perm))
                    perm_imp[col] = rmse_perm - base_rmse
                shap_series = pd.Series(perm_imp).sort_values(ascending=False)
            except Exception:
                # last resort: absolute correlation of each scaled feature with the target
                shap_series = pd.Series(np.abs(pd.DataFrame(X_train_scaled, columns=feature_names).corrwith(pd.Series(y_train))).sort_values(ascending=False))

        # Residual entropy
        res_entropy = compute_residual_entropy(y_val - hybrid_val_pred)

        # ------------ Fixed Perturbation Sensitivity ------------
        # Proper approach: perturb the validation feature matrix and compute output changes.
        eps = 1e-8
        # generate noise scaled per-feature (noise_scale * feature std); tune noise_scale if sensitivity too large
        per_feature_std = np.std(X_val_scaled, axis=0, ddof=1)
        # If a feature std is zero, use small value to avoid zero noise
        per_feature_std[per_feature_std == 0] = 1e-6
        noise_scale = 0.01  # 1% noise; you can reduce to 0.001 if this is still large
        noise = np.random.normal(0, noise_scale * per_feature_std, X_val_scaled.shape).astype(np.float32)
        # Perturb the actual validation features
        X_val_perturbed = X_val_scaled + noise
        # predict with perturbed inputs
        y_perturbed = lstm_model.predict(X_val_perturbed.reshape((X_val_perturbed.shape[0], X_val_perturbed.shape[1], 1)), verbose=0).flatten()
        if target_scale_for_lstm:
            y_perturbed = y_scaler.inverse_transform(y_perturbed.reshape(-1,1)).reshape(-1)
        # relative change metric, averaged
        sensitivity = np.mean(np.abs(y_perturbed - lstm_val_pred) / (np.abs(lstm_val_pred) + eps))

        # Metrics
        rmse_prophet = np.sqrt(mean_squared_error(y_val, prophet_val_pred))
        r2_prophet = r2_score(y_val, prophet_val_pred)
        rmse_lstm = np.sqrt(mean_squared_error(y_val, lstm_val_pred))
        r2_lstm = r2_score(y_val, lstm_val_pred)
        rmse_final = np.sqrt(mean_squared_error(y_val, hybrid_val_pred))
        r2_final = r2_score(y_val, hybrid_val_pred)

        print(f"w={w_opt:.3f} | Prophet R2={r2_prophet:.3f}, LSTM R2={r2_lstm:.3f} | Hybrid R2={r2_final:.3f}")
        print(f"CPI cov={cpi_cov:.3f}, width={cpi_width:.3f}, q={cpi_q:.4f} | Sensitivity={sensitivity:.4f} | Residual entropy={res_entropy:.4f}")

        results.append({
            'zone': zone,
            'w': w_opt,
            'rmse_prophet': rmse_prophet, 'r2_prophet': r2_prophet,
            'rmse_lstm': rmse_lstm, 'r2_lstm': r2_lstm,
            'rmse_final': rmse_final, 'r2_final': r2_final,
            'cpi_cov': cpi_cov, 'cpi_width': cpi_width,
            'cpi_q': cpi_q,
            'res_entropy': res_entropy,
            'sensitivity': sensitivity,
            'shap_series': shap_series
        })

    res_df = pd.DataFrame(results)
    print("\n=== Summary Across Zones ===")
    display_cols = ['zone','w','rmse_prophet','r2_prophet','rmse_lstm','r2_lstm','rmse_final','r2_final','cpi_cov','cpi_width','res_entropy','sensitivity']
    print(res_df[display_cols].round(4).to_string(index=False))
    print(f"\nTotal runtime: {time.time()-start_time:.1f}s")
    return res_df

# -------------------- Usage --------------------
# Ensure `data_hourly_mean` is loaded and accessible as a DataFrame
# Example: df = data_hourly_mean.copy(); df.index = pd.to_datetime(df.index)
# res_df = run_hybrid(df)