<a href="https://colab.research.google.com/github/sudo-Oliver/Predictive-Analytics-Private/blob/main/notebooks/LSTM%20Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Daten laden und vorbereiten**
1. Laden der Daten in einen Dataframe
2. Zeitspalte umwandeln (Unix-Timestamp -> Datetime)
3. nach homeid gruppieren (jeder Haushalt hat seine eigene Zeitreihe)
4. Sortieren nach Zeit innerhalb des Haushalts

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
import gdown
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm

In [2]:
# Verify GPU availability
print("TensorFlow version:", tf.__version__)
print("Metal plugin available:", tf.config.list_physical_devices('GPU'))

# Configure memory growth
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)
    print("Metal GPU will be used")
else:
    print("Running on CPU")

TensorFlow version: 2.16.2
Metal plugin available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Metal GPU will be used


In [3]:
def clean_data(df):
    """Clean and preprocess sensor data"""
    # Convert Unix timestamp to datetime
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local'], unit='ms')

    # Set timestamp_local as index
    df.set_index('timestamp_local', inplace=True)

    # Sort by homeid and timestamp_local
    df = df.sort_values(by=['homeid', 'timestamp_local'])

    # Remove specified columns
    columns_to_drop = [
        'sensorid', 'median_temperature', '_room',
        'sensorid_room', 'measured_entity',
        'sensorid_electric', 'sensorid_gas'
    ]
    df = df.drop(columns=columns_to_drop)

    return df

In [4]:
def load_processed_data():
    """Load preprocessed sensor data with fallback to Drive download"""
    file_id = "1KHQCVfwTxm5bjjITS8WMm9P3M12ETVsR"

    download_path = Path('data/processed')
    download_path.mkdir(parents=True, exist_ok=True)
    file_path = download_path / 'final_processed_data3.parquet'

    if not file_path.exists():
        print("Downloading from Google Drive...")
        url = f"https://drive.google.com/uc?id={file_id}"
        gdown.download(url, str(file_path), quiet=False)

    if file_path.exists():
        df = pd.read_parquet(file_path)
        print(f"Data loaded successfully: {df.shape} rows")
        return df
    else:
        raise FileNotFoundError("Could not load or download data file")

# Load and clean data
df = load_processed_data()
df_clean = clean_data(df.copy())

df_clean.head()

Data loaded successfully: (1641653, 23) rows


Unnamed: 0_level_0,homeid,electric_min_consumption,electric_max_consumption,std_consumption,electric_median_consumption,electric_total_consumption_Wh,gas_mean_consumption,gas_min_consumption,gas_max_consumption,gas_median_consumption,gas_total_consumption_Wh,median_value,roomid,income_band_mid,education_map
timestamp_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2016-09-20 09:00:00,47,0.069,0.335,0.033905,0.194,0.179807,0.112,0.112,0.112,0.112,0.224,20.72,652.0,0.0,8.0
2016-09-20 10:00:00,47,0.068875,0.458375,0.035875,0.187625,0.17669,0.112,0.112,0.112,0.112,0.21,20.695,652.0,0.0,8.0
2016-09-20 11:00:00,47,0.06875,0.58175,0.037846,0.18125,0.173574,0.112,0.112,0.112,0.112,0.196,20.67,652.0,0.0,8.0
2016-09-20 12:00:00,47,0.068625,0.705125,0.039817,0.174875,0.170457,0.112,0.112,0.112,0.112,0.182,20.645,652.0,0.0,8.0
2016-09-20 13:00:00,47,0.0685,0.8285,0.041788,0.1685,0.16734,0.112,0.112,0.112,0.112,0.168,20.62,652.0,0.0,8.0


**2. Feature Engineering & Datenbereinigung**
1. Zyklische Transformation für Zeitdaten (hour_sin, hour_cos für Stunden)
2. Lag-Features erstellen (für vorherige Strom und Gaswerte)
3. Rolling-Average-Features (z.B gleitender Mittelwert über 3 oder 7 Zeitschritte)
4. Daten normalisieren (Min-Max-Scaling für LSTM)

In [5]:
# Vollständige Korrelation mit allen spalten berechnen
correlation_matrix_all = df_clean.corr()

# Korrelation der Features mit den Zielvariablen (Strom und Gasverbtauch)
correlation_target_all = correlation_matrix_all[['electric_total_consumption_Wh', 'gas_total_consumption_Wh']]

# Sortieren nach Stärke der Korrelation
correlation_target_all_sorted = correlation_target_all.abs().sort_values(by=['electric_total_consumption_Wh', 'gas_total_consumption_Wh'], ascending=False)

# Korrelationsergebnisse anzeigen
print("Full Feature Correlation:")
display(correlation_target_all_sorted)

Full Feature Correlation:


Unnamed: 0,electric_total_consumption_Wh,gas_total_consumption_Wh
electric_total_consumption_Wh,1.0,0.032214
electric_median_consumption,0.803903,0.025043
std_consumption,0.775032,0.024055
electric_max_consumption,0.693017,0.042803
electric_min_consumption,0.516674,0.039677
income_band_mid,0.154421,0.034544
median_value,0.066262,0.003073
education_map,0.054028,0.012451
gas_total_consumption_Wh,0.032214,1.0
gas_max_consumption,0.028237,0.999275


In [6]:
# Extract hour from timestamp index
df_clean['hour'] = df_clean.index.hour

# Create cyclical features
df_clean['hour_sin'] = np.sin(2 * np.pi * df_clean['hour']/24)
df_clean['hour_cos'] = np.cos(2 * np.pi * df_clean['hour']/24)

# Create lag features for electric consumption (t-1, t-2, t-3)
for lag in range(1, 4):
    df_clean[f'electric_lag_{lag}'] = df_clean.groupby('homeid')['electric_total_consumption_Wh'].shift(lag)
    # Create lag features for gas consumption (t-1, t-2, t-3)
    df_clean[f'gas_lag_{lag}'] = df_clean.groupby('homeid')['gas_total_consumption_Wh'].shift(lag)

# Create rolling means for electric consumption (3 and 7 time steps)
df_clean['electric_rolling_mean_3h'] = df_clean.groupby('homeid')['electric_total_consumption_Wh'].rolling(window=3).mean().reset_index(0, drop=True)
df_clean['electric_rolling_mean_7h'] = df_clean.groupby('homeid')['electric_total_consumption_Wh'].rolling(window=7).mean().reset_index(0, drop=True)

# Create rolling means for gas consumption (3 and 7 time steps)
df_clean['gas_rolling_mean_3h'] = df_clean.groupby('homeid')['gas_total_consumption_Wh'].rolling(window=3).mean().reset_index(0, drop=True)
df_clean['gas_rolling_mean_7h'] = df_clean.groupby('homeid')['gas_total_consumption_Wh'].rolling(window=7).mean().reset_index(0, drop=True)

# Replace deprecated fillna methods with new syntax
df_clean = df_clean.ffill()  # Forward fill
df_clean = df_clean.bfill()  # Backward fill

# Define features to scale
scaled_features = ['electric_total_consumption_Wh', 'gas_total_consumption_Wh', 'electric_median_consumption', 'electric_max_consumption', 'electric_min_consumption', 'std_consumption', 'gas_max_consumption', 'gas_min_consumption', 'gas_median_consumption', 'median_value', 'hour_sin', 'hour_cos', 'electric_lag_1', 'electric_lag_2', 'electric_lag_3', 'gas_lag_1', 'gas_lag_2', 'gas_lag_3', 'electric_rolling_mean_3h', 'electric_rolling_mean_7h', 'gas_rolling_mean_3h', 'gas_rolling_mean_7h']

# Initialize scaler
scaler = MinMaxScaler()

# Fit and transform the selected features
df_clean[scaled_features] = scaler.fit_transform(df_clean[scaled_features])

#df_clean.to_parquet('lstm_preprocessed_data.parquet')

**3. Trainings und Testdatensätze erstellen**
1. Daten für jeden Haushalt in eine geeignetes Format bringen
2. Train-Test-Split: 80% Training 20% Test
3. Zeitfenster für LSTM definieren (z.B 24 Stunden zurückblicken um die nächste Stunde vorherzusagen)

In [None]:
# Define parameters and columns
time_steps = 90

# Features and target definition (Strom und Gas)
feature_columns = [col for col in df_clean.columns if col not in ['electric_total_consumption_Wh', 'gas_total_consumption_Wh', 'homeid', 'roomid']]
target_column_electric = 'electric_total_consumption_Wh'
target_column_gas = 'gas_total_consumption_Wh'

def create_memmap_array(shape, filename, dtype='float32'):
    """Create memory-mapped array"""
    path = Path('temp_arrays')
    path.mkdir(exist_ok=True)
    return np.memmap(path / filename, dtype=dtype, mode='w+', shape=shape)

def process_data_efficiently(df_clean, target_column, feature_columns, time_steps, prefix):
    """Process data with disk-based storage"""
    total_sequences = len(df_clean) - time_steps
    n_features = len(feature_columns)
    
    # Create memory-mapped arrays
    X = create_memmap_array((total_sequences, time_steps, n_features), f'{prefix}_X.mmap')
    y = create_memmap_array((total_sequences,), f'{prefix}_y.mmap')
    
    # Process in smaller chunks
    chunk_size = 500
    feature_data = df_clean[feature_columns].values
    target_data = df_clean[target_column].values
    
    print(f"Processing {prefix} data...")
    for chunk_start in range(0, total_sequences, chunk_size):
        chunk_end = min(chunk_start + chunk_size, total_sequences)
        
        for i in range(chunk_start, chunk_end):
            X[i] = feature_data[i:i + time_steps]
            y[i] = target_data[i + time_steps]
            
        if chunk_start % (chunk_size * 10) == 0:
            print(f"Progress: {chunk_start/total_sequences*100:.1f}%")
    
    return X, y

# Process both electric and gas data
time_steps = 90
feature_columns = [col for col in df_clean.columns if col not in 
                  ['electric_total_consumption_Wh', 'gas_total_consumption_Wh', 'homeid', 'roomid']]

# Process electric data
X_electric, y_electric = process_data_efficiently(
    df_clean, 
    'electric_total_consumption_Wh',
    feature_columns,
    time_steps,
    'electric'
)

# Process gas data
X_gas, y_gas = process_data_efficiently(
    df_clean,
    'gas_total_consumption_Wh',
    feature_columns,
    time_steps,
    'gas'
)

# Split datasets
X_train_electric, X_test_electric, y_train_electric, y_test_electric = train_test_split(
    X_electric, y_electric, test_size=0.2, shuffle=False
)

X_train_gas, X_test_gas, y_train_gas, y_test_gas = train_test_split(
    X_gas, y_gas, test_size=0.2, shuffle=False
)


# Show shapes of the data
train_test_summary = {
    'X_train_electric': X_train_electric.shape,
    'X_test_electric': X_test_electric.shape,
    'X_train_gas': X_train_gas.shape,
    'X_test_gas': X_test_gas.shape,
}
train_test_summary




Processing electric data...
Progress: 0.0%
Progress: 0.3%
Progress: 0.6%
Progress: 0.9%
Progress: 1.2%
Progress: 1.5%
Progress: 1.8%
Progress: 2.1%
Progress: 2.4%
Progress: 2.7%
Progress: 3.0%
Progress: 3.4%
Progress: 3.7%
Progress: 4.0%
Progress: 4.3%
Progress: 4.6%
Progress: 4.9%
Progress: 5.2%
Progress: 5.5%
Progress: 5.8%
Progress: 6.1%
Progress: 6.4%
Progress: 6.7%
Progress: 7.0%
Progress: 7.3%
Progress: 7.6%
Progress: 7.9%
Progress: 8.2%
Progress: 8.5%
Progress: 8.8%
Progress: 9.1%
Progress: 9.4%
Progress: 9.7%
Progress: 10.1%
Progress: 10.4%
Progress: 10.7%
Progress: 11.0%
Progress: 11.3%
Progress: 11.6%
Progress: 11.9%
Progress: 12.2%
Progress: 12.5%
Progress: 12.8%
Progress: 13.1%
Progress: 13.4%
Progress: 13.7%
Progress: 14.0%
Progress: 14.3%
Progress: 14.6%
Progress: 14.9%
Progress: 15.2%
Progress: 15.5%
Progress: 15.8%
Progress: 16.1%
Progress: 16.4%
Progress: 16.8%
Progress: 17.1%
Progress: 17.4%
Progress: 17.7%
Progress: 18.0%
Progress: 18.3%
Progress: 18.6%
Progress: 18.

**4. LSTM Modell erstellen**
1. Daten in das LSTM Format bringen (X_train, y_train)
2. LSTM schichten definieren (Tensorflow)
3. Modell kompilieren und trainieren
4. Hyperparameter-Tuning (z.B Anzahl Neuronen, Learning Rate,...)

In [None]:
# # Configure device
# def get_device():
#     """Get available device (GPU or CPU)"""
#     if tf.config.list_physical_devices('GPU'):
#         return '/GPU:0'
#     return '/CPU:0'

# # Configure training environment
# def setup_training_env():
#     """Setup TensorFlow training environment"""
#     tf.keras.backend.clear_session()

#     # Configure GPU if available
#     physical_devices = tf.config.list_physical_devices('GPU')
#     if physical_devices:
#         try:
#             for device in physical_devices:
#                 tf.config.experimental.set_memory_growth(device, True)
#         except:
#             print("Memory growth not supported")
#     else:
#         print("Using CPU for training")

# def train_with_batches(X, y, model_name, batch_size=8):
#     """Train LSTM model with device optimization"""
#     device = get_device()

#     with tf.device(device):
#         # Build model with distribution strategy
#         strategy = tf.distribute.MirroredStrategy() if device == '/GPU:0' else tf.distribute.OneDeviceStrategy(device)

#         with strategy.scope():
#             model = tf.keras.Sequential([
#                 tf.keras.layers.LSTM(32, return_sequences=True, input_shape=(90, 24)),
#                 tf.keras.layers.Dropout(0.2),
#                 tf.keras.layers.LSTM(16),
#                 tf.keras.layers.Dense(1)
#             ])
#             model.compile(optimizer='adam', loss='mse')

#         # Create optimized dataset
#         dataset = tf.data.Dataset.from_tensor_slices((X, y))\
#             .batch(batch_size)\
#             .prefetch(tf.data.AUTOTUNE)

#         # Train with device optimization
#         history = model.fit(
#             dataset,
#             epochs=20,
#             verbose=1
#         )

#         return model, history

# # Main training pipeline
# def train_models():
#     setup_training_env()

#     try:
#         print(f"Training on: {get_device()}")

#         print("Training Electric Model...")
#         model_electric, history_electric = train_with_batches(
#             X_train_electric.astype('float32'),
#             y_train_electric.astype('float32'),
#             "electric"
#         )

#         print("\nTraining Gas Model...")
#         model_gas, history_gas = train_with_batches(
#             X_train_gas.astype('float32'),
#             y_train_gas.astype('float32'),
#             "gas"
#         )

#         return model_electric, model_gas

#     except Exception as e:
#         print(f"Training Error: {e}")
#         raise

# # Execute training
# model_electric, model_gas = train_models()

**Test Purposes**

In [None]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [None]:
# # Check and configure MPS device for M2 Mac
# has_mps = torch.backends.mps.is_available()
# if has_mps:
#     device = torch.device("mps")
#     print("Using MPS device")
# else:
#     device = torch.device("cpu")
#     print("MPS device not found, using CPU")

# # Add after MPS device setup and before model initialization
# def prepare_data_for_training(X, y, batch_size=32):
#     """Convert numpy arrays to PyTorch DataLoader"""
#     X_tensor = torch.FloatTensor(X)
#     y_tensor = torch.FloatTensor(y).reshape(-1, 1)
#     dataset = TensorDataset(X_tensor, y_tensor)
#     return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# # Create dataloaders
# train_loader_electric = prepare_data_for_training(X_train_electric, y_train_electric)
# train_loader_gas = prepare_data_for_training(X_train_gas, y_train_gas)


# def train_model(model, train_loader, model_name, num_epochs=6):
#     checkpoint_dir = 'checkpoints'
#     os.makedirs(checkpoint_dir, exist_ok=True)
    
#     criterion = nn.MSELoss()
#     optimizer = optim.Adam(model.parameters(), lr=0.001)

#     # Create epoch progress bar
#     epoch_pbar = tqdm(range(num_epochs), desc=f'Training {model_name}', position=0)
    
#     for epoch in epoch_pbar:
#         model.train()
#         running_loss = 0.0
        
#         # Create batch progress bar
#         batch_pbar = tqdm(train_loader, 
#                          desc=f'Epoch {epoch+1}/{num_epochs}',
#                          leave=False, 
#                          position=1)
        
#         for inputs, targets in batch_pbar:
#             inputs = inputs.to(device)
#             targets = targets.to(device)

#             optimizer.zero_grad()
#             outputs = model(inputs)
#             loss = criterion(outputs, targets)
#             loss.backward()
#             optimizer.step()
            
#             running_loss += loss.item()
#             batch_pbar.set_postfix({'loss': f'{loss.item():.4f}'})

#         epoch_loss = running_loss / len(train_loader)
#         epoch_pbar.set_postfix({'loss': f'{epoch_loss:.4f}'})

#         # Save checkpoint
#         checkpoint_path = os.path.join(checkpoint_dir, f'{model_name}_epoch{epoch+1}.pt')
#         torch.save({
#             'epoch': epoch,
#             'model_state_dict': model.state_dict(),
#             'optimizer_state_dict': optimizer.state_dict(),
#             'loss': epoch_loss,
#         }, checkpoint_path)

#     print(f"\n{model_name} training complete!")
#     return model

# # Initialize models
# model_electric = LSTMModel(
#     input_size=len(feature_columns), 
#     hidden_size=32, 
#     num_layers=2, 
#     output_size=1
# ).to(device)

# model_gas = LSTMModel(
#     input_size=len(feature_columns), 
#     hidden_size=32, 
#     num_layers=2, 
#     output_size=1
# ).to(device)

# # Replace the training calls with:
# print("Training Electric Model...")
# model_electric = train_model(model_electric, train_loader_electric, "electric")

# print("\nTraining Gas Model...")
# model_gas = train_model(model_gas, train_loader_gas, "gas")

**5. Modell evaluieren & Vorhersagen interpretieren**
1. Vorhersagen auf Testdaten durchführen
2. Metriken berechnen (RMSE, MAE, R^2)
3. XAI mit SHAP oder LIME anwenden

In [None]:
import torch
import torch.nn as nn
import numpy as np
import lime
import lime.lime_tabular
import shap
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import os

# Configure device for M2 Mac
has_mps = torch.backends.mps.is_available()
if has_mps:
    device = torch.device("mps")
    print("Using MPS device")
else:
    device = torch.device("cpu")
    print("MPS device not found, using CPU")

class SHAPWrapper(nn.Module):
    """Enhanced LSTM wrapper for SHAP"""
    def __init__(self, model):
        super().__init__()
        self.model = model
    
    def forward(self, x):
        # Handle input shape for LSTM
        if len(x.shape) == 2:
            # Reshape (batch_size, features) to (batch_size, sequence_length, features)
            x = x.view(x.size(0), -1, 24)  # Assuming 24 features per timestep
        return self.model(x).squeeze()

def calculate_metrics(predictions, actuals):
    """Calculate RMSE, MAE, and R2 metrics"""
    rmse = np.sqrt(mean_squared_error(actuals, predictions))
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)
    return rmse, mae, r2

def load_model_weights(model, model_name, epoch):
    checkpoint_path = f'checkpoints/{model_name}_epoch{epoch}.pt'
    print(f'Loading model weights from: {checkpoint_path}')
    model.load_state_dict(torch.load(checkpoint_path, map_location=device)['model_state_dict'])
    model.eval()
    return model

def make_predictions(model, test_loader):
    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for inputs, targets in tqdm(test_loader, desc="Making predictions"):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            predictions.append(outputs.to('cpu').numpy())
            actuals.append(targets.to('cpu').numpy())
    return np.concatenate(predictions), np.concatenate(actuals)

def apply_shap_optimized(model, test_data, max_samples=50):
    """Optimized SHAP implementation with shape handling"""
    print("Moving model to CPU for SHAP analysis...")
    model_cpu = model.to('cpu')
    wrapped_model = SHAPWrapper(model_cpu)
    wrapped_model.eval()
    
    # Prepare data with correct shape
    if len(test_data.shape) == 2:
        test_data = test_data.reshape(-1, 90, 24)  # Reshape to (samples, timesteps, features)
    
    # Sample and convert data
    sample_indices = np.random.choice(len(test_data), min(max_samples, len(test_data)), replace=False)
    test_data_subset = test_data[sample_indices]
    
    # Convert to tensor with correct shape
    test_data_tensor = torch.FloatTensor(test_data_subset)
    background = test_data_tensor[:min(10, len(test_data_subset))]
    
    try:
        print("Computing SHAP values...")
        explainer = shap.GradientExplainer(wrapped_model, background)
        shap_values = explainer.shap_values(test_data_tensor)
        
        # Convert to numpy and reshape if needed
        if isinstance(shap_values, list):
            shap_values = shap_values[0]
        shap_values = np.array(shap_values)
        
        # Reshape for visualization
        final_shape = (len(sample_indices), -1)
        shap_values = shap_values.reshape(final_shape)
        
        return shap_values, test_data_subset.reshape(final_shape)
        
    finally:
        model.to(device)


    try:
        plt.figure(figsize=(12, 8))
        
        # Use bar plot instead of violin plot
        shap.summary_plot(
            shap_values,
            test_data,
            feature_names=feature_names,
            plot_type="bar",
            show=False
        )
        
        plt.title(f'Feature Importance for {model_name} Model')
        plt.tight_layout()
        plt.savefig(f'shap_{model_name.lower()}_importance.png', dpi=300, bbox_inches='tight')
        plt.close()
    except Exception as e:
        print(f"Error plotting SHAP values: {str(e)}")

def apply_lime_batched(model, test_data, feature_names, max_samples=50, batch_size=10):
    """Apply LIME with batching and memory management"""
    
    def predict_fn(x):
        with torch.no_grad():
            tensor_x = torch.tensor(x, dtype=torch.float32).to(device)
            return model(tensor_x).to('cpu').numpy()
    
    # Sample subset of test data
    np.random.seed(42)
    sample_indices = np.random.choice(len(test_data), min(max_samples, len(test_data)), replace=False)
    test_data_subset = test_data[sample_indices]
    
    explainer = lime.lime_tabular.LimeTabularExplainer(
        training_data=np.array(test_data_subset),
        feature_names=feature_names,
        mode='regression',
        verbose=False
    )
    
    lime_explanations = []
    
    # Process in batches
    for i in tqdm(range(0, len(test_data_subset), batch_size), desc="Applying LIME"):
        batch = test_data_subset[i:i + batch_size]
        batch_explanations = []
        
        for sample in batch:
            try:
                exp = explainer.explain_instance(
                    sample, 
                    predict_fn,
                    num_features=len(feature_names)
                )
                batch_explanations.append(exp)
            except Exception as e:
                print(f"Error processing sample {i}: {str(e)}")
                continue
                
        lime_explanations.extend(batch_explanations)
        
        # Clear memory
        if has_mps:
            torch.mps.empty_cache()
    
    return lime_explanations

# Evaluation pipeline
# Update evaluate_model function
def evaluate_model(model, model_name, test_loader, test_data):
    print(f"\nEvaluating {model_name} model...")
    
    with tqdm(total=2, desc=f"{model_name} Analysis") as pbar:
        # Predictions and metrics
        predictions, actuals = make_predictions(model, test_loader)
        rmse, mae, r2 = calculate_metrics(predictions, actuals)
        pbar.update(1)
        
        print(f"Metrics for {model_name}:")
        print(f"RMSE: {rmse:.4f}")
        print(f"MAE: {mae:.4f}")
        print(f"R2: {r2:.4f}")
        
        # LIME analysis with batching
        print("\nGenerating LIME explanations...")
        lime_explanations = apply_lime_batched(
            model, 
            test_data, 
            feature_columns,
            max_samples=50,  # Reduce number of samples
            batch_size=10    # Process in smaller batches
        )
        pbar.update(1)
        
        return predictions, actuals, lime_explanations

# Update data loading with progress bars
print("Preparing data loaders...")
with tqdm(total=2, desc="Loading datasets") as pbar:
    # For Electric data
    test_dataset_electric = torch.utils.data.TensorDataset(
        torch.FloatTensor(X_test_electric).to(device),
        torch.FloatTensor(y_test_electric).to(device)
    )
    test_loader_electric = torch.utils.data.DataLoader(
        test_dataset_electric, 
        batch_size=16, 
        shuffle=False
    )
    pbar.update(1)
    
    # For Gas data
    test_dataset_gas = torch.utils.data.TensorDataset(
        torch.FloatTensor(X_test_gas).to(device),
        torch.FloatTensor(y_test_gas).to(device)
    )
    test_loader_gas = torch.utils.data.DataLoader(
        test_dataset_gas, 
        batch_size=16, 
        shuffle=False
    )
    pbar.update(1)



model_electric = LSTMModel(input_size=len(feature_columns), hidden_size=32, num_layers=2, output_size=1).to(device)
model_gas = LSTMModel(input_size=len(feature_columns), hidden_size=32, num_layers=2, output_size=1).to(device)

# Define models dictionary
models = {
    'Electric': (
        load_model_weights(model_electric, 'electric', epoch=6),
        test_loader_electric,
        X_test_electric
    ),
    'Gas': (
        load_model_weights(model_gas, 'gas', epoch=6),
        test_loader_gas,
        X_test_gas
    )
}

# Run evaluation with progress tracking
for model_name, (model, loader, test_data) in tqdm(models.items(), desc="Evaluating models"):
    predictions, actuals, lime_exps = evaluate_model(model, model_name, loader, test_data)  # Updated unpacking
