In [None]:
# Core Libraries
import pandas as pd
import numpy as np
import os
import joblib
import sys
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn
from sklearn.model_selection import GridSearchCV, GroupKFold # GroupKFold for robust CV if needed later, but main split is fixed
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import train_models

# Models
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb

# --- Notebook Specific Configurations ---
INPUT_FILE = 'New_Features_Added_ALL.csv' # Make sure this file is generated by process_data.py
TARGET_COL = 'SOH_cycle_capacity_%'

# Define fixed numbers for battery splits
N_TEST_BATTERIES = 5
N_VAL_BATTERIES = 2
SPLIT_RANDOM_SEED = 2024

EXTRA_EXCLUDE_COLS_FROM_FEATURES = [
    'capacity_Ah', 'energy_Wh',
    'capacity_Ah_roll_mean_3', 'capacity_Ah_roll_std_3', 'capacity_Ah_diff_3',
    'capacity_Ah_roll_mean_5', 'capacity_Ah_roll_std_5', 'capacity_Ah_diff_5',
    'capacity_Ah_roll_mean_10', 'capacity_Ah_roll_std_10', 'capacity_Ah_diff_10',
    'energy_Wh_roll_mean_3', 'energy_Wh_roll_std_3', 'energy_Wh_diff_3',
    'energy_Wh_roll_mean_5', 'energy_Wh_roll_std_5', 'energy_Wh_diff_5',
    'energy_Wh_roll_mean_10', 'energy_Wh_roll_std_10', 'energy_Wh_diff_10',
    'SOH_cycle_capacity_%_roll_mean_3', 'SOH_cycle_capacity_%_roll_std_3', 'SOH_cycle_capacity_%_diff_3',
    'SOH_cycle_capacity_%_roll_mean_5', 'SOH_cycle_capacity_%_roll_std_5', 'SOH_cycle_capacity_%_diff_5',
    'SOH_cycle_capacity_%_roll_mean_10', 'SOH_cycle_capacity_%_roll_std_10', 'SOH_cycle_capacity_%_diff_10'
]


MODELS_TO_TRAIN = ['rf', 'gb', 'xgb', 'lr'] # Models to tune and train
SAVE_ARTIFACTS = True
EXPERIMENT_NAME = "SOH_Prediction_FixedSplit_Run3" # Give a descriptive name
BASE_OUTPUT_DIR = "soh_prediction_outputs"
EXPERIMENT_OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR, EXPERIMENT_NAME)

os.makedirs(EXPERIMENT_OUTPUT_DIR, exist_ok=True)
print(f"Experiment output will be saved to: {EXPERIMENT_OUTPUT_DIR}")

# Define battery regime mapping
BATTERY_REGIME_MAP = {
    'battery00': 'regular_constant', 'battery01': 'regular_constant',
    'battery10': 'regular_constant', 'battery11': 'regular_constant',
    'battery20': 'regular_constant', 'battery21': 'regular_constant',
    'battery30': 'regular_constant', 'battery31': 'regular_constant',
    'battery40': 'regular_constant', 'battery50': 'regular_constant',
    'battery22': 'regular_variable', 'battery23': 'regular_variable',
    'battery41': 'regular_variable', 'battery51': 'regular_variable',
    'battery52': 'regular_variable',
    'battery02': 'recommissioned_two_stage', 'battery12': 'recommissioned_two_stage',
    'battery24': 'recommissioned_two_stage', 'battery32': 'recommissioned_two_stage',
    'battery53': 'recommissioned_two_stage',
    'battery03': 'recommissioned_three_stage', 'battery25': 'recommissioned_three_stage',
    'battery33': 'recommissioned_three_stage',
}
ALL_REGIMES = sorted(list(set(BATTERY_REGIME_MAP.values())))

# --- GridSearchCV Configurations ---
CV_FOLDS_GRIDSEARCH = 3 # Folds for inner CV of GridSearchCV
GRIDSEARCH_SCORING = 'neg_mean_squared_error' # Or 'r2', 'neg_mean_absolute_error'

PARAM_GRIDS = {
    'rf': {
        'n_estimators': [50, 100, 150],
        'max_depth': [10, 20, 25],
        'min_samples_split': [5, 10]
    },
    'gb': {
        'learning_rate': [0.05, 0.1],
        'max_iter': [150, 200, 250],
        'max_depth': [4, 5, 7]
    },
    'xgb': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.05, 0.1],
        'max_depth': [5, 7, 9]
    },
    'lr': {
        'fit_intercept': [True]
    }
}

plt.style.use('seaborn-v0_8-whitegrid')

Experiment output will be saved to: soh_prediction_outputs/SOH_Prediction_FixedSplit_Run2


In [2]:
# If not using train_models.py module, define functions here:
def load_data(filepath):
    """Loads data from the specified CSV file."""
    if not os.path.exists(filepath):
        print(f"Error: File not found at {filepath}")
        return None
    try:
        df = pd.read_csv(filepath)
        print(f"Data loaded successfully from {filepath}. Shape: {df.shape}")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

def preprocess_data(df, target_col_name, current_extra_exclude_cols=None):
    """
    Prepares data for training:
    1. Drops rows with NaN in the target column.
    2. Separates features (X) and target (y).
    3. Selects features, converts types if necessary.
    `current_extra_exclude_cols` is a list of additional columns to remove from features.
    """
    if df is None or df.empty:
        print("Error: Input DataFrame to preprocess_data is None or empty.")
        return None, None, None
        
    df_cleaned = df.dropna(subset=[target_col_name]).copy()
    if df_cleaned.empty:
        print(f"Error: No data remaining after dropping NaNs in target column '{target_col_name}'.")
        return None, None, None

    y = df_cleaned[target_col_name]

    base_excluded_cols = ['start_time', 'cycle_number', 'battery_id', 'regime'] # Add 'regime' and 'battery_id' here
    
    # Combine standard exclusions, the target itself, SOH (if not target), and any extras passed
    excluded_cols_from_features = list(set(
        base_excluded_cols +
        [target_col_name, 'SOH_cycle_capacity_%'] + # SOH_cycle_capacity_% is often the target or closely related
        (current_extra_exclude_cols if current_extra_exclude_cols else [])
    ))
    
    potential_feature_cols_candidates = [col for col in df_cleaned.columns if col not in excluded_cols_from_features]
    
    temp_X_df = df_cleaned[potential_feature_cols_candidates].copy()

    if 'is_reference_cycle' in temp_X_df.columns:
        temp_X_df['is_reference_cycle'] = temp_X_df['is_reference_cycle'].astype(int)

    feature_cols_for_X = []
    for col in temp_X_df.columns:
        if temp_X_df[col].dtype == 'object':
            try: # Attempt to convert object columns to numeric if possible
                temp_X_df[col] = pd.to_numeric(temp_X_df[col])
                feature_cols_for_X.append(col)
                print(f"Converted object column '{col}' to numeric.")
            except ValueError:
                print(f"Warning: Skipping object type column '{col}' as it could not be converted to numeric.")
                continue
        else:
            feature_cols_for_X.append(col)
    
    feature_cols_for_X = sorted(list(set(feature_cols_for_X)))
    X = temp_X_df[feature_cols_for_X].copy()

    print(f"Target variable: {target_col_name}")
    print(f"Number of features selected: {len(feature_cols_for_X)}")
    if X.empty: print("Error: X is empty after feature selection.")
    if y.empty: print("Error: y is empty.")
    print(f"Shape of X (features for model): {X.shape}, Shape of y: {y.shape}")
        
    return X, y, feature_cols_for_X # Return actual feature names

def scale_features(X_train, X_val, X_test, feature_names):
    """
    Scales features using StandardScaler.
    Assumes X_train, X_val, X_test are DataFrames and already imputed.
    Returns scaled DataFrames and the scaler.
    """
    scaler = StandardScaler()
    
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names, index=X_train.index)
    
    X_val_scaled = None
    if X_val is not None and not X_val.empty:
        X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=feature_names, index=X_val.index)
    
    X_test_scaled = None
    if X_test is not None and not X_test.empty:
        X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=feature_names, index=X_test.index)
        
    return X_train_scaled, X_val_scaled, X_test_scaled, scaler

# --- END OF CELL 2 IF DEFINING FUNCTIONS HERE ---

In [3]:
df_master_full = load_data(INPUT_FILE) # or train_models.load_data(INPUT_FILE)

if df_master_full is None:
    raise ValueError(f"Failed to load data from {INPUT_FILE}.")

# Ensure 'cycle_number' is numeric for sorting
df_master_full['cycle_number'] = pd.to_numeric(df_master_full['cycle_number'], errors='coerce')
df_master_full.dropna(subset=['cycle_number'], inplace=True)
df_master_full['cycle_number'] = df_master_full['cycle_number'].astype(int)

# Annotate with 'regime'
df_master_full['regime'] = df_master_full['battery_id'].map(BATTERY_REGIME_MAP)
if df_master_full['regime'].isnull().any():
    unmapped_batteries = df_master_full[df_master_full['regime'].isnull()]['battery_id'].unique()
    print(f"Warning: Unmapped batteries found: {unmapped_batteries}. These will be excluded from splits if they remain unmapped.")
    # Option: df_master_full.dropna(subset=['regime'], inplace=True) # Or handle them based on policy

print("--- Data Loaded and Regimes Annotated ---")
print(f"Master DataFrame shape: {df_master_full.shape}")
print("Regime counts (batteries per regime):")
print(df_master_full.groupby('regime')['battery_id'].nunique())
print(f"Total unique batteries in loaded data: {df_master_full['battery_id'].nunique()}")

Data loaded successfully from New_Features_Added_ALL.csv. Shape: (8220, 200)
--- Data Loaded and Regimes Annotated ---
Master DataFrame shape: (8220, 201)
Regime counts (batteries per regime):
regime
recommissioned_three_stage     3
recommissioned_two_stage       5
regular_constant              10
regular_variable               5
Name: battery_id, dtype: int64
Total unique batteries in loaded data: 23


In [4]:
def get_fixed_stratified_battery_splits(
    df_with_regime,
    n_test_batteries,
    n_val_batteries,
    random_seed,
    regime_col='regime',
    battery_id_col='battery_id'
):
    """
    Splits batteries into fixed-size, stratified train, validation, and test sets.
    Ensures batteries with no regime mapping are excluded from the split process.
    """
    # Filter out batteries with no regime mapping
    df_mappable = df_with_regime.dropna(subset=[regime_col])
    if len(df_mappable) < len(df_with_regime):
        print(f"Dropped {len(df_with_regime) - len(df_mappable)} rows with missing regimes before splitting.")

    all_batteries_info = df_mappable[[battery_id_col, regime_col]].drop_duplicates().reset_index(drop=True)
    unique_batteries = all_batteries_info[battery_id_col].unique()

    if len(unique_batteries) < (n_test_batteries + n_val_batteries):
        raise ValueError(
            f"Not enough unique mappable batteries ({len(unique_batteries)}) to create test ({n_test_batteries}) "
            f"and validation ({n_val_batteries}) sets."
        )

    np.random.seed(random_seed)
    
    test_battery_ids = []
    val_battery_ids = []
    
    # Stratified selection for test set
    # Group by regime and sample, then supplement if needed
    batteries_by_regime = all_batteries_info.groupby(regime_col)[battery_id_col].apply(list)
    
    # Shuffle regimes to avoid bias in picking order
    available_regimes = list(batteries_by_regime.index)
    np.random.shuffle(available_regimes)

    # Pick for TEST set
    for regime in available_regimes:
        if len(test_battery_ids) >= n_test_batteries: break
        candidates = [b for b in batteries_by_regime[regime] if b not in test_battery_ids]
        np.random.shuffle(candidates)
        can_take = n_test_batteries - len(test_battery_ids)
        actual_take = min(len(candidates), can_take)
        test_battery_ids.extend(candidates[:actual_take])
        
    # If not enough from stratified, fill randomly from remaining
    remaining_for_test_fill = [b for b in unique_batteries if b not in test_battery_ids]
    np.random.shuffle(remaining_for_test_fill)
    if len(test_battery_ids) < n_test_batteries:
        needed = n_test_batteries - len(test_battery_ids)
        test_battery_ids.extend(remaining_for_test_fill[:needed])

    # Pick for VALIDATION set (from batteries not in test set)
    available_for_val = [b for b in unique_batteries if b not in test_battery_ids]
    for regime in available_regimes: # Iterate through regimes again for val
        if len(val_battery_ids) >= n_val_batteries: break
        candidates = [b for b in batteries_by_regime.get(regime, []) if b in available_for_val and b not in val_battery_ids]
        np.random.shuffle(candidates)
        can_take = n_val_batteries - len(val_battery_ids)
        actual_take = min(len(candidates), can_take)
        val_battery_ids.extend(candidates[:actual_take])

    # If not enough from stratified for val, fill randomly
    remaining_for_val_fill = [b for b in available_for_val if b not in val_battery_ids]
    np.random.shuffle(remaining_for_val_fill)
    if len(val_battery_ids) < n_val_batteries:
        needed = n_val_batteries - len(val_battery_ids)
        val_battery_ids.extend(remaining_for_val_fill[:needed])

    train_battery_ids = [
        b for b in unique_batteries if b not in test_battery_ids and b not in val_battery_ids
    ]
    np.random.shuffle(train_battery_ids) # Shuffle train IDs too

    # Sanity checks
    assert len(set(train_battery_ids).intersection(set(test_battery_ids))) == 0, "Overlap: Train-Test"
    assert len(set(train_battery_ids).intersection(set(val_battery_ids))) == 0, "Overlap: Train-Val"
    assert len(set(val_battery_ids).intersection(set(test_battery_ids))) == 0, "Overlap: Val-Test"
    assert len(train_battery_ids) + len(val_battery_ids) + len(test_battery_ids) == len(unique_batteries), "Mismatch in total batteries"
    assert len(test_battery_ids) == n_test_batteries, f"Expected {n_test_batteries} test batteries, got {len(test_battery_ids)}"
    assert len(val_battery_ids) == n_val_batteries, f"Expected {n_val_batteries} validation batteries, got {len(val_battery_ids)}"
    if not train_battery_ids : print("CRITICAL WARNING: Training set is empty!")


    print(f"--- Battery Split (Seed: {random_seed}) ---")
    print(f"Training Batteries ({len(train_battery_ids)}): {sorted(train_battery_ids)}")
    print(f"Validation Batteries ({len(val_battery_ids)}): {sorted(val_battery_ids)}")
    print(f"Test Batteries ({len(test_battery_ids)}): {sorted(test_battery_ids)}")
    
    # Optional: print regime distribution for each set
    for name, ids in zip(["Train", "Validation", "Test"], [train_battery_ids, val_battery_ids, test_battery_ids]):
        if ids: # Check if the list is not empty
            regimes = all_batteries_info[all_batteries_info[battery_id_col].isin(ids)][regime_col].value_counts().to_dict()
            print(f"  {name} Regimes: {regimes}")
        else:
            print(f"  {name} Regimes: No batteries in this set.")


    return train_battery_ids, val_battery_ids, test_battery_ids

# Perform the split
train_ids, val_ids, test_ids = get_fixed_stratified_battery_splits(
    df_master_full,
    n_test_batteries=N_TEST_BATTERIES,
    n_val_batteries=N_VAL_BATTERIES,
    random_seed=SPLIT_RANDOM_SEED
)

--- Battery Split (Seed: 123) ---
Training Batteries (16): ['battery00', 'battery01', 'battery02', 'battery03', 'battery10', 'battery11', 'battery12', 'battery20', 'battery21', 'battery24', 'battery30', 'battery31', 'battery32', 'battery40', 'battery50', 'battery53']
Validation Batteries (2): ['battery25', 'battery33']
Test Batteries (5): ['battery22', 'battery23', 'battery41', 'battery51', 'battery52']
  Train Regimes: {'regular_constant': 10, 'recommissioned_two_stage': 5, 'recommissioned_three_stage': 1}
  Validation Regimes: {'recommissioned_three_stage': 2}
  Test Regimes: {'regular_variable': 5}


In [5]:
def plot_actual_vs_predicted_soh(
    df_test_eval, # DataFrame with 'battery_id', 'cycle_number', actual_soh_col, predicted_soh_col
    battery_id_to_plot,
    model_name_str,
    output_dir,
    actual_soh_col='y_actual', # Ensure these column names match what's in df_test_eval
    predicted_soh_col='y_pred'
):
    """Plots actual vs. predicted SOH for a specific battery and saves it."""
    battery_data = df_test_eval[df_test_eval['battery_id'] == battery_id_to_plot].sort_values('cycle_number')
    
    if battery_data.empty:
        print(f"Warning: No data found for battery {battery_id_to_plot} in the evaluation results. Skipping plot.")
        return
    if len(battery_data) < 2: # Handle edge case from previous discussion
         print(f"Warning: Battery {battery_id_to_plot} has fewer than 2 data points. Plot might be uninformative.")


    plt.figure(figsize=(12, 6))
    plt.plot(battery_data['cycle_number'], battery_data[actual_soh_col], label='Actual SOH', marker='o', linestyle='-')
    plt.plot(battery_data['cycle_number'], battery_data[predicted_soh_col], label=f'Predicted SOH ({model_name_str})', marker='x', linestyle='--')
    
    plt.title(f'Actual vs. Predicted SOH for Battery: {battery_id_to_plot} (Model: {model_name_str.upper()})')
    plt.xlabel('Cycle Number')
    plt.ylabel('SOH (%)')
    plt.legend()
    plt.grid(True)
    
    plot_filename = os.path.join(output_dir, f"soh_comparison_{battery_id_to_plot}_{model_name_str}.png")
    try:
        plt.savefig(plot_filename)
        print(f"  Saved SOH comparison plot to {plot_filename}")
    except Exception as e:
        print(f"  Error saving SOH comparison plot for {battery_id_to_plot}: {e}")
    plt.close() # Close the plot to free memory

def plot_feature_importances(
    importances_df, # DataFrame with 'feature' and 'importance' columns
    model_name_str,
    output_dir,
    top_n=15
):
    """Plots feature importances as a horizontal bar chart and saves it."""
    if importances_df is None or importances_df.empty:
        print(f"No feature importances to plot for {model_name_str}.")
        return

    importances_df = importances_df.sort_values(by='importance', ascending=False).head(top_n)
    
    plt.figure(figsize=(10, max(6, len(importances_df) * 0.4))) # Adjust height based on N features
    sns.barplot(x='importance', y='feature', data=importances_df, hue='feature', palette='viridis', legend=False, dodge=False)
    plt.title(f'Top {top_n} Feature Importances: {model_name_str.upper()}')
    plt.xlabel('Importance Score')
    plt.ylabel('Feature')
    plt.tight_layout() # Adjust layout to prevent labels from overlapping
    
    plot_filename = os.path.join(output_dir, f"feature_importances_{model_name_str}.png")
    try:
        plt.savefig(plot_filename)
        print(f"  Saved feature importance plot to {plot_filename}")
    except Exception as e:
        print(f"  Error saving feature importance plot for {model_name_str}: {e}")
    plt.close()

In [6]:
def evaluate_model_on_set(
    model,
    model_name_str, # e.g., "rf_tuned"
    X_data,
    y_data,
    feature_names_list, # For feature importance calculation
    set_name, # "Validation" or "Test"
    output_dir_for_plots, # For saving plots
    battery_ids_for_this_set=None, # Series/list of battery_ids corresponding to X_data rows
    plot_soh_curves=False # Flag to control SOH curve plotting (usually True for Test set)
):
    """
    Evaluates a fitted model on a given dataset, optionally plots SOH curves and feature importances.
    """
    results_metrics = {}
    feature_importance_df_model = None # Initialize
    print(f"\nEvaluating {model_name_str.upper()} on {set_name} Set...")

    rmse, mae, r2 = np.nan, np.nan, np.nan
    y_pred = None # Initialize y_pred

    if X_data is not None and not X_data.empty and y_data is not None and not y_data.empty:
        if len(X_data) != len(y_data):
            print(f"  Error: X_data length ({len(X_data)}) and y_data length ({len(y_data)}) mismatch for {set_name}. Skipping.")
        else:
            try:
                y_pred = model.predict(X_data)
                rmse = np.sqrt(mean_squared_error(y_data, y_pred))
                mae = mean_absolute_error(y_data, y_pred)
                r2 = r2_score(y_data, y_pred)
                print(f"  {set_name} Metrics - RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")
            except Exception as e:
                print(f"  Error during {set_name} evaluation for {model_name_str}: {e}")
    else:
        print(f"  {set_name} data is empty or y_data is empty. Skipping {set_name} metrics.")

    results_metrics = {'rmse': rmse, 'mae': mae, 'r2': r2}

    # Feature Importances (calculated once, typically using training data or general model property)
    # This function now focuses on PLOTTING them if provided, or extracting them if simple.
    # The main feature importance calculation can be done after GridSearchCV.
    # For now, let's assume we primarily get it from model attributes.
    if feature_names_list:
        importances_values = None
        model_type_for_fi = model_name_str.split('_')[0] # e.g., 'rf' from 'rf_tuned'

        if hasattr(model, 'feature_importances_'):
            importances_values = model.feature_importances_
        elif model_type_for_fi == 'lr' and hasattr(model, 'coef_'):
            importances_values = np.abs(model.coef_)
            if len(importances_values.shape) > 1: # If multi-target (not expected here)
                importances_values = np.mean(importances_values, axis=0)
        
        if importances_values is not None:
            if len(feature_names_list) == len(importances_values):
                feature_importance_df_model = pd.DataFrame({'feature': feature_names_list, 'importance': importances_values})
                feature_importance_df_model = feature_importance_df_model.sort_values(by='importance', ascending=False) # Keep full, plot top N later
                # Plotting of FI will be called separately if desired, after getting this df
            else:
                print(f"  Warning: Mismatch in feature names ({len(feature_names_list)}) and importances ({len(importances_values)}) for {model_name_str}.")
        elif set_name.lower() == "test": # Only warn if we expect FI and don't have it for test eval display
             print(f"  Warning: Model {model_name_str} does not have standard 'feature_importances_' or 'coef_'.")


    # Plot SOH curves for each battery in this set (if applicable, e.g., for test set)
    if plot_soh_curves and y_pred is not None and battery_ids_for_this_set is not None and not X_data.empty :
        print(f"  Generating SOH comparison plots for {set_name} set...")
        # We need 'cycle_number' for plotting. It's not in X_data.
        # We need to retrieve it from the original df slice for this set.
        # This implies `evaluate_model_on_set` needs access to the original df_subset or its relevant columns.
        # For simplicity, let's assume the calling function prepares a df_eval_results.

        # Create a temporary DataFrame for plotting
        # Ensure y_data (actual) and y_pred (predicted) are Series with the same index as X_data.
        # The battery_ids_for_this_set should also align with this index.
        # Cycle numbers need to be fetched by joining back to the original data using index or battery_id and original cycle numbers.
        
        # THIS PART REQUIRES CAREFUL INDEX ALIGNMENT.
        # Assuming y_data is a Series and X_data is a DataFrame, both from the same split.
        # Let's make df_eval_results in the main experiment runner.
        df_eval_temp = pd.DataFrame({
            'battery_id': battery_ids_for_this_set.values, # Ensure this is aligned with X_data.index
            'cycle_number': df_master_full.loc[X_data.index, 'cycle_number'].values, # Fetch cycle_number using original index
            'y_actual': y_data.values,
            'y_pred': y_pred
        })

        for batt_id in df_eval_temp['battery_id'].unique():
            plot_actual_vs_predicted_soh(
                df_test_eval=df_eval_temp,
                battery_id_to_plot=batt_id,
                model_name_str=model_name_str,
                output_dir=output_dir_for_plots,
                actual_soh_col='y_actual',
                predicted_soh_col='y_pred'
            )
            
    return results_metrics, feature_importance_df_model # Return metrics and the full FI dataframe

In [7]:
def run_main_experiment(
    experiment_tag, # e.g., EXPERIMENT_NAME
    df_full,
    train_battery_ids,
    val_battery_ids,
    test_battery_ids,
    target_variable_name,
    extra_feature_exclusions,
    models_to_run,
    model_param_grids,
    gridsearch_cv_folds,
    gridsearch_metric_name,
    base_artifact_dir, # Where to save everything for this experiment
    save_all_artifacts=True
):
    """
    Runs the main SOH prediction experiment:
    1. Filters data for train, validation, test sets based on battery IDs.
    2. Preprocesses data (impute & scale).
    3. For each model, performs GridSearchCV on training data.
    4. Evaluates the best model from GridSearchCV on validation and test sets.
    5. Saves models, scaler, imputer, plots, and results.
    """
    print(f"\n{'='*20} Starting Experiment: {experiment_tag} {'='*20}")
    current_exp_output_dir = os.path.join(base_artifact_dir) # Already includes experiment_name
    # os.makedirs(current_exp_output_dir, exist_ok=True) # Already created in Cell 1

    # --- 1. Filter Data for Train/Validation/Test using provided IDs ---
    df_train = df_full[df_full['battery_id'].isin(train_battery_ids)].copy()
    df_val = df_full[df_full['battery_id'].isin(val_battery_ids)].copy()
    df_test = df_full[df_full['battery_id'].isin(test_battery_ids)].copy()

    print(f"Data split: Train batteries={len(train_battery_ids)}, Val batteries={len(val_battery_ids)}, Test batteries={len(test_battery_ids)}")
    print(f"Train data shape: {df_train.shape}, Val data shape: {df_val.shape}, Test data shape: {df_test.shape}")

    if df_train.empty:
        print("ERROR: Training data is empty. Aborting experiment.")
        return None
    if df_test.empty:
        print("ERROR: Test data is empty. Aborting experiment.")
        return None
    # Validation set can be empty if N_VAL_BATTERIES was 0 (though we set it > 0)

    # --- 2. Preprocess Data (Extract X, y for each set) ---
    # Note: preprocess_data drops 'battery_id' and 'regime' from X
    X_train_raw, y_train, actual_feature_names = preprocess_data(df_train, target_variable_name, extra_feature_exclusions)
    X_val_raw, y_val, _ = preprocess_data(df_val, target_variable_name, extra_feature_exclusions)
    X_test_raw, y_test, _ = preprocess_data(df_test, target_variable_name, extra_feature_exclusions)

    if X_train_raw is None or X_train_raw.empty:
        print("Error: X_train is empty after preprocessing. Aborting.")
        return None
    # Store original indices for later use if needed (e.g. detailed error analysis, fetching other columns)
    train_indices = X_train_raw.index
    val_indices = X_val_raw.index if X_val_raw is not None else None
    test_indices = X_test_raw.index if X_test_raw is not None else None


    # --- 3. Impute Data (Fit on Train, Transform Val/Test) ---
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_raw), columns=actual_feature_names, index=train_indices)
    
    X_val_imputed = None
    if X_val_raw is not None and not X_val_raw.empty:
        X_val_imputed = pd.DataFrame(imputer.transform(X_val_raw), columns=actual_feature_names, index=val_indices)
    
    X_test_imputed = None
    if X_test_raw is not None and not X_test_raw.empty:
        X_test_imputed = pd.DataFrame(imputer.transform(X_test_raw), columns=actual_feature_names, index=test_indices)

    # --- 4. Scale Data (Fit on Train, Transform Val/Test) ---
    # scale_features now returns X_train_scaled, X_val_scaled, X_test_scaled, scaler
    X_train_scaled, X_val_scaled, X_test_scaled, scaler = scale_features(
        X_train_imputed, X_val_imputed, X_test_imputed, actual_feature_names
    )

    experiment_summary_results = {} # To store metrics for all models
    all_trained_models = {} # To store the best_estimator from GridSearchCV
    all_feature_importances = {} # To store FI dataframes

    # --- 5. GridSearchCV for each model ---
    for model_name_code in models_to_run: # e.g., 'rf', 'gb'
        print(f"\n--- Tuning {model_name_code.upper()} ---")
        
        # Determine data to use (scaled or unscaled)
        current_X_train_for_tuning = X_train_imputed
        current_X_val_for_eval = X_val_imputed # For evaluation after tuning
        current_X_test_for_eval = X_test_imputed
        
        if model_name_code in ['lr', 'gb']: # 'gb' is HistGradientBoostingRegressor
            current_X_train_for_tuning = X_train_scaled
            current_X_val_for_eval = X_val_scaled
            current_X_test_for_eval = X_test_scaled
            print(f"  Using SCALED data for {model_name_code.upper()}")
        else:
            print(f"  Using IMPUTED (unscaled) data for {model_name_code.upper()}")

        # Initialize base model
        base_model = None
        if model_name_code == 'rf': base_model = RandomForestRegressor(random_state=SPLIT_RANDOM_SEED, n_jobs=-1)
        elif model_name_code == 'gb': base_model = HistGradientBoostingRegressor(random_state=SPLIT_RANDOM_SEED)
        elif model_name_code == 'xgb': base_model = xgb.XGBRegressor(random_state=SPLIT_RANDOM_SEED, n_jobs=-1)
        elif model_name_code == 'lr': base_model = LinearRegression()
        else:
            print(f"Warning: Unknown model '{model_name_code}'. Skipping.")
            continue
        
        # GridSearchCV
        grid_search = GridSearchCV(
            estimator=base_model,
            param_grid=model_param_grids[model_name_code],
            scoring=gridsearch_metric_name,
            cv=gridsearch_cv_folds,
            verbose=1,
            n_jobs=-1
        )
        print(f"  Starting GridSearchCV for {model_name_code.upper()}...")
        grid_search.fit(current_X_train_for_tuning, y_train) # y_train corresponds to X_train_raw/imputed/scaled
        
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        print(f"  Best parameters for {model_name_code.upper()}: {best_params}")
        
        all_trained_models[model_name_code] = best_model
        
        # --- 6. Evaluate the BEST tuned model ---
        model_eval_results = {'best_params': best_params}
        
        # Evaluate on Validation Set (if val set exists)
        val_metrics, fi_df_val = (None, None)
        if current_X_val_for_eval is not None and not current_X_val_for_eval.empty and y_val is not None and not y_val.empty:
            val_metrics, fi_df_val = evaluate_model_on_set(
                best_model, f"{model_name_code}_tuned", current_X_val_for_eval, y_val, actual_feature_names,
                "Validation", current_exp_output_dir,
                battery_ids_for_this_set=df_val['battery_id'], # Pass battery IDs for val set
                plot_soh_curves=False # Typically no SOH curves for validation set
            )
            model_eval_results['val_metrics'] = val_metrics
        else:
            model_eval_results['val_metrics'] = {'rmse': np.nan, 'mae': np.nan, 'r2': np.nan}
            print("  Validation set is empty or not available, skipping validation evaluation.")

        # Evaluate on Test Set
        test_metrics, fi_df_test = (None, None)
        if current_X_test_for_eval is not None and not current_X_test_for_eval.empty and y_test is not None and not y_test.empty:
            # For test set evaluation, we need battery IDs and cycle numbers associated with X_test_for_eval
            # These are on df_test which has the same original indices as X_test_raw (and thus X_test_for_eval)
            test_battery_ids_series = df_test.loc[X_test_raw.index, 'battery_id']

            test_metrics, fi_df_test = evaluate_model_on_set(
                best_model, f"{model_name_code}_tuned", current_X_test_for_eval, y_test, actual_feature_names,
                "Test", current_exp_output_dir,
                battery_ids_for_this_set=test_battery_ids_series, # Pass battery IDs for test set
                plot_soh_curves=True # Plot SOH curves for the test set
            )
            model_eval_results['test_metrics'] = test_metrics
        else:
            model_eval_results['test_metrics'] = {'rmse': np.nan, 'mae': np.nan, 'r2': np.nan}
            print("  Test set is empty or not available, skipping test evaluation.")

        # Store feature importances (use the one from test evaluation or val if test is empty)
        # The fi_df is calculated from the model attributes, so it's the same regardless of val/test data used for eval.
        final_fi_df = fi_df_test if fi_df_test is not None else fi_df_val
        if final_fi_df is not None and not final_fi_df.empty:
            all_feature_importances[model_name_code] = final_fi_df
            # Plot feature importances
            plot_feature_importances(final_fi_df, f"{model_name_code}_tuned", current_exp_output_dir)
        
        experiment_summary_results[model_name_code] = model_eval_results

    # --- 7. Save Artifacts ---
    if save_all_artifacts:
        joblib.dump(scaler, os.path.join(current_exp_output_dir, f"scaler_{experiment_tag}.joblib"))
        joblib.dump(imputer, os.path.join(current_exp_output_dir, f"imputer_{experiment_tag}.joblib"))
        joblib.dump(actual_feature_names, os.path.join(current_exp_output_dir, f"feature_names_{experiment_tag}.joblib"))
        print(f"\nScaler, Imputer, and Feature Names saved for {experiment_tag} in {current_exp_output_dir}")

        for model_name_code, model_instance in all_trained_models.items():
            model_filename = os.path.join(current_exp_output_dir, f"model_{model_name_code}_tuned_{experiment_tag}.joblib")
            joblib.dump(model_instance, model_filename)
        print(f"Tuned models saved for {experiment_tag} in {current_exp_output_dir}")
        
        # Save feature importances dataframes
        if all_feature_importances:
            fi_path = os.path.join(current_exp_output_dir, f"all_feature_importances_{experiment_tag}.joblib")
            joblib.dump(all_feature_importances, fi_path)
            print(f"Feature importances data saved to {fi_path}")
            
    return experiment_summary_results, all_trained_models

In [8]:
if not train_ids or not test_ids: # Basic check
    print("CRITICAL: Training or Test battery IDs are empty. Cannot run experiment.")
    # Add more robust checks if N_VAL_BATTERIES > 0 and val_ids is empty
elif N_VAL_BATTERIES > 0 and not val_ids:
     print("CRITICAL: Validation batteries are configured (N_VAL_BATTERIES > 0) but validation ID list is empty. Cannot run experiment.")
else:
    final_experiment_results, final_trained_models = run_main_experiment(
        experiment_tag=EXPERIMENT_NAME,
        df_full=df_master_full,
        train_battery_ids=train_ids,
        val_battery_ids=val_ids,
        test_battery_ids=test_ids,
        target_variable_name=TARGET_COL,
        extra_feature_exclusions=EXTRA_EXCLUDE_COLS_FROM_FEATURES,
        models_to_run=MODELS_TO_TRAIN,
        model_param_grids=PARAM_GRIDS,
        gridsearch_cv_folds=CV_FOLDS_GRIDSEARCH,
        gridsearch_metric_name=GRIDSEARCH_SCORING,
        base_artifact_dir=EXPERIMENT_OUTPUT_DIR, # Pass the specific output dir for this experiment
        save_all_artifacts=SAVE_ARTIFACTS
    )
    print("\n--- Main Experiment Run Completed ---")


Data split: Train batteries=16, Val batteries=2, Test batteries=5
Train data shape: (5669, 201), Val data shape: (895, 201), Test data shape: (1656, 201)
Target variable: SOH_cycle_capacity_%
Number of features selected: 168
Shape of X (features for model): (5669, 168), Shape of y: (5669,)
Target variable: SOH_cycle_capacity_%
Number of features selected: 168
Shape of X (features for model): (895, 168), Shape of y: (895,)
Target variable: SOH_cycle_capacity_%
Number of features selected: 168
Shape of X (features for model): (1656, 168), Shape of y: (1656,)

--- Tuning RF ---
  Using IMPUTED (unscaled) data for RF
  Starting GridSearchCV for RF...
Fitting 3 folds for each of 18 candidates, totalling 54 fits
  Best parameters for RF: {'max_depth': 25, 'min_samples_split': 5, 'n_estimators': 100}

Evaluating RF_TUNED on Validation Set...
  Validation Metrics - RMSE: 1.0443, MAE: 0.8028, R2: 0.9741

Evaluating RF_TUNED on Test Set...
  Test Metrics - RMSE: 2.7080, MAE: 1.7331, R2: 0.8404


In [10]:
print("\n--- Consolidated Experiment Results Summary ---")

results_list_for_df = []
if 'final_experiment_results' in locals() and final_experiment_results:
    for model_name, model_summary in final_experiment_results.items():
        print(f"\nModel: {model_name.upper()}")
        print(f"  Best Params: {model_summary.get('best_params', 'N/A')}")

        val_metrics = model_summary.get('val_metrics', {})
        test_metrics = model_summary.get('test_metrics', {})

        val_rmse = val_metrics.get('rmse', float('nan'))
        val_mae = val_metrics.get('mae', float('nan'))
        val_r2 = val_metrics.get('r2', float('nan'))
        
        test_rmse = test_metrics.get('rmse', float('nan'))
        test_mae = test_metrics.get('mae', float('nan'))
        test_r2 = test_metrics.get('r2', float('nan'))

        print(f"  Validation Set: RMSE={val_rmse:.4f}, MAE={val_mae:.4f}, R2={val_r2:.4f}")
        print(f"  Test Set:       RMSE={test_rmse:.4f}, MAE={test_mae:.4f}, R2={test_r2:.4f}")
        
        entry = {
            'Experiment': EXPERIMENT_NAME, # Using the global experiment name
            'Model': model_name.upper(),
            'Best Params': str(model_summary.get('best_params', 'N/A')),
            'Val RMSE': val_rmse,
            'Val MAE': val_mae,
            'Val R2': val_r2,
            'Test RMSE': test_rmse,
            'Test MAE': test_mae,
            'Test R2': test_r2,
            'Status': 'Completed'
        }
        results_list_for_df.append(entry)

    if results_list_for_df:
        df_results_summary = pd.DataFrame(results_list_for_df)
        
        ordered_cols = [
            'Experiment', 'Model', 'Status', 'Best Params',
            'Val RMSE', 'Val MAE', 'Val R2',
            'Test RMSE', 'Test MAE', 'Test R2'
        ]
        final_cols_to_use = [col for col in ordered_cols if col in df_results_summary.columns]
        df_results_summary = df_results_summary[final_cols_to_use]

        print("\n\n--- Experiment Results DataFrame ---")
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', 1000) # Adjust for your display
        print(df_results_summary)

        # Save the results dataframe
        csv_path = os.path.join(EXPERIMENT_OUTPUT_DIR, f"experiment_summary_{EXPERIMENT_NAME}.csv")
        try:
            df_results_summary.to_csv(csv_path, index=False)
            print(f"\nResults summary saved to {csv_path}")
        except Exception as e:
            print(f"\nError saving results summary CSV: {e}")
    else:
        print("\nNo results to create a summary DataFrame.")
else:
    print("\nNo experiment results found to display. Please check the execution of Cell 8.")

print(f"\nAll plots and artifacts saved in: {EXPERIMENT_OUTPUT_DIR}")


--- Consolidated Experiment Results Summary ---

Model: RF
  Best Params: {'max_depth': 25, 'min_samples_split': 5, 'n_estimators': 100}
  Validation Set: RMSE=1.0443, MAE=0.8028, R2=0.9741
  Test Set:       RMSE=2.7080, MAE=1.7331, R2=0.8404

Model: GB
  Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'max_iter': 200}
  Validation Set: RMSE=0.9063, MAE=0.6927, R2=0.9805
  Test Set:       RMSE=2.1643, MAE=1.5050, R2=0.8980

Model: XGB
  Best Params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 150}
  Validation Set: RMSE=0.8179, MAE=0.6162, R2=0.9841
  Test Set:       RMSE=2.4095, MAE=1.6271, R2=0.8736

Model: LR
  Best Params: {'fit_intercept': True}
  Validation Set: RMSE=0.7766, MAE=0.5918, R2=0.9857
  Test Set:       RMSE=3.3772, MAE=2.0598, R2=0.7517


--- Experiment Results DataFrame ---
                       Experiment Model     Status                                        Best Params  Val RMSE   Val MAE    Val R2  Test RMSE  Test MAE   Test R2
0  SOH_Predictio