In [12]:
import pandas as pd
import numpy as np
import os

In [13]:
# --- Configuration Constants ---
# Files are looked up as <DATASET_ROOT_DIRECTORY>/<folder>/<battery_id>.csv
# IMPORTANT: SET THIS TO YOUR DATASET PATH
DATASET_ROOT_DIRECTORY = "battery_alt_dataset"
REGULAR_BATTERIES_FOLDER = "regular_alt_batteries"
RECOMMISSIONED_BATTERIES_FOLDER = "recommissioned_batteries"
FOLDERS_TO_PROCESS = [
    REGULAR_BATTERIES_FOLDER,
    RECOMMISSIONED_BATTERIES_FOLDER,
]

# Raw CSV header -> standardized column name.
# Insertion order matters: EXPECTED_FINAL_COLUMNS is derived from it.
COLUMN_MAPPING = {
    "start_time": "start_time",
    # Relative time in seconds
    "time": "relative_time_s",
    # Operation mode (-1 for discharge)
    "mode": "mode",
    "voltage_charger": "voltage_charger",
    # Temperature in Celsius
    "temperature_battery": "temp_battery_C",
    # Voltage in Volts
    "voltage_load": "voltage_load_V",
    # Current in Amperes
    "current_load": "current_load_A",
    "temperature_mosfet": "temp_mosfet_C",
    "temperature_resistor": "temp_resistor_C",
    # 0 for reference, 1 for regular discharge
    "mission_type": "mission_type",
}

# Column names expected once COLUMN_MAPPING has been applied
EXPECTED_FINAL_COLUMNS = [*COLUMN_MAPPING.values()]

# SOH/RUL parameters
NOMINAL_CAPACITY_AH = 2.5
SOH_EOL_THRESHOLD_PERCENT = 80.0

# Cycle filtering parameters
MIN_ACCEPTABLE_STABLE_VOLTAGE_V = 4.0
MINIMUM_VALID_CYCLE_DURATION_S = 10
MINIMUM_DATA_POINTS_PER_CYCLE = 5

# Internal resistance calculation parameters
# Length of the window, in seconds from the start of discharge, used for the IR estimate
IR_CALCULATION_DURATION_S = 3.0
# Minimum average current (A) over that window for an IR estimate to be attempted
MIN_CURRENT_FOR_IR_CALC_A = 0.5

In [14]:
def load_battery_data(file_path):
    """
    Read one battery CSV and return it with coerced dtypes and standardized column names.

    Measurement columns are coerced to numeric (unparseable entries become NaN).
    'mode' and 'mission_type' are converted to integers, with 99 and 0 respectively
    standing in for unparseable entries. Headers are then renamed through
    COLUMN_MAPPING. Essential columns that are still absent are reported and added
    with a placeholder so later stages can rely on their presence.

    Args:
        file_path (str): Path to the CSV file

    Returns:
        pd.DataFrame: Processed DataFrame or None if loading fails
    """
    measurement_fields = (
        'time', 'voltage_charger', 'temperature_battery',
        'voltage_load', 'current_load', 'temperature_mosfet',
        'temperature_resistor',
    )
    # (raw column, value used when an entry cannot be parsed)
    integer_fields = (
        ('mode', 99),          # 99 for unknown mode
        ('mission_type', 0),   # Default to reference
    )
    essential_fields = ['relative_time_s', 'mode', 'voltage_load_V',
                        'current_load_A', 'temp_battery_C', 'mission_type']
    # Placeholder for an essential column that is missing entirely; anything else gets NaN
    placeholder_for = {'mission_type': 0, 'mode': 99}

    try:
        frame = pd.read_csv(file_path, low_memory=False)

        for name in measurement_fields:
            if name in frame.columns:
                frame[name] = pd.to_numeric(frame[name], errors='coerce')

        for name, fill_value in integer_fields:
            if name in frame.columns:
                as_number = pd.to_numeric(frame[name], errors='coerce')
                frame[name] = as_number.fillna(fill_value).astype(int)

        # Standardize column names (raw names not present in the file are ignored)
        frame = frame.rename(columns=COLUMN_MAPPING, errors='ignore')

        absent = [name for name in essential_fields if name not in frame.columns]
        if absent:
            print(f"Warning: Missing essential columns {absent} in {file_path}")
            # Keep the file, but make the essential columns exist
            for name in absent:
                frame[name] = placeholder_for.get(name, np.nan)
    except FileNotFoundError:
        print(f"Error: File not found {file_path}")
        return None
    except Exception as e:
        # Best-effort loader: any other failure is reported and the file is skipped
        print(f"Error loading {file_path}: {e}")
        return None

    return frame

In [18]:
def _is_reference_cycle(cycle_df):
    """Return True when the first row of a cycle has mission_type 0, else False."""
    if 'mission_type' not in cycle_df.columns or cycle_df.empty:
        return False
    first_mission_type = pd.to_numeric(cycle_df['mission_type'], errors='coerce').iloc[0]
    return bool(pd.notna(first_mission_type) and first_mission_type == 0)


def segment_discharge_cycles(df, battery_id):
    """
    Segments battery data into individual discharge cycles.

    A cycle is a maximal run of consecutive rows whose 'mode' equals -1. A run that
    is still open at the end of the data is emitted as the last cycle. Runs are
    located with vectorised operations instead of a per-row loop. Each cycle frame
    is therefore a slice of the input that keeps the original index labels and
    column dtypes (a row-wise loop would collapse every column to one common dtype
    and is far slower on large files).

    The input frame is not modified. 'mode' is normalised on a derived series
    (unparseable values become 99) and the normalised values are written only into
    the per-cycle copies.

    Args:
        df (pd.DataFrame): Raw battery data
        battery_id (str): Identifier for the battery

    Returns:
        list: List of dictionaries with keys "battery_id", "cycle_number"
            (0-based, in order of appearance), "cycle_df" and "is_reference"
    """
    cycles = []
    if df is None or df.empty:
        return cycles

    if 'mode' not in df.columns:
        print(f"Warning: 'mode' column missing in data for {battery_id}. Cannot segment cycles.")
        return cycles

    # Normalise on a separate series so the caller's frame stays untouched
    mode = pd.to_numeric(df['mode'], errors='coerce').fillna(99).astype(int)
    mode_values = mode.to_numpy()
    discharging = (mode_values == -1).astype(np.int8)

    # Pad with zeros so runs touching either end of the data still produce an edge:
    # +1 marks the first row of a run, -1 marks the position one past its last row.
    edges = np.diff(np.concatenate(([0], discharging, [0])))
    run_starts = np.flatnonzero(edges == 1)
    run_stops = np.flatnonzero(edges == -1)

    for cycle_number, (start, stop) in enumerate(zip(run_starts, run_stops)):
        cycle_df = df.iloc[start:stop].copy()
        cycle_df['mode'] = mode_values[start:stop]
        cycles.append({
            "battery_id": battery_id,
            "cycle_number": cycle_number,
            "cycle_df": cycle_df,
            "is_reference": _is_reference_cycle(cycle_df)
        })
    return cycles

In [19]:
def filter_invalid_cycles(segmented_cycles_list, battery_id):
    """
    Drop unusable discharge cycles and trim the low-voltage rows at the start of the rest.

    A cycle is excluded when any of these holds:
      * a critical column (time, load voltage, load current) is missing;
      * fewer than MINIMUM_DATA_POINTS_PER_CYCLE rows have all critical values;
      * the load voltage never reaches MIN_ACCEPTABLE_STABLE_VOLTAGE_V;
      * after trimming, it has too few rows, fewer than two usable time values,
        or lasts less than MINIMUM_VALID_CYCLE_DURATION_S.

    Surviving cycles start at the first row with an acceptable voltage and are
    renumbered consecutively from 0.

    Args:
        segmented_cycles_list (list): List of cycle dictionaries
        battery_id (str): Battery identifier (not used by the filtering itself)

    Returns:
        list: Filtered list of valid cycles, with their DataFrames potentially trimmed.
    """
    needed = ['relative_time_s', 'voltage_load_V', 'current_load_A']
    accepted = []

    for source_info in segmented_cycles_list:
        raw_cycle = source_info["cycle_df"].copy()

        if not all(name in raw_cycle.columns for name in needed):
            continue

        # Numeric-only working copy; the raw copy keeps every original column and row
        cleaned = raw_cycle.copy()
        for name in needed:
            cleaned[name] = pd.to_numeric(cleaned[name], errors='coerce')
        cleaned = cleaned.dropna(subset=needed)

        if cleaned.empty or len(cleaned) < MINIMUM_DATA_POINTS_PER_CYCLE:
            continue

        # Index label of the first row whose voltage is high enough
        voltage_ok = (cleaned['voltage_load_V'] >= MIN_ACCEPTABLE_STABLE_VOLTAGE_V).to_numpy()
        if not voltage_ok.any():
            continue
        start_label = cleaned.index[voltage_ok.argmax()]

        # Trim the raw frame (not the cleaned one) from that label onwards
        trimmed = raw_cycle.loc[start_label:].copy()
        if trimmed.empty or len(trimmed) < MINIMUM_DATA_POINTS_PER_CYCLE:
            continue

        # Need at least two time values to measure a duration
        times = pd.to_numeric(trimmed['relative_time_s'], errors='coerce').dropna()
        if len(times) < 2:
            continue
        if times.iloc[-1] - times.iloc[0] < MINIMUM_VALID_CYCLE_DURATION_S:
            continue

        accepted.append({
            **source_info,
            "cycle_df": trimmed,
            "cycle_number": len(accepted),
        })

    return accepted

In [20]:
def extract_cycle_features(cycle_info, ir_duration_s=None, min_ir_current_a=None):
    """
    Extracts features from a single discharge cycle.

    Rows lacking a numeric time, load voltage or load current are ignored. With
    fewer than two usable rows only the identifying keys and a NaN
    'internal_resistance_ohm' are returned. Otherwise the result also holds
    duration, current/capacity, voltage, temperature, power/energy features and
    an internal resistance estimate.

    Capacity and energy are sums of (value * time step), where each step is the
    difference to the previous sample and the first step is 0.

    The temperature keys are always present for a usable cycle and are NaN when
    the cycle has no temperature reading at all.

    Internal resistance is (first voltage - voltage at the last sample within
    `ir_duration_s` of the start) / (mean current up to that sample). It stays NaN
    when fewer than two samples fall in the window, when the absolute mean current
    is below `min_ir_current_a`, or when the voltage rose over the window.

    Args:
        cycle_info (dict): Dictionary containing cycle information
            ("battery_id", "cycle_number", "is_reference", "cycle_df")
        ir_duration_s (float, optional): Window length in seconds for the IR
            estimate. Defaults to IR_CALCULATION_DURATION_S, read at call time.
        min_ir_current_a (float, optional): Minimum absolute mean current for the
            IR estimate. Defaults to MIN_CURRENT_FOR_IR_CALC_A, read at call time.

    Returns:
        dict: Dictionary of extracted features
    """
    # Resolved per call so edits to the configuration cell apply without redefining this function
    if ir_duration_s is None:
        ir_duration_s = IR_CALCULATION_DURATION_S
    if min_ir_current_a is None:
        min_ir_current_a = MIN_CURRENT_FOR_IR_CALC_A

    features = {
        "battery_id": cycle_info["battery_id"],
        "cycle_number": cycle_info["cycle_number"],
        "is_reference_cycle": cycle_info["is_reference"],
        "internal_resistance_ohm": np.nan
    }

    cycle_df = cycle_info["cycle_df"].copy()
    for col in ('relative_time_s', 'voltage_load_V', 'current_load_A', 'temp_battery_C'):
        if col in cycle_df.columns:
            cycle_df[col] = pd.to_numeric(cycle_df[col], errors='coerce')
        else:
            cycle_df[col] = np.nan  # Missing column: dependent features become NaN

    # Drop rows where essential features for calculation are NaN
    cycle_df = cycle_df.dropna(subset=['relative_time_s', 'voltage_load_V', 'current_load_A'])
    if len(cycle_df) < 2:  # Need at least 2 points for diff()
        return features

    time_s = cycle_df['relative_time_s'].reset_index(drop=True)
    current_A = cycle_df['current_load_A'].reset_index(drop=True)
    voltage_V = cycle_df['voltage_load_V'].reset_index(drop=True)
    dt_s = time_s.diff().fillna(0)  # First sample has no predecessor, so its step is 0

    # Time features
    features['discharge_duration_s'] = time_s.iloc[-1] - time_s.iloc[0]

    # Current and capacity features
    features['avg_current_A'] = current_A.mean()
    features['capacity_Ah'] = np.sum(current_A * dt_s) / 3600.0

    # Voltage features
    features['avg_voltage_V'] = voltage_V.mean()
    features['start_voltage_V'] = voltage_V.iloc[0]
    features['end_voltage_V'] = voltage_V.iloc[-1]
    features['delta_voltage_V'] = voltage_V.iloc[0] - voltage_V.iloc[-1]

    # Temperature features (NaN placeholders when there is no reading at all)
    temp_C = cycle_df['temp_battery_C'].dropna().reset_index(drop=True)
    if temp_C.empty:
        features['avg_temp_C'] = np.nan
        features['start_temp_C'] = np.nan
        features['end_temp_C'] = np.nan
        features['delta_temp_C'] = np.nan
        features['max_temp_C'] = np.nan
    else:
        features['avg_temp_C'] = temp_C.mean()
        features['start_temp_C'] = temp_C.iloc[0]
        features['end_temp_C'] = temp_C.iloc[-1]
        features['delta_temp_C'] = temp_C.iloc[-1] - temp_C.iloc[0] if len(temp_C) > 1 else 0
        features['max_temp_C'] = temp_C.max()

    # Energy features
    power_W = voltage_V * current_A
    features['avg_power_W'] = power_W.mean()
    features['energy_Wh'] = np.sum(power_W * dt_s) / 3600.0

    # Internal resistance from the voltage drop over the first ir_duration_s seconds
    elapsed_s = time_s - time_s.iloc[0]
    window_positions = np.flatnonzero((elapsed_s <= ir_duration_s).to_numpy())
    if len(window_positions) > 1:  # Need a start and an end sample inside the window
        end_pos = int(window_positions[-1])
        delta_v = voltage_V.iloc[0] - voltage_V.iloc[end_pos]
        i_avg = current_A.iloc[0:end_pos + 1].mean()
        # Require enough current and a voltage that dropped (or stayed level)
        if pd.notna(i_avg) and abs(i_avg) >= min_ir_current_a and delta_v >= 0:
            # NOTE(review): the threshold uses abs(i_avg) but the division keeps its sign,
            # so a negative current convention would yield a negative resistance — confirm.
            features['internal_resistance_ohm'] = delta_v / i_avg

    return features

In [21]:
def _select_initial_capacity(capacity, is_reference, fallback_ah):
    """Pick q_initial: first positive reference capacity, else the first cycle's capacity, else the fallback."""
    usable = capacity.notna() & (capacity > 0)
    reference_capacities = capacity[is_reference & usable]
    if not reference_capacities.empty:
        return reference_capacities.iloc[0]
    if not capacity.empty and usable.iloc[0]:
        return capacity.iloc[0]
    return fallback_ah


def calculate_soh_rul(processed_df, nominal_capacity_ah=None, soh_eol_threshold=None):
    """
    Calculates State of Health (SOH) and Remaining Useful Life (RUL) per battery.

    For each battery, with cycles ordered by 'cycle_number':
      * q_initial is the first positive capacity among its reference cycles. Without
        one it is the capacity of the first cycle, if positive, and otherwise
        `nominal_capacity_ah`.
      * SOH (%) of a reference cycle with a positive capacity is
        capacity / q_initial * 100. A non-reference cycle carries the most recent
        such value; SOH is NaN before the first one and for reference cycles
        without a positive capacity.
      * If any reference cycle has SOH below `soh_eol_threshold`, the lowest such
        cycle number is the end of life and 'RUL_cycles' is the number of cycles
        left until it, floored at 0. The 'RUL_cycles' column is only created when
        at least one battery reached end of life; batteries that did not keep NaN.

    Requires the columns 'battery_id', 'cycle_number', 'is_reference_cycle' and
    'capacity_Ah', and a unique index. The input frame is not modified.

    Args:
        processed_df (pd.DataFrame): DataFrame containing processed cycle data including reference cycles.
        nominal_capacity_ah (float, optional): Nominal capacity in Ah. Defaults to
            NOMINAL_CAPACITY_AH, read at call time.
        soh_eol_threshold (float, optional): SOH threshold for End of Life (EOL) in
            percent. Defaults to SOH_EOL_THRESHOLD_PERCENT, read at call time.

    Returns:
        pd.DataFrame: Copy of the input with added 'SOH_%' and 'q_initial_Ah'
            columns (and 'RUL_cycles' when end of life was observed).
    """
    # Resolved per call so edits to the configuration cell apply without redefining this function
    if nominal_capacity_ah is None:
        nominal_capacity_ah = NOMINAL_CAPACITY_AH
    if soh_eol_threshold is None:
        soh_eol_threshold = SOH_EOL_THRESHOLD_PERCENT

    df = processed_df.copy()
    df['SOH_%'] = np.nan
    df['q_initial_Ah'] = np.nan

    # Group the untouched input; results are written into the copy by index label
    for _, batt_rows in processed_df.groupby('battery_id', sort=False):
        batt = batt_rows.sort_values('cycle_number')
        is_reference = batt['is_reference_cycle'].astype(bool)
        capacity = batt['capacity_Ah']

        q_initial = _select_initial_capacity(capacity, is_reference, nominal_capacity_ah)
        df.loc[batt.index, 'q_initial_Ah'] = q_initial

        # SOH is measured on reference cycles with a positive capacity only
        measured = is_reference & capacity.notna() & (capacity > 0) & (q_initial > 0)
        reference_soh = ((capacity / q_initial) * 100.0).where(measured)
        # Reference rows keep their own value (possibly NaN); other rows carry the last measured one
        soh = reference_soh.where(is_reference, reference_soh.ffill())
        df.loc[batt.index, 'SOH_%'] = soh

        # RUL: counted back from the first reference cycle below the EOL threshold
        below_threshold = measured & (reference_soh < soh_eol_threshold)
        if below_threshold.any():
            eol_cycle_number = batt.loc[below_threshold, 'cycle_number'].min()
            if 'RUL_cycles' not in df.columns:
                df['RUL_cycles'] = np.nan
            remaining = (eol_cycle_number - batt['cycle_number']).clip(lower=0)  # RUL cannot be negative
            df.loc[batt.index, 'RUL_cycles'] = remaining.astype(float)
        # else: EOL not reached, RUL stays NaN / the column is not created

    return df

In [None]:
def _process_battery_file(file_path, battery_id):
    """Run load -> segment -> filter -> features -> SOH for one CSV; return its non-reference cycles or None."""
    raw_df = load_battery_data(file_path)
    if raw_df is None or raw_df.empty:
        print(f"    Skipping {battery_id} due to loading error or empty data.")
        return None

    all_segmented_cycles = segment_discharge_cycles(raw_df, battery_id)
    if not all_segmented_cycles:
        print(f"    No discharge cycles segmented for {battery_id}.")
        return None

    valid_discharge_cycles = filter_invalid_cycles(all_segmented_cycles, battery_id)
    if not valid_discharge_cycles:
        print(f"    No valid discharge cycles after filtering for {battery_id}.")
        return None

    # Keep only cycles for which a capacity could be computed
    extracted = (extract_cycle_features(cycle_info) for cycle_info in valid_discharge_cycles)
    battery_cycle_features_list = [
        features for features in extracted if pd.notna(features.get('capacity_Ah'))
    ]
    if not battery_cycle_features_list:
        print(f"    No features extracted for any valid cycle of {battery_id}.")
        return None

    battery_df_with_soh_q_initial = calculate_soh_rul(pd.DataFrame(battery_cycle_features_list))

    # Reference cycles were needed for SOH/q_initial but are not part of the output
    is_reference = battery_df_with_soh_q_initial['is_reference_cycle'].astype(bool)
    non_ref_cycles_df = battery_df_with_soh_q_initial[~is_reference].copy()
    if non_ref_cycles_df.empty:
        print(f"    No non-reference cycles with features for {battery_id}.")
        return None

    # Renumber so the remaining cycles are consecutive from 0
    non_ref_cycles_df['cycle_number'] = range(len(non_ref_cycles_df))
    return non_ref_cycles_df


def process_battery_dataset(root_dir, folders_list, single_battery_id=None):
    """
    Processes battery data files in specified folders.
    Can process all batteries or a single specified battery.
    Extracts information from reference cycles but excludes them from final output.

    Files are visited in sorted name order within each folder so the row order of
    the result is reproducible. The battery ID is the file name with its ".csv"
    extension removed.

    In single-battery mode the search stops at the first matching file. An empty
    DataFrame is returned if that file yields no usable cycles or no file matches.

    Args:
        root_dir (str): Root directory containing battery data folders.
        folders_list (list): List of folder names to process.
        single_battery_id (str, optional): If provided, only process the battery with this ID (filename without .csv). Defaults to None (process all).

    Returns:
        pd.DataFrame: Processed battery cycle data (excluding reference cycles),
            without the 'RUL_cycles' and 'is_reference_cycle' columns.
    """
    all_batteries_processed_dfs = []
    found_single_battery = False

    for folder_name in folders_list:
        if found_single_battery and single_battery_id:  # Target battery already handled
            break
        folder_path = os.path.join(root_dir, folder_name)
        if not os.path.isdir(folder_path):
            print(f"Warning: Folder not found {folder_path}. Skipping.")
            continue

        if not single_battery_id:  # Only print folder if processing all
            print(f"\nProcessing folder: {folder_name}")

        # os.listdir returns names in arbitrary order; sort for a reproducible result
        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.endswith(".csv"):
                continue
            # splitext removes only the extension, unlike replacing every ".csv" occurrence
            current_battery_id = os.path.splitext(file_name)[0]

            if single_battery_id and current_battery_id != single_battery_id:
                continue  # Skip if not the target single battery

            found_single_battery = True
            file_path = os.path.join(folder_path, file_name)

            print(f"  Processing battery: {current_battery_id}")
            battery_cycles_df = _process_battery_file(file_path, current_battery_id)
            if battery_cycles_df is None:
                if single_battery_id:  # The only requested battery failed: nothing to return
                    return pd.DataFrame()
                continue

            all_batteries_processed_dfs.append(battery_cycles_df)

            if single_battery_id:  # Target battery processed; stop scanning this folder
                break

    if single_battery_id and not found_single_battery:
        print(f"Warning: Specified single battery ID '{single_battery_id}' not found in any folder.")
        return pd.DataFrame()

    if not all_batteries_processed_dfs:
        if not single_battery_id:  # Only print general message if processing all
            print("No battery data successfully processed from any folder.")
        return pd.DataFrame()

    final_df = pd.concat(all_batteries_processed_dfs, ignore_index=True)
    return final_df.drop(columns=['RUL_cycles', 'is_reference_cycle'], errors='ignore')

In [None]:
# --- Main Execution ---

# Which battery to process:
#   * a battery ID (the CSV file name without ".csv", e.g. "battery00") -> only that battery
#   * None -> every battery found in FOLDERS_TO_PROCESS
PROCESS_SINGLE_BATTERY_ID = "battery00"

if DATASET_ROOT_DIRECTORY != "path_to_your_dataset_directory" and os.path.exists(DATASET_ROOT_DIRECTORY):
    print(
        f"Attempting to process single battery: {PROCESS_SINGLE_BATTERY_ID}"
        if PROCESS_SINGLE_BATTERY_ID
        else f"Starting battery data processing for all batteries from: {DATASET_ROOT_DIRECTORY}"
    )

    master_cycle_df = process_battery_dataset(
        DATASET_ROOT_DIRECTORY,
        FOLDERS_TO_PROCESS,
        single_battery_id=PROCESS_SINGLE_BATTERY_ID,
    )

    if not master_cycle_df.empty:
        print("\n\n--- Master DataFrame Head (Non-Reference Cycles) ---")
        display_cols = ['battery_id', 'cycle_number', 'capacity_Ah', 'SOH_%',
                        'internal_resistance_ohm', 'avg_temp_C', 'discharge_duration_s', 'q_initial_Ah']
        # Show only the summary columns that actually exist in the result
        display_cols_present = [col for col in display_cols if col in master_cycle_df.columns]
        print(master_cycle_df[display_cols_present].head())

        print(f"\nMaster DataFrame shape: {master_cycle_df.shape}")

        unique_batteries = master_cycle_df['battery_id'].nunique()
        print(f"Unique batteries in final DataFrame: {unique_batteries}")

        if unique_batteries > 0:
            # First battery in the frame (the only one in single-battery mode)
            sample_battery_id = master_cycle_df['battery_id'].unique()[0]
            sample_battery_stats = master_cycle_df.loc[
                master_cycle_df['battery_id'] == sample_battery_id,
                ['cycle_number', 'SOH_%', 'internal_resistance_ohm', 'q_initial_Ah'],
            ]
            print(f"\n--- SOH, IR, Q_initial for battery {sample_battery_id} (first 5 cycles) ---")
            print(sample_battery_stats.head())

        print("\nProcessing complete. DataFrame 'master_cycle_df' is ready.")
        print("Run the next cell to optionally save the DataFrame to CSV.")
    elif PROCESS_SINGLE_BATTERY_ID:
        print(f"\nNo data was processed for battery '{PROCESS_SINGLE_BATTERY_ID}'. Check ID, file, and logs.")
    else:
        print("\nNo data was processed for any battery. Check logs and dataset path.")
else:
    # Dataset path is still the placeholder or does not exist: explain and leave an empty result
    print(
        "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
        "!!! PLEASE SET THE `DATASET_ROOT_DIRECTORY` in Cell 2 to your actual dataset path !!!",
        "!!! Example: DATASET_ROOT_DIRECTORY = '/path/to/your/battery_alt_dataset'        !!!",
        "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
        sep="\n",
    )
    master_cycle_df = pd.DataFrame()

Starting battery data processing from: battery_alt_dataset

Processing folder: regular_alt_batteries
  Processing battery: battery00
  Processing battery: battery01


KeyboardInterrupt: 

In [None]:
# --- Optional: Save Processed Data ---
# Writes 'master_cycle_df' (built by the processing cell above) to a CSV file.
# Skip this cell if you do not need the file.

output_path = "processed_battery_cycle_data.csv"

if 'master_cycle_df' not in locals():
    print("Variable 'master_cycle_df' not found. Please run the previous processing cell first.")
elif master_cycle_df.empty:
    print("The 'master_cycle_df' is empty. Nothing to save.")
else:
    try:
        master_cycle_df.to_csv(output_path, index=False)
    except Exception as e:
        print(f"Error saving processed data to {output_path}: {e}")
    else:
        print(f"Processed data successfully saved to: {output_path}")

# Note: model training can then read this CSV from a separate script or notebook (e.g. 'train_models.py').