# NASA Battery Data Analysis
This notebook processes and visualizes NASA battery test data.

## 1. Import Libraries
We load the necessary libraries for data processing and visualization.

In [1]:
pip install numpy matplotlib plotly nbformat

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import nbformat
import plotly.graph_objects as go
from ipywidgets import Dropdown, VBox, Output
import re
import os
from scipy.signal import find_peaks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim

#### Read and parse the metadata.csv file containing battery test data.

In [3]:
def read_battery_metadata(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    
    headers = lines[0].strip().split(',')
    data = []
    
    for line in lines[1:]:
        values = line.strip().split(',')
        if len(values) >= len(headers):
            row_dict = {}
            for i, header in enumerate(headers):
                if i < len(values):
                    if header == 'start_time' and '[' in values[i]:
                        time_str = ''.join(values[i:i+6])
                        time_values = re.findall(r'[-+]?\d*\.\d+|\d+', time_str)
                        if len(time_values) >= 6:
                            try:
                                year = int(float(time_values[0]))
                                month = int(float(time_values[1]))
                                day = int(float(time_values[2]))
                                hour = int(float(time_values[3]))
                                minute = int(float(time_values[4]))
                                second = float(time_values[5])
                                datetime_str = f"{year}-{month:02d}-{day:02d} {hour:02d}:{minute:02d}:{int(second):02d}"
                                row_dict[header] = datetime_str
                            except (ValueError, IndexError):
                                row_dict[header] = None
                        else:
                            row_dict[header] = None
                    else:
                        try:
                            value = values[i].strip()
                            if value and value != 'nan':
                                try:
                                    row_dict[header] = float(value)
                                except ValueError:
                                    row_dict[header] = value
                            else:
                                row_dict[header] = None
                        except:
                            row_dict[header] = None
            data.append(row_dict)
    
    return pd.DataFrame(data)

## 2. Extract and Analyze EIS Data
Filtering and structuring electrochemical impedance spectroscopy (EIS) measurements.

#### Extract EIS (Electrochemical Impedance Spectroscopy) data from the metadata.

In [4]:
def extract_eis_data(metadata_df):
    # Same as before
    eis_df = metadata_df[metadata_df['type'] == 'impedance'].copy()
    for col in ['Re', 'Rct']:
        if col in eis_df.columns:
            eis_df[col] = pd.to_numeric(eis_df[col], errors='coerce')
    eis_df = eis_df.sort_values(by=['battery_id', 'test_id'])
    eis_df['cycle'] = eis_df.groupby('battery_id')['test_id'].transform(
        lambda x: (x - x.min()) // 2
    )
    return eis_df

#### Extract EIS (Electrochemical Impedance Spectroscopy) data from the metadata.

In [5]:
def create_interactive_plot_with_lines(eis_df):
    # Get unique battery IDs
    batteries = sorted(eis_df['battery_id'].unique())
    
    # Create figure
    fig = go.Figure()
    
    # Add traces for each battery
    for battery in batteries:
        battery_data = eis_df[eis_df['battery_id'] == battery].sort_values(by='cycle')  # Sort by cycle
        fig.add_trace(
            go.Scatter3d(
                x=battery_data['Re'],
                y=-battery_data['Rct'],  # Negate for conventional EIS display
                z=battery_data['cycle'],
                mode='lines+markers',  # Add both lines and markers
                marker=dict(size=5, color='blue', opacity=0.8),  # Marker color (e.g., red)
                line=dict(color='blue', width=2),  # Line color (e.g., blue)
                name=f"Battery {battery}",
                visible=(battery == batteries[0]),  # Show first battery by default
                hovertemplate=(
                    "Re(Z): %{x:.2f} kΩ<br>" +
                    "-Rct: %{y:.2f} kΩ<br>" +
                    "Cycle: %{z}<extra></extra>"
                )  # Hover template with units
             )
        )
    
    # Dropdown menu for battery selection
    buttons = []
    for i, battery in enumerate(batteries):
        visibility = [False] * len(batteries)
        visibility[i] = True
        buttons.append(
            dict(
                label=f"Battery {battery}",
                method="update",
                args=[{"visible": visibility}, {"title": f"EIS Data for Battery {battery}"}]
            )
        )
    
    fig.update_layout(
        updatemenus=[
            dict(
                buttons=buttons,
                direction="down",
                pad={"r": 10, "t": 10},
                showactive=True,
                x=0.1,
                xanchor="left",
                y=1.1,
                yanchor="top"
            )
        ],
        scene=dict(
            xaxis_title="Re(Z) (kΩ)",  # X-axis with units
            yaxis_title="-Rct (kΩ)",   # Y-axis with units
            zaxis_title="Cycle Number"  # Z-axis (no units needed)
        ),
        title="Interactive 3D EIS Plot with Units and Connecting Lines",
        width=800,
        height=600
    )
    
    fig.show()

#### Analyze all batteries and create an interactive 3D plot with units and connecting lines.

In [6]:
def analyze_all_batteries_interactive_with_lines(file_path):
    metadata_df = read_battery_metadata(file_path)
    
    battery_count = metadata_df['battery_id'].nunique()
    
    eis_df = extract_eis_data(metadata_df)
    
    batteries = sorted(metadata_df['battery_id'].unique())
    
    # Create interactive plot with units and lines
    create_interactive_plot_with_lines(eis_df)

In [7]:
metadata_file = "metadata.csv"
analyze_all_batteries_interactive_with_lines(metadata_file)

## 3. Save and Display Results
Saving plots for further analysis and displaying interactive visualizations.

### Save plots as images

In [8]:
!python -m battery_plots

Reading metadata from metadata.csv...
Found data for 34 batteries
Extracted 1956 EIS measurements
Batteries in dataset: B0005, B0006, B0007, B0018, B0025, B0026, B0027, B0028, B0029, B0030, B0031, B0032, B0033, B0034, B0036, B0038, B0039, B0040, B0041, B0042, B0043, B0044, B0045, B0046, B0047, B0048, B0049, B0050, B0051, B0052, B0053, B0054, B0055, B0056
Generating plot for battery B0005...
Generating plot for battery B0006...
Generating plot for battery B0007...
Generating plot for battery B0018...
Generating plot for battery B0025...
Generating plot for battery B0026...
Generating plot for battery B0027...
Generating plot for battery B0028...
Generating plot for battery B0029...
Generating plot for battery B0030...
Generating plot for battery B0031...
Generating plot for battery B0032...
Generating plot for battery B0033...
Generating plot for battery B0034...
Generating plot for battery B0036...
Generating plot for battery B0038...
Generating plot for battery B0039...
Generating plo

## 4.Reading Metadata for Task 2

In [19]:
def read_metadata(file_path):
    return pd.read_csv(file_path)

#### Read and process battery data from the specified folder.

In [91]:
def read_battery_data(data_folder, metadata_df):
    battery_data = {}
    required_columns = {'Voltage_measured', 'Current_measured', 'Time'}
    
    for _, row in metadata_df.iterrows():
        battery_id = row['battery_id']
        filename = row['filename']
        file_path = os.path.join(data_folder, filename)
        
        # Skip if file doesn't exist or has already been processed
        if not os.path.exists(file_path) or battery_id in battery_data:
            continue
        
        # Read the data
        try:
            df = pd.read_csv(file_path)
        except Exception:
            continue
        
        # Check for required columns
        if not required_columns.issubset(df.columns):
            continue
        
        # Group data by battery_id
        if battery_id not in battery_data:
            battery_data[battery_id] = []
        battery_data[battery_id].append(df)
    
    # Concatenate all data for each battery
    for battery_id, data_list in battery_data.items():
        battery_data[battery_id] = pd.concat(data_list, ignore_index=True)
    
    return battery_data

#### Compute incremental capacity (dQ/dV) from battery charge/discharge data.

In [21]:
def compute_dQ_dV(df):
    # Compute charge (Q) by integrating current over time
    time_diff = np.gradient(df['Time'])
    charge = np.cumsum(df['Current_measured'] * time_diff)
    
    # Compute dQ/dV using numerical differentiation
    dQ_dV = np.gradient(charge, df['Voltage_measured'])
    return pd.DataFrame({'Voltage_measured': df['Voltage_measured'], 'dQ_dV': dQ_dV})

#### Find peaks in the dQ/dV vs Voltage curve.

In [22]:
def find_peaks_in_dQ_dV(dqdv_df, prominence=0.1):
    voltages = dqdv_df['Voltage_measured'].values
    dqdv_values = dqdv_df['dQ_dV'].values
    peak_indices, _ = find_peaks(dqdv_values, prominence=prominence)
    return [(voltages[i], dqdv_values[i]) for i in peak_indices]

#### Create an interactive 3D plot with a dropdown menu to select which battery to display.

In [24]:
def create_interactive_3d_plot_with_dropdown(metadata_df, battery_data):
    """
    """
    # Get unique battery IDs and sort them
    batteries = sorted(metadata_df['battery_id'].unique())
    fig = go.Figure()
    traces = []
    
    # Generate traces for each battery
    for battery_id in batteries:
        if battery_id not in battery_data:
            continue
        
        # Compute dQ/dV and find peaks
        dqdv_df = compute_dQ_dV(battery_data[battery_id])
        peaks = find_peaks_in_dQ_dV(dqdv_df)
        
        # Prepare data for plotting
        voltages = [peak[0] for peak in peaks]
        heights = [peak[1] for peak in peaks]
        cycles = list(range(1, len(peaks) + 1))
        
        # Add trace for this battery
        traces.append(
            go.Scatter3d(
                x=voltages,
                y=heights,
                z=cycles,
                mode='markers',
                marker=dict(size=5, opacity=0.8),
                name=f"Battery {battery_id}",
                visible=False  # Initially hidden
            )
        )
    
    # Add all traces to the figure
    for trace in traces:
        fig.add_trace(trace)
    
    # Dropdown menu for battery selection
    buttons = []
    
    # Add a "Select" option as the first element in the dropdown
    buttons.append(
        dict(
            label="Select",
            method="update",
            args=[{"visible": [False] * len(traces)}, {"title": "Please select a battery from the dropdown menu"}]
        )
    )
    
    # Add buttons for each battery
    for i, battery_id in enumerate(batteries):
        if battery_id in battery_data:
            visibility = [False] * len(traces)
            visibility[i] = True
            buttons.append(
                dict(
                    label=f"Battery {battery_id}",
                    method="update",
                    args=[{"visible": visibility}, {"title": f"Peak Evolution for Battery {battery_id}"}]
                )
            )
    
    # Add a fallback button if no valid traces exist
    if not buttons:
        buttons.append(
            dict(
                label="No Data Available",
                method="update",
                args=[{"visible": [False] * len(traces)}, {"title": "No Data Available"}]
            )
        )
    
    # Update layout with dropdown menu
    fig.update_layout(
        updatemenus=[
            dict(
                buttons=buttons,
                direction="down",
                pad={"r": 10, "t": 10},
                showactive=True,
                x=0.1,
                xanchor="left",
                y=1.1,
                yanchor="top"
            )
        ],
        scene=dict(
            xaxis_title="Voltage (V)",
            yaxis_title="Peak Height (dQ/dV)",
            zaxis_title="Cycle Number"
        ),
        title="Please select a battery from the dropdown menu",  # Default title
        width=800,
        height=600
    )
    
    # Show the figure
    fig.show()

#### Analyze battery data and create an interactive 3D plot with dropdown for battery selection.

In [25]:
def analyze_battery_with_dropdown(metadata_file, data_folder):
    metadata_df = read_metadata(metadata_file)
    battery_data = read_battery_data(data_folder, metadata_df)
    create_interactive_3d_plot_with_dropdown(metadata_df, battery_data)

In [26]:
if __name__ == "__main__":
    metadata_file = "metadata.csv"
    data_folder = "data/"
    analyze_battery_with_dropdown(metadata_file, data_folder)

## 5. Building machine learning model.

In [16]:
# Set device to GPU if available, else fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


#### Data Preparation

In [None]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def read_metadata(file_path):
    return pd.read_csv(file_path)

##### Read and process battery data from the specified folder.

In [83]:
def read_battery_data(data_folder, metadata_df):
    battery_data = {}
    required_columns = {'Voltage_measured', 'Current_measured', 'Time'}
    
    for _, row in metadata_df.iterrows():
        battery_id = row['battery_id']
        filename = row['filename']
        file_path = os.path.join(data_folder, filename)
        
        # Skip if file doesn't exist or has already been processed
        if not os.path.exists(file_path) or battery_id in battery_data:
            continue
        
        # Read the data
        try:
            df = pd.read_csv(file_path)
        except Exception:
            continue
        
        # Check for required columns
        if not required_columns.issubset(df.columns):
            continue
        
        # Group data by battery_id
        if battery_id not in battery_data:
            battery_data[battery_id] = []
        battery_data[battery_id].append(df)
    
    # Concatenate all data for each battery
    for battery_id, data_list in battery_data.items():
        battery_data[battery_id] = pd.concat(data_list, ignore_index=True)
    
    return battery_data

##### Compute incremental capacity (dQ/dV) from battery charge/discharge data.

In [None]:
def compute_dQ_dV(df):
    # Compute charge (Q) by integrating current over time
    time_diff = np.gradient(df['Time'])
    charge = np.cumsum(df['Current_measured'] * time_diff)
    
    # Compute dQ/dV using numerical differentiation
    dQ_dV = np.gradient(charge, df['Voltage_measured'])
    return pd.DataFrame({'Voltage_measured': df['Voltage_measured'], 'dQ_dV': dQ_dV})

##### Find peaks in the dQ/dV vs Voltage curve.

In [84]:
def find_peaks_in_dQ_dV(dqdv_df, prominence=0.1):
    """
    """
    voltages = dqdv_df['Voltage_measured'].values
    dqdv_values = dqdv_df['dQ_dV'].values
    peak_indices, _ = find_peaks(dqdv_values, prominence=prominence)
    return [(voltages[i], dqdv_values[i]) for i in peak_indices]

In [85]:
metadata_file = "metadata.csv"
data_folder = "data/"

# Step 1: Read metadata
metadata_df = read_metadata(metadata_file)

# Step 2: Read battery data
battery_data = read_battery_data(data_folder, metadata_df)

# Step 3: Process data for each battery
processed_data = {}
for battery_id, df in battery_data.items():
    dqdv_df = compute_dQ_dV(df)
    peaks = find_peaks_in_dQ_dV(dqdv_df)
    processed_data[battery_id] = {
        'dqdv_df': dqdv_df,
        'peaks': peaks
    }

print("Data preparation complete.")

Data preparation complete.


##### Step 1: Parse complex numbers

In [86]:
def parse_complex(value):
    """
    Parse a string representation of a complex number and return its real part.
    
    Args:
        value (str): String representation of a complex number (e.g., "(0.06-0.03j)")
        
    Returns:
        float: Real part of the complex number, or NaN if parsing fails
    """
    try:
        return complex(value).real
    except (ValueError, TypeError):
        return np.nan

##### Step 2: Extract features

Extract features from metadata and battery data for machine learning. 
   Args:
       metadata_df (pd.DataFrame): Metadata DataFrame
       battery_data (dict): Dictionary containing battery data######        
   Returns:
       pd.DataFrame: Feature DataFrame with columns [Re, Rct, num_peaks, mean_peak_height, Capacity]

In [87]:
def extract_features(metadata_df, battery_data):
    features = []
    
    # Group metadata by battery_id
    grouped_metadata = metadata_df.groupby('battery_id')
    
    for battery_id, group in grouped_metadata:
        # Skip if battery data is missing
        if battery_id not in battery_data:
            continue
        
        # Get the most recent impedance data (Re, Rct)
        impedance_data = group[group['type'] == 'impedance']
        if impedance_data.empty:
            continue
        latest_impedance = impedance_data.iloc[-1]  # Use the most recent impedance test
        
        # Parse Re and Rct (handle complex numbers)
        Re = parse_complex(latest_impedance['Re'])
        Rct = parse_complex(latest_impedance['Rct'])
        
        # Skip if Re or Rct is NaN
        if pd.isna(Re) or pd.isna(Rct):
            continue
        
        # Get the most recent discharge data (Capacity)
        discharge_data = group[group['type'] == 'discharge']
        if discharge_data.empty:
            continue
        latest_discharge = discharge_data.iloc[-1]  # Use the most recent discharge test
        
        # Validate Capacity (ensure it's numeric and not NaN)
        try:
            Capacity = float(latest_discharge['Capacity'])  # Convert to float
            if pd.isna(Capacity):  # Skip if Capacity is NaN
                continue
        except (ValueError, TypeError):
            continue
        
        # Compute dQ/dV and find peaks
        df = battery_data[battery_id]
        dqdv_df = compute_dQ_dV(df)
        peaks = find_peaks_in_dQ_dV(dqdv_df)
        
        # Extract features from peaks
        num_peaks = len(peaks)
        peak_heights = [peak[1] for peak in peaks]
        mean_peak_height = np.mean(peak_heights) if peak_heights else 0
        
        # Append features to the list
        features.append({
            'battery_id': battery_id,
            'test_id': latest_discharge['test_id'],
            'Re': Re,
            'Rct': Rct,
            'num_peaks': num_peaks,
            'mean_peak_height': mean_peak_height,
            'Capacity': Capacity
        })
    
    # Convert features to a DataFrame
    feature_df = pd.DataFrame(features)
    
    # Drop rows with missing values
    feature_df = feature_df.dropna()
    
    return feature_df

##### Step 3: Split the data

In [88]:
def split_data(feature_df, test_size=0.2, random_state=42):
    X = feature_df[['Re', 'Rct', 'num_peaks', 'mean_peak_height']]
    y = feature_df['Capacity']
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

##### Step 4: Train and save the best model

In [89]:
def finalize_and_save_model(X_train, X_test, y_train, y_test):
    try:
        # Combine training and testing data
        X_full = pd.concat([X_train, X_test], axis=0)
        y_full = pd.concat([y_train, y_test], axis=0)
        
        # Initialize and train the best model (Gradient Boosting Regressor)
        best_model = GradientBoostingRegressor(random_state=42)
        best_model.fit(X_full, y_full)
        
        # Save the trained model to a file
        model_name = "gradient_boosting_regressor"
        model_filename = f"{model_name}_model.pkl"
        joblib.dump(best_model, model_filename)
        
        # Display success message
        print(f"Model successfully created: {model_name}")
        print(f"Model saved to {model_filename}")
        
        # Make predictions on the full dataset
        y_pred = best_model.predict(X_full)
        
        # Evaluate the model
        mae = mean_absolute_error(y_full, y_pred)
        mse = mean_squared_error(y_full, y_pred)
        r2 = r2_score(y_full, y_pred)
        
        # Print evaluation metrics
        print("\nFinal Model Evaluation on Full Dataset:")
        print(f"Mean Absolute Error (MAE): {mae:.4f}")
        print(f"Mean Squared Error (MSE): {mse:.4f}")
        print(f"R-squared (R2): {r2:.4f}")
    
    except Exception as e:
        print("An error occurred while creating the model:")
        print(f"Error: {e}")

In [90]:
metadata_file = "metadata.csv"
data_folder = "data/"

# Step 1: Read metadata and battery data
metadata_df = read_metadata(metadata_file)
battery_data = read_battery_data(data_folder, metadata_df)

# Step 2: Extract features
feature_df = extract_features(metadata_df, battery_data)

# Step 3: Split the data
X_train, X_test, y_train, y_test = split_data(feature_df)

# Step 4: Finalize the model and make predictions
finalize_and_save_model(X_train, X_test, y_train, y_test)

Model successfully created: gradient_boosting_regressor
Model saved to gradient_boosting_regressor_model.pkl

Final Model Evaluation on Full Dataset:
Mean Absolute Error (MAE): 0.0111
Mean Squared Error (MSE): 0.0002
R-squared (R2): 0.9990


#### Conclusion
##### Model created and saved as .pkl file