<a href="https://colab.research.google.com/github/sidharthdk/BMS-Data-pre-processing-files/blob/main/Calce_Dataset_cleaning_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import kagglehub
import os
import pandas as pd
import numpy as np
import zipfile
from pathlib import Path

# Download the Kaggle dataset (sidharthdk/calce-dataset-battery-archive from notebook)
dataset_path = kagglehub.dataset_download("sidharthdk/calce-dataset-battery-archive")
print(f"Dataset downloaded to: {dataset_path}")
raw_dir = Path(dataset_path)
clean_dir = Path("cleaned")
zip_filename = "cleaned_battery_data.zip"

clean_dir.mkdir(exist_ok=True)

# Configuration matching the notebook
CONFIG = {
    'voltage_min': 0.0,  # Allow full range
    'voltage_max': 5.0,
    'interp_max_gap': 10,  # More forgiving gaps
    'drop_bad_cycles': True  # Auto-drop bad ones
}

# Utility functions from the notebook
def standardize_columns(df):
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('__+', '_', regex=False).str.replace('[^a-z0-9_]', '', regex=True)
    return df

def enforce_numeric(df):
    for c in df.columns:
        if c != 'cycle_index':
            df[c] = pd.to_numeric(df[c], errors='coerce')
    return df

def remove_empty_columns(df):
    return df.dropna(axis=1, how='all')

def drop_duplicates_early(df):
    return df.drop_duplicates()

def validate_columns(df):
    required = ['cycle_index', 'test_time_s', 'voltage_v', 'current_a']
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing critical columns: {missing}")

def handle_missing(df, config):
    df = df.sort_values(['cycle_index', 'test_time_s'])
    signal_cols = [c for c in df.columns if c in ['voltage_v', 'current_a', 'cell_temperature_c', 'environment_temperature_c']]
    state_cols = [c for c in df.columns if 'capacity' in c or 'energy' in c]
    if signal_cols:
        df[signal_cols] = df[signal_cols].interpolate(method='linear', limit=config['interp_max_gap'], limit_direction='both')
    if state_cols:
        df[state_cols] = df[state_cols].ffill()
    # Removed df = df.dropna() to investigate aggressive dropping
    return df

def enforce_physics(df, config):
    if 'voltage_v' in df.columns:
        df = df[(df['voltage_v'] >= config['voltage_min']) & (df['voltage_v'] <= config['voltage_max'])]
    for col in ['charge_capacity_ah', 'discharge_capacity_ah', 'charge_energy_wh', 'discharge_energy_wh']:
        if col in df.columns:
            df = df[df[col] >= 0]
    return df

def remove_outliers_cycle_wise(df):
    if df.empty:
        return df
    cleaned = []
    for _, cdf in df.groupby('cycle_index'):
        current_cdf = cdf.copy()
        for col in ['charge_capacity_ah', 'discharge_capacity_ah']:
            if col in current_cdf.columns and len(current_cdf) >= 4:
                Q1 = current_cdf[col].quantile(0.25)
                Q3 = current_cdf[col].quantile(0.75)
                IQR = Q3 - Q1
                current_cdf = current_cdf[
                    (current_cdf[col] >= Q1 - 1.5 * IQR) &
                    (current_cdf[col] <= Q3 + 1.5 * IQR)
                ]
        if not current_cdf.empty:
            cleaned.append(current_cdf)
    if not cleaned:
        return pd.DataFrame(columns=df.columns)
    return pd.concat(cleaned, ignore_index=True)

def verify_cycles(df, config):
    bad = df.groupby('cycle_index')['test_time_s'].apply(lambda x: not x.is_monotonic_increasing)
    bad_cycles = bad[bad].index.tolist()
    if bad_cycles:
        if config['drop_bad_cycles']:
            df = df[~df['cycle_index'].isin(bad_cycles)]
        else:
            raise ValueError(f"Non-monotonic time in cycles: {bad_cycles}")
    return df

# Main processing loop
processed_files_count = 0
csv_files = list(raw_dir.rglob("*.csv"))
print(f"Found {len(csv_files)} CSV files in dataset.")

for csv_file in csv_files:
    fname = csv_file.name
    print(f"Processing {fname}...")
    try:
        df = pd.read_csv(csv_file)
        print(f"  Shape before: {df.shape}")

        # Basic cleaning for all files
        df = standardize_columns(df)
        df = remove_empty_columns(df)
        df = enforce_numeric(df)
        df = drop_duplicates_early(df)

        is_time_series = 'timeseries' in fname.lower()
        is_cycle_data = 'cycledata' in fname.lower()

        if is_time_series:
            # Full cleaning pipeline for timeseries
            validate_columns(df)
            print(f"  Shape after validate_columns: {df.shape}")
            df = handle_missing(df, CONFIG)
            print(f"  Shape after handle_missing: {df.shape}")
            df = enforce_physics(df, CONFIG)
            print(f"  Shape after enforce_physics: {df.shape}")
            df = remove_outliers_cycle_wise(df)
            print(f"  Shape after remove_outliers_cycle_wise: {df.shape}")
            df = verify_cycles(df, CONFIG)
            print(f"  Shape after verify_cycles: {df.shape}")
            df = df.drop_duplicates()
            print("  Full cleaning applied for timeseries file")
        elif is_cycle_data:
            # Basic cleaning only for cycledata
            if 'cycle_index' not in df.columns:
                raise ValueError("Missing cycle_index for cycle data file, cannot process.")
            print("  Basic cleaning applied for cycledata file, skipping advanced steps")
        else:
            # Attempt full cleaning for unknown types
            try:
                validate_columns(df)
                print(f"  Shape after validate_columns (unknown): {df.shape}")
                df = handle_missing(df, CONFIG)
                print(f"  Shape after handle_missing (unknown): {df.shape}")
                df = enforce_physics(df, CONFIG)
                print(f"  Shape after enforce_physics (unknown): {df.shape}")
                df = remove_outliers_cycle_wise(df)
                print(f"  Shape after remove_outliers_cycle_wise (unknown): {df.shape}")
                df = verify_cycles(df, CONFIG)
                print(f"  Shape after verify_cycles (unknown): {df.shape}")
                df = df.drop_duplicates()
                print("  Attempted full cleaning for unknown file type")
            except:
                print("  Skipped advanced cleaning due to errors")

        if df.empty:
            print(f"  Warning: Cleaned DataFrame for {fname} is empty. Skipping save.")
            continue

        print(f"  Shape after: {df.shape}")
        out_path = clean_dir / f"{fname.replace('.csv', '_cleaned.csv')}"
        df.to_csv(out_path, index=False)
        processed_files_count += 1
        print(f"  Saved: {out_path.name}")

    except ValueError as ve:
        print(f"  Failed on {fname} due to validation/processing error: {ve}")
    except Exception as e:
        print(f"  Failed on {fname}: {e}")

print(f"Cleaning complete. Processed {processed_files_count} CSV files.")

# Create ZIP
print(f"Zipping cleaned files to {zip_filename}...")
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for file_path in clean_dir.rglob("*"):
        if file_path.is_file():
            arcname = file_path.relative_to(clean_dir)
            zipf.write(file_path, arcname)
print(f"All cleaned files zipped to {zip_filename}")
print("Zip saved to local storage. Download or use as needed.")

Using Colab cache for faster access to the 'calce-dataset-battery-archive' dataset.
Dataset downloaded to: /kaggle/input/calce-dataset-battery-archive
Found 14 CSV files in dataset.
Processing CALCE_CX2-36_prism_LCO_25C_0-100_0.5-0.5C_f_cycle_data.csv...
  Shape before: (1971, 12)
  Skipped advanced cleaning due to errors
  Shape after: (1971, 10)
  Saved: CALCE_CX2-36_prism_LCO_25C_0-100_0.5-0.5C_f_cycle_data_cleaned.csv
Processing CALCE_CX2-25_prism_LCO_25C_0-100_0.5-0.5C_b_cycle_data.csv...
  Shape before: (1815, 12)
  Skipped advanced cleaning due to errors
  Shape after: (1815, 10)
  Saved: CALCE_CX2-25_prism_LCO_25C_0-100_0.5-0.5C_b_cycle_data_cleaned.csv
Processing CALCE_CX2-37_prism_LCO_25C_0-100_0.5-0.5C_g_timeseries.csv...
  Shape before: (376992, 11)
  Shape after validate_columns: (376992, 9)
  Shape after handle_missing: (376992, 9)
  Shape after enforce_physics: (376992, 9)
  Shape after remove_outliers_cycle_wise: (339493, 9)
  Shape after verify_cycles: (339493, 9)
  Fu

In [5]:
from google.colab import files

zip_filename = "cleaned_battery_data.zip"
if os.path.exists(zip_filename):
  files.download(zip_filename)
else:
  print(f"Error: {zip_filename} not found. Please ensure the cleaning and zipping process has completed successfully.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>