<a href="https://colab.research.google.com/github/sidharthdk/BMS-Data-pre-processing-files/blob/main/Michigan_Expansion_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Paste this complete code block into a Colab cell (run sequentially if split)
# Step 1: Install & setup Kaggle API
!pip install -q kagglehub

# Step 2: Login (run this cell, enter username/token from kaggle.com/account)
import kagglehub
kagglehub.login()

# Step 3: Download dataset (using hash/ID - works for private/public)
# Note: Hash identifies specific version; downloads to ~/.cache/kagglehub/...
# TODO: Replace the placeholder hash with the actual 'owner/dataset-name' for the dataset you want to download.
# For example: dataset_path = kagglehub.dataset_download("kaggle/titanic")
dataset_path = kagglehub.dataset_download("sidharthdk/michigan-expansion-battery-dataset")
print(f"Dataset path: {dataset_path}")

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Downloading from https://www.kaggle.com/api/v1/datasets/download/sidharthdk/michigan-expansion-battery-dataset?dataset_version_number=1...


100%|██████████| 83.9M/83.9M [00:00<00:00, 160MB/s]

Extracting files...





Dataset path: /root/.cache/kagglehub/datasets/sidharthdk/michigan-expansion-battery-dataset/versions/1
Kaggle credentials set.


In [11]:
# ULTRA-SIMPLE FULL PIPELINE - SINGLE PASTE, NO FUNCTIONS/MISSING VARS
# Copy-paste this ENTIRE block into a NEW Colab cell & RUN

# Reset
!rm -rf /content/cleaned /content/cleaned_*.zip
import pandas as pd
import numpy as np
import zipfile
from pathlib import Path
clean_dir = Path("/content/cleaned")
clean_dir.mkdir(exist_ok=True)
zip_fn = "/content/cleaned_mich.zip"

# Assume raw_dir from prior (adjust if needed)
raw_dir = Path("/root/.cache/kagglehub/datasets/cdb0f2242f9302d1c50857eb0fed57a01919e19d65f163fc8b56c5e4d7db8865/versions/1")
CONFIG = {'gap':5, 'vmin':2.7, 'vmax':4.2}

csvs = list(raw_dir.rglob("*.csv"))
print(f"Cleaning {len(csvs)} files...")

for i, fpath in enumerate(csvs):
    print(f"{i+1}/{len(csvs)} {fpath.name}")
    try:
        df = pd.read_csv(fpath)
        print(f"  Shape: {df.shape}")

        # INLINE CLEANING - no functions!
        df.columns = df.columns.str.lower().str.replace(r'[^a-z0-9_]', '', regex=True)
        df = df.apply(pd.to_numeric, errors='coerce')
        df = df.dropna(axis=1, how='all').dropna(how='all')
        df = df.drop_duplicates()

        # Interpolate
        numcols = df.select_dtypes(np.number).columns
        if len(numcols)>0 and len(df)>1:
            df[numcols] = df[numcols].interpolate(limit=CONFIG['gap']).ffill().bfill()
            df = df.dropna()

        # Bounds (skip if no voltage)
        if 'voltage_v' in df:
            df = df[(df['voltage_v']>=CONFIG['vmin']) & (df['voltage_v']<=CONFIG['vmax'])]
        for c in numcols:
            if c not in df.columns: continue
            df[c] = df[c].clip(lower=0)

        # Save if meaningful
        if len(df) > 5:
            outf = clean_dir / f"{fpath.stem}_cleaned.csv"
            df.to_csv(outf, index=False)
            print(f"  Saved: {len(df)} rows ({len(df)/df.shape[0]*100:.0f}% kept)")
        else:
            print("  Skipped: too small")

    except Exception as e:
        print(f"  FAILED: {e}")

# ZIP & done
with zipfile.ZipFile(zip_fn, 'w', zipfile.ZIP_DEFLATED) as z:
    for f in clean_dir.glob('*.csv'):
        z.write(f, f.name)

print(f"\nðŸŽ‰ FINISHED! Download: {zip_fn}")
!ls -la {clean_dir} | wc -l   # File count
!du -sh {zip_fn}              # Size


Cleaning 0 files...

🎉 FINISHED! Download: /content/cleaned_mich.zip
3
4.0K	/content/cleaned_mich.zip


In [12]:
# PERFECT FINAL CODE - Uses EXACT dataset slug for Michigan NMC pouch data
# Paste & RUN in Colab - Downloads sidharthdk/michigan-expansion-battery-dataset + cleans 36 CSVs

!pip install -q kagglehub
import kagglehub
kagglehub.login()  # Username/token popup

# Download Michigan dataset
ds_path = kagglehub.dataset_download("sidharthdk/michigan-expansion-battery-dataset")
raw_dir = Path(ds_path)
print(f"âœ… Michigan dataset: {raw_dir}")
print(f"CSVs: {len(list(raw_dir.rglob('*.csv')))} files")

import pandas as pd
import numpy as np
import zipfile
from pathlib import Path

clean_dir = Path("/content/mich_cleaned")
clean_dir.mkdir(exist_ok=True)
zip_fn = "/content/michigan_cleaned.zip"

CONFIG = {'interp_gap': 10, 'v_min': 2.5, 'v_max': 4.3}  # NMC pouch safe

csvs = list(raw_dir.rglob("*.csv"))
success_count = 0

for i, fpath in enumerate(csvs):
    print(f"\n[{i+1}/36] {fpath.name}")
    try:
        df = pd.read_csv(fpath)
        orig_rows = len(df)

        # Robust cleaning
        df.columns = df.columns.str.strip().str.lower().str.replace(r'[^a-z0-9_ ]', '', regex=True)
        df = df.apply(pd.to_numeric, errors='coerce')
        df = df.dropna(axis=1, how='all').dropna(how='all')
        df = df.drop_duplicates()

        # Interpolate gaps
        num_cols = df.select_dtypes(np.number).columns
        if len(num_cols) > 0 and len(df) > 1:
            time_col = next((col for col in ['test_time_s', 'time_s', 'time'] if col in df), None)
            if time_col:
                df = df.sort_values(time_col)
            df[num_cols] = df[num_cols].interpolate(method='linear', limit=CONFIG['interp_gap'])
            df[num_cols] = df[num_cols].ffill().bfill()
            df = df.dropna()

        # NMC physics
        if 'voltage_v' in df:
            df = df[(df['voltage_v'] >= CONFIG['v_min']) & (df['voltage_v'] <= CONFIG['v_max'])]
        for col in num_cols:
            if col in df and col not in ['cycle', 'cycle_number', 'cycle_index']:
                df[col] = df[col].clip(lower=0)

        if len(df) >= 10:
            out_file = clean_dir / f"{fpath.stem}_cleaned.csv"
            df.to_csv(out_file, index=False)
            pct_keep = len(df) / orig_rows * 100
            print(f"  âœ“ Saved {len(df):,} rows ({pct_keep:.0f}% kept)")
            success_count += 1
        else:
            print(f"  âš  Skipped (only {len(df)} rows)")

    except Exception as e:
        print(f"  âœ— {type(e).__name__}: {str(e)[:60]}")

# Final ZIP
with zipfile.ZipFile(zip_fn, 'w', zipfile.ZIP_DEFLATED) as zf:
    for csv_file in clean_dir.glob("*.csv"):
        zf.write(csv_file, csv_file.name)

print(f"\nðŸŽ‰ SUCCESS: {success_count}/36 files cleaned!")
print(f"ðŸ“¦ Download: {zip_fn}")
!du -sh {zip_fn} {clean_dir}
!head -5 {clean_dir.glob('*cycle_data_cleaned.csv')[0]}  # Sample


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Using Colab cache for faster access to the 'michigan-expansion-battery-dataset' dataset.
✅ Michigan dataset: /kaggle/input/michigan-expansion-battery-dataset
CSVs: 36 files

[1/36] MICH_13R_pouch_NMC_25C_50-100_0.2-0.2C_timeseries_data.csv
  ✓ Saved 999,468 rows (100% kept)

[2/36] MICH_06H_pouch_NMC_45C_0-100_1.5-1.5C_timeseries_data.csv
  ✓ Saved 89,936 rows (100% kept)

[3/36] MICH_16R_pouch_NMC_25C_50-100_0.2-1.5C_timeseries_data.csv
  ✓ Saved 581,688 rows (100% kept)

[4/36] MICH_14C_pouch_NMC_-5C_50-100_0.2-0.2C_cycle_data.csv
  ✓ Saved 452 rows (100% kept)

[5/36] MICH_08C_pouch_NMC_-5C_0-100_2-2C_timeseries_data.csv
  ✓ Saved 325,090 rows (100% kept)

[6/36] MICH_12H_pouch_NMC_45C_0-100_0.2-1.5C_timeseries_data.csv
  ✓ Saved 312,225 rows (100% kept)

[7/36] MICH_08C_pouch_NMC_-5C_0-100_2-2C_cycle_data.csv
  ✓ Saved 573 rows (100% kept)

[8/36] MICH_01R_pouch_NMC_25C_0-100_0.2-0.2C_cycle_data.csv
  ✓ Saved 379 rows (100% kept)

[9/36] MICH_03H_pouch_NMC_45C_0-1

NameError: name 'content' is not defined

In [14]:
import zipfile
from pathlib import Path

folder_to_zip = Path('/content/mich_cleaned')
output_zip_file = '/content/mich_cleaned.zip'

with zipfile.ZipFile(output_zip_file, 'w', zipfile.ZIP_DEFLATED) as zf:
    for file_path in folder_to_zip.rglob('*'):
        if file_path.is_file():
            zf.write(file_path, arcname=file_path.relative_to(folder_to_zip))

print(f"Successfully zipped '{folder_to_zip}' to '{output_zip_file}'")
!du -sh {output_zip_file}

Successfully zipped '/content/mich_cleaned' to '/content/mich_cleaned.zip'
86M	/content/mich_cleaned.zip
