<a href="https://colab.research.google.com/github/sidharthdk/BMS-Data-pre-processing-files/blob/main/Oford_dataset_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# === OXFORD BATTERY DATASET - LOCAL ZIP CLEAN & SAVE ===
# Upload your Oxford battery dataset ZIP to Colab Files panel, then RUN

from google.colab import files
import zipfile
import pandas as pd
import numpy as np
from pathlib import Path

# Step 1: Upload ZIP
# files.upload() opens the Colab file picker and returns a dict keyed by the
# uploaded file name(s). Only the first uploaded file is used.
print("ðŸ“¤ Upload your Oxford battery ZIP file:")
uploaded = files.upload()
if not uploaded:
    # A cancelled/empty upload would otherwise surface as an opaque IndexError.
    raise ValueError("No file was uploaded - re-run this cell and select the Oxford dataset ZIP.")
zip_path = next(iter(uploaded))
print(f"Uploaded: {zip_path}")

# Step 2: Extract
# Everything in the archive is unpacked under temp_dir; later steps search it
# recursively for CSV/MAT files.
temp_dir = Path("/content/oxford_raw")
temp_dir.mkdir(parents=True, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(temp_dir)
print(f"âœ… Extracted to {temp_dir}")

# Step 3: Cleaning parameters for the Oxford battery aging dataset.
# The original author describes the cells as Kokam pouch cells (NMC, 740mAh,
# 2.7-4.2V). NOTE(review): the uploaded file names in this notebook's output
# are tagged "LCO" - confirm the chemistry before relying on the NMC label.
CONFIG = dict(
    interp_gap=10,         # `limit` passed to the linear interpolation of numeric columns
    v_min=2.7,             # lower voltage cutoff; rows below it are dropped
    v_max=4.2,             # upper voltage cutoff; rows above it are dropped
    capacity_ah=0.74,      # 740mAh nominal; capacity rows above 2x this are dropped
    drop_bad_cycles=True,  # enables the per-cycle monotonic-time filter
)

# Step 4: Find all CSVs (or MAT files - converted)
clean_dir = Path("/content/oxford_cleaned")
clean_dir.mkdir(exist_ok=True)

csv_files = list(temp_dir.rglob("*.csv"))
mat_files = list(temp_dir.rglob("*.mat"))
print(f"Found: {len(csv_files)} CSV, {len(mat_files)} MAT files")

# Handle MAT files (Oxford often uses MATLAB format).
# Conversion only runs when the archive contains no CSV at all. Every
# top-level 2-D array variable of each MAT file is written next to the raw
# data as "<mat stem>_<variable>.csv"; other variables are ignored.
if mat_files and not csv_files:
    print("Converting MAT files...")
    try:
        # Only the import can raise ImportError, so only it is guarded here.
        from scipy.io import loadmat
    except ImportError:
        print("Install scipy: !pip install scipy")
    else:
        for mf in mat_files:
            try:
                mat = loadmat(mf, simplify_cells=True)
            except (NotImplementedError, ValueError, OSError) as err:
                # One unreadable file (e.g. an HDF5-based v7.3 MAT file, which
                # loadmat does not support) must not abort the whole run.
                print(f"  Skipped {mf.name}: {type(err).__name__}: {err}")
                continue
            for key, value in mat.items():
                if isinstance(value, np.ndarray) and value.ndim == 2:
                    csv_path = temp_dir / f"{mf.stem}_{key}.csv"
                    pd.DataFrame(value).to_csv(csv_path, index=False)
        csv_files = list(temp_dir.rglob("*.csv"))
        print(f"Converted: {len(csv_files)} CSVs")

# Step 5: Clean each CSV

# Oxford column mapping (common variants) -> canonical names used by the
# cleaning rules below. Built once instead of on every loop iteration.
_COLUMN_ALIASES = {
    'v': 'voltage_v', 'voltage': 'voltage_v', 'volt': 'voltage_v',
    'i': 'current_a', 'current': 'current_a', 'curr': 'current_a',
    't': 'time_s', 'time': 'time_s', 'test_time': 'time_s', 'test_time_s': 'time_s',
    'temp': 'temperature_c', 'temperature': 'temperature_c',
    'ah': 'capacity_ah', 'capacity': 'capacity_ah',
    'cycle': 'cycle_number', 'cyc': 'cycle_number',
}


def _standardize_columns(df):
    """Return *df* with lower-case snake_case headers mapped to canonical names.

    Every run of characters outside ``[a-z0-9]`` becomes a single underscore
    (so ``"Voltage (V)"`` -> ``"voltage_v"`` rather than ``"voltagev"``), and
    leading/trailing underscores are removed. Known aliases are then renamed,
    but never onto a name that is already present, so the frame cannot end up
    with two columns sharing one label.
    """
    df.columns = (df.columns.str.strip().str.lower()
                  .str.replace(r'[^a-z0-9]+', '_', regex=True)
                  .str.strip('_'))
    taken = set(df.columns)
    renames = {}
    for name in df.columns:
        target = _COLUMN_ALIASES.get(name)
        if target is not None and target not in taken:
            renames[name] = target
            taken.add(target)
    return df.rename(columns=renames)


def _clean_frame(df, config):
    """Apply the cleaning pipeline to one raw table and return the result.

    Steps: standardize headers, coerce everything to numeric, drop empty
    rows/columns and duplicates, sort by time and fill gaps, enforce the
    voltage/capacity bounds from *config*, clip remaining quantities at zero,
    and drop cycles whose recorded time runs backwards.

    *config* must provide ``interp_gap``, ``v_min``, ``v_max``,
    ``capacity_ah`` and ``drop_bad_cycles``.
    """
    df = _standardize_columns(df)

    # Numeric conversion: non-numeric cells become NaN, then fully-empty
    # columns and rows are removed.
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.dropna(axis=1, how='all').dropna(how='all')
    df = df.drop_duplicates()

    # Detect cycles with backwards-running time NOW, on the recorded row
    # order. After the sort below every cycle is trivially monotonic, so
    # checking later could never flag anything.
    bad_cycles = []
    if config['drop_bad_cycles'] and 'cycle_number' in df.columns and 'time_s' in df.columns:
        timed = df.dropna(subset=['cycle_number', 'time_s'])
        backwards = timed.groupby('cycle_number')['time_s'].diff() < 0
        bad_cycles = timed.loc[backwards, 'cycle_number'].unique()

    # Interpolation
    num_cols = df.select_dtypes(np.number).columns.tolist()
    if num_cols and len(df) > 1:
        time_col = next((c for c in ('time_s', 'test_time_s', 'time') if c in df.columns), None)
        if time_col:
            df = df.sort_values(time_col)
        df[num_cols] = df[num_cols].interpolate('linear', limit=config['interp_gap'])
        # NOTE(review): this unbounded ffill/bfill also fills gaps longer than
        # interp_gap, which appears to leave the dropna below with nothing to
        # remove - confirm that is intended.
        df[num_cols] = df[num_cols].ffill().bfill()
        df = df.dropna()

    # Oxford physics bounds (inclusive on both ends)
    if 'voltage_v' in df.columns:
        df = df[df['voltage_v'].between(config['v_min'], config['v_max'])]

    # Capacity sanity (0 to 2x nominal)
    if 'capacity_ah' in df.columns:
        df = df[df['capacity_ah'].between(0, config['capacity_ah'] * 2)]

    # Non-negative constraints. Cycle counters and current are left alone as
    # before; temperature is also excluded because sub-zero readings are valid
    # and clipping them to 0 would corrupt the data.
    clip_cols = [c for c in num_cols
                 if c in df.columns
                 and not any(tag in c for tag in ('cycle', 'current', 'temp'))]
    if clip_cols:
        df = df.copy()  # work on an owned frame, not a filtered slice
        df[clip_cols] = df[clip_cols].clip(lower=0)

    # Monotonic time per cycle: drop whole cycles flagged above.
    if len(bad_cycles):
        df = df[~df['cycle_number'].isin(bad_cycles)]

    return df


success = 0
for i, fpath in enumerate(csv_files):
    print(f"\n[{i+1}/{len(csv_files)}] {fpath.name}")
    try:
        raw = pd.read_csv(fpath)
        orig_len = len(raw)
        df = _clean_frame(raw, CONFIG)

        # Save if meaningful (at least 10 rows survived cleaning)
        if len(df) >= 10:
            out_path = clean_dir / f"{fpath.stem}_cleaned.csv"
            df.to_csv(out_path, index=False)
            pct = len(df) / orig_len * 100 if orig_len > 0 else 0
            print(f"  âœ“ {len(df):,} rows ({pct:.0f}% kept)")
            success += 1
        else:
            print(f"  âš  Skipped ({len(df)} rows)")

    except Exception as e:
        # Best-effort batch: report the failure and continue with the next file.
        print(f"  âœ— {type(e).__name__}: {str(e)[:60]}")

# Step 6: Create final ZIP
final_zip = "/content/oxford_battery_cleaned.zip"
with zipfile.ZipFile(final_zip, 'w', zipfile.ZIP_DEFLATED) as zf:
    for f in clean_dir.glob("*.csv"):
        zf.write(f, f.name)

# Results
print(f"\n{'='*50}")
print(f"ðŸŽ‰ OXFORD CLEANING COMPLETE!")
print(f"âœ… Success: {success}/{len(csv_files)} files")
!du -sh {final_zip} {clean_dir}
print(f"\nðŸ“¥ Download: {final_zip}")

# Quality summary
if list(clean_dir.glob("*.csv")):
    sample_file = list(clean_dir.glob("*.csv"))[0]
    sample = pd.read_csv(sample_file)
    print(f"\nðŸ“Š Sample ({sample_file.name}):")
    print(f"   Columns: {list(sample.columns)}")
    print(f"   Shape: {sample.shape}")
    print(f"   Nulls: {sample.isnull().sum().sum()}")
    if 'voltage_v' in sample:
        print(f"   Voltage range: {sample['voltage_v'].min():.2f} - {sample['voltage_v'].max():.2f} V")


ðŸ“¤ Upload your Oxford battery ZIP file:


Saving Oxford-20260121T132139Z-1-001.zip to Oxford-20260121T132139Z-1-001.zip
Uploaded: Oxford-20260121T132139Z-1-001.zip
âœ… Extracted to /content/oxford_raw
Found: 16 CSV, 0 MAT files

[1/16] OX_1-6_pouch_LCO_40C_0-100_2-1.84C_f_cycle_data.csv
  âœ“ 46 rows (100% kept)

[2/16] OX_1-8_pouch_LCO_40C_0-100_2-1.84C_h_cycle_data.csv
  âœ“ 76 rows (100% kept)

[3/16] OX_1-3_pouch_LCO_40C_0-100_2-1.84C_c_timeseries.csv
  âœ“ 452,258 rows (100% kept)

[4/16] OX_1-4_pouch_LCO_40C_0-100_2-1.84C_d_timeseries.csv
  âœ“ 286,707 rows (100% kept)

[5/16] OX_1-7_pouch_LCO_40C_0-100_2-1.84C_g_cycle_data.csv
  âœ“ 77 rows (100% kept)

[6/16] OX_1-5_pouch_LCO_40C_0-100_2-1.84C_e_timeseries.csv
  âœ“ 285,428 rows (100% kept)

[7/16] OX_1-2_pouch_LCO_40C_0-100_2-1.84C_b_timeseries.csv
  âœ“ 424,380 rows (100% kept)

[8/16] OX_1-1_pouch_LCO_40C_0-100_2-1.84C_a_timeseries.csv
  âœ“ 460,381 rows (100% kept)

[9/16] OX_1-7_pouch_LCO_40C_0-100_2-1.84C_g_timeseries.csv
  âœ“ 465,391 rows (100% kept)

[10/16] O