In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import yaml

# Project layout; every path is resolved relative to the current working directory.
BASE_DIR = Path("..")
CONFIG_DIR = BASE_DIR / "configs"
DATA_DIR = BASE_DIR / "data" / "raw"
PYBAMM_DIR = DATA_DIR / "final_pybamm_output"
OUTPUT_PARAM_DIR = DATA_DIR / "output_parameter_sweep"

# Parse the YAML configuration; only its "data" section is read in this cell.
with (CONFIG_DIR / "optimise_config.yml").open("r") as config_file:
    config = yaml.safe_load(config_file)

data_config = config["data"]
TOTAL_SAMPLES = data_config["total_samples"]
PARAMS_PER_SAMPLE = data_config["params_per_sample"]
INPUT_FEATURES = data_config["input_features"]

# Echo the settings so the captured output documents what the run used.
banner = "=" * 80
print(banner)
print("üîç Analyzing Parquet File Issues")
print(banner)
print("\nConfiguration:")
print(f"   Total samples: {TOTAL_SAMPLES}")
print(f"   Params per sample: {PARAMS_PER_SAMPLE}")
print(f"   Input features: {len(INPUT_FEATURES)}")
print(banner)

üîç Analyzing Parquet File Issues

Configuration:
   Total samples: 100
   Params per sample: 60
   Input features: 10


In [2]:
# Load the tau-factor results CSV and check that every sample ID in
# range(TOTAL_SAMPLES) appears in its "id" column.
tau_results_path = OUTPUT_PARAM_DIR / "taufactor_results.csv"

# Take the file name in the message from the path so the log always names the
# file that is actually read.
print(f"\nüìÇ Loading {tau_results_path.name}...")
tau_results = pd.read_csv(tau_results_path)

print(f"‚úì Loaded {len(tau_results)} rows")
print(f"\nColumns: {list(tau_results.columns)}")

# .tolist() turns NumPy scalars into built-in values, so the preview prints
# plain numbers rather than np.int64(...) reprs. The unique IDs are computed
# once and reused for the membership check below.
tau_sample_ids = sorted(tau_results["id"].unique().tolist())
print(f"\nSample IDs in tau_results: {tau_sample_ids[:20]}...")

expected_samples = set(range(TOTAL_SAMPLES))
actual_samples = set(tau_sample_ids)
missing_in_tau = expected_samples - actual_samples

if missing_in_tau:
    print(f"\n‚ö†Ô∏è  {len(missing_in_tau)} samples missing from tau_results:")
    print(f"   {sorted(missing_in_tau)}")
else:
    print(f"\n‚úÖ All {TOTAL_SAMPLES} samples present in tau_results")

print("\n" + "=" * 80)


üìÇ Loading tau_results.csv...
‚úì Loaded 100 rows

Columns: ['id', 'filename', 'porosity_measured', 'tau_factor', 'D_eff', 'error']

Sample IDs in tau_results: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19)]...

‚úÖ All 100 samples present in tau_results



In [3]:
print("\nüîç Analyzing all sample+param combinations...")

# Outcome buckets: each list holds the (sample_id, param_id) pairs that ended
# in that state; "success" collects the pairs that passed every check.
failures = {
    "tau_missing": [],
    "parquet_missing": [],
    "param_id_missing": [],
    "input_features_missing": [],
    "bruggeman_missing": [],
    "capacity_missing": [],
    "success": [],
}

# One dict per failed pair with the keys sample_id, param_id, reason, details.
detailed_failures = []


def _record_failure(category, sample_id, param_id, reason, details=""):
    """Register one failed (sample_id, param_id) pair.

    The pair is appended to ``failures[category]`` and a matching record is
    appended to ``detailed_failures``, so the two structures always describe
    the same set of failures.

    Args:
        category: Key of the ``failures`` bucket the pair belongs to.
        sample_id: Sample index of the failed pair.
        param_id: Parameter-set index of the failed pair.
        reason: Short description stored in the record's "reason" field.
        details: Optional extra context stored in the "details" field.
    """
    failures[category].append((sample_id, param_id))
    detailed_failures.append(
        {
            "sample_id": sample_id,
            "param_id": param_id,
            "reason": reason,
            "details": details,
        }
    )


for sample_id in range(TOTAL_SAMPLES):
    tau_rows = tau_results[tau_results["id"] == sample_id]
    has_tau = len(tau_rows) > 0

    parquet_file = PYBAMM_DIR / f"results_rank_{sample_id}.parquet"
    has_parquet = parquet_file.exists()

    if not has_parquet:
        # Without the file every parameter set of this sample fails. Each one
        # is also written to detailed_failures; otherwise anything built from
        # that list would not mention these pairs even though they are counted
        # in failures["parquet_missing"].
        for param_id in range(PARAMS_PER_SAMPLE):
            _record_failure(
                "parquet_missing",
                sample_id,
                param_id,
                "parquet file missing",
                f"Expected file: {parquet_file}",
            )
        continue

    parquet_df = pd.read_parquet(parquet_file)

    # The checks below run in a fixed order and the first one that fails
    # decides the category of the pair.
    for param_id in range(PARAMS_PER_SAMPLE):
        param_rows = parquet_df[parquet_df["param_id"] == param_id]

        if len(param_rows) == 0:
            _record_failure(
                "param_id_missing",
                sample_id,
                param_id,
                "param_id not found in parquet",
                f"Available param_ids: {sorted(parquet_df['param_id'].unique())}",
            )
            continue

        if not has_tau:
            _record_failure(
                "tau_missing",
                sample_id,
                param_id,
                "sample_id not in tau_results",
            )
            continue

        # Only the first row matching this param_id is inspected.
        param_row = param_rows.iloc[0]

        missing_inputs = [
            feat
            for feat in INPUT_FEATURES
            if feat not in param_row.index or pd.isna(param_row[feat])
        ]

        if missing_inputs:
            # Long lists are truncated to the first three names.
            _record_failure(
                "input_features_missing",
                sample_id,
                param_id,
                "missing input features",
                (
                    f"Missing: {missing_inputs[:3]}..."
                    if len(missing_inputs) > 3
                    else f"Missing: {missing_inputs}"
                ),
            )
            continue

        if "bruggeman_derived" not in param_row.index or pd.isna(
            param_row["bruggeman_derived"]
        ):
            _record_failure(
                "bruggeman_missing",
                sample_id,
                param_id,
                "bruggeman_derived missing or NaN",
                f"Value: {param_row.get('bruggeman_derived', 'NOT FOUND')}",
            )
            continue

        # capacity_trend_ah is checked for presence only, not for NaN.
        if "capacity_trend_ah" not in param_row.index:
            _record_failure(
                "capacity_missing",
                sample_id,
                param_id,
                "capacity_trend_ah missing",
                f"Available columns: {list(param_row.index)[:10]}...",
            )
            continue

        failures["success"].append((sample_id, param_id))


üîç Analyzing all sample+param combinations...


In [4]:
# Summarise the analysis: overall success/failure totals followed by the
# number of (sample, param) pairs in each failure bucket.
section_rule = "=" * 80
print("\n" + section_rule)
print("üìä FAILURE SUMMARY")
print(section_rule)

total_expected = TOTAL_SAMPLES * PARAMS_PER_SAMPLE
n_success = len(failures["success"])

print(f"\nTotal expected: {total_expected}")
print(f"Successful: {n_success}")
print(f"Failed: {total_expected - n_success}")

print("\nFailure breakdown:")
# (label shown in the report, key in the failures dict), in display order.
breakdown_rows = (
    ("Tau results missing", "tau_missing"),
    ("Parquet file missing", "parquet_missing"),
    ("Param ID not found", "param_id_missing"),
    ("Input features missing", "input_features_missing"),
    ("Bruggeman missing", "bruggeman_missing"),
    ("Capacity trend missing", "capacity_missing"),
)
for label, category in breakdown_rows:
    print(f"   {label}: {len(failures[category])}")

print("\n" + section_rule)


üìä FAILURE SUMMARY

Total expected: 6000
Successful: 6000
Failed: 0

Failure breakdown:
   Tau results missing: 0
   Parquet file missing: 0
   Param ID not found: 0
   Input features missing: 0
   Bruggeman missing: 0
   Capacity trend missing: 0



In [5]:
print("\nüîç DETAILED FAILURE EXAMPLES (first 20)")
print("=" * 80)

# Tabulate the per-pair failure records collected by the analysis loop.
df_failures = pd.DataFrame(detailed_failures)

if len(df_failures) == 0:
    print("\n‚úÖ No failures detected!")
else:
    print(f"\nTotal failures: {len(df_failures)}")

    print("\nFailures by reason:")
    reason_tally = df_failures["reason"].value_counts()
    for reason, count in reason_tally.items():
        print(f"   {reason}: {count}")

    print("\n" + "-" * 80)
    print("First 20 failures:")
    print("-" * 80)

    # Show the first 20 records; the details line is skipped when it is empty.
    for record in df_failures.head(20).to_dict("records"):
        print(f"\nSample {record['sample_id']:3d}, Param {record['param_id']:2d}:")
        print(f"   Reason: {record['reason']}")
        if record["details"]:
            print(f"   Details: {record['details']}")

    # Persist the complete table next to the raw data directory.
    output_file = BASE_DIR / "data" / "parquet_failures.csv"
    df_failures.to_csv(output_file, index=False)
    print(f"\nüíæ Full failure report saved to: {output_file}")

print("\n" + "=" * 80)


üîç DETAILED FAILURE EXAMPLES (first 20)

‚úÖ No failures detected!



In [6]:
# Drill into the first recorded failure, if there is one: show its row in
# tau_results and the basic contents of the matching parquet file.
if detailed_failures:
    failed_sample = detailed_failures[0]
    SAMPLE_ID, PARAM_ID = failed_sample["sample_id"], failed_sample["param_id"]

    print(f"\nüîç DETAILED INSPECTION: Sample {SAMPLE_ID}, Param {PARAM_ID}")
    print("=" * 80)

    print("\n1. TAU RESULTS:")
    tau_row = tau_results[tau_results["id"] == SAMPLE_ID]
    if len(tau_row) == 0:
        print("   ‚ùå NOT found in tau_results")
    else:
        first_tau = tau_row.iloc[0]
        print("   ‚úÖ Found in tau_results")
        print(f"   Columns: {list(tau_row.columns)}")
        # A column absent from the CSV is reported as MISSING instead of raising.
        for column in ("D_eff", "porosity_measured", "tau_factor"):
            print(f"   {column}: {first_tau.get(column, 'MISSING')}")

    print("\n2. PARQUET FILE:")
    parquet_file = PYBAMM_DIR / f"results_rank_{SAMPLE_ID}.parquet"
    if not parquet_file.exists():
        print("   ‚ùå File does not exist")
    else:
        df = pd.read_parquet(parquet_file)
        print("   ‚úÖ File exists")
        print(f"   Total rows: {len(df)}")
        print(f"   Param IDs: {sorted(df['param_id'].unique())}")

    print("\n" + "=" * 80)

In [7]:
# Plot a 2x2 overview of the recorded failures: counts per sample, counts per
# param ID, counts per reason, and a sample-by-param heatmap.
if len(df_failures) > 0:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    sample_counts = df_failures["sample_id"].value_counts().sort_index()
    axes[0, 0].bar(sample_counts.index, sample_counts.values)
    axes[0, 0].set_title("Failures by Sample")
    axes[0, 0].set_xlabel("Sample ID")
    axes[0, 0].set_ylabel("Number of failures")
    axes[0, 0].grid(alpha=0.3)

    param_counts = df_failures["param_id"].value_counts().sort_index()
    axes[0, 1].bar(param_counts.index, param_counts.values)
    axes[0, 1].set_title("Failures by Param ID")
    axes[0, 1].set_xlabel("Param ID")
    axes[0, 1].set_ylabel("Number of failures")
    axes[0, 1].grid(alpha=0.3)

    reason_counts = df_failures["reason"].value_counts()
    axes[1, 0].barh(reason_counts.index, reason_counts.values)
    axes[1, 0].set_title("Failures by Reason")
    axes[1, 0].set_xlabel("Number of failures")
    axes[1, 0].grid(alpha=0.3, axis="x")

    # Mark every failed (sample_id, param_id) cell with 1 in a single indexed
    # assignment instead of looping over the rows.
    failure_matrix = np.zeros((TOTAL_SAMPLES, PARAMS_PER_SAMPLE))
    failure_matrix[
        df_failures["sample_id"].to_numpy(dtype=int),
        df_failures["param_id"].to_numpy(dtype=int),
    ] = 1

    # The matrix is transposed for display, so image columns (x) are sample
    # IDs and image rows (y) are param IDs.
    im = axes[1, 1].imshow(
        failure_matrix.T,
        aspect="auto",
        cmap="RdYlGn_r",
        interpolation="nearest",
    )
    plt.colorbar(im, ax=axes[1, 1], label="Failed (1) / OK (0)")
    axes[1, 1].set_title("Failure Heatmap")
    axes[1, 1].set_xlabel("Sample ID")
    axes[1, 1].set_ylabel("Param ID")

    plt.tight_layout()
    plt.show()

    print("\n‚úÖ Visualization complete")
else:
    print("\n‚úÖ No failures to visualize!")


‚úÖ No failures to visualize!


In [8]:
print("\nüîç Checking input feature availability across all parquet files...")
print("=" * 80)

# Per-feature counters over all parquet rows:
#   present - column exists and the value is not NaN
#   nan     - column exists but the value is NaN
#   missing - column does not exist in the file the row came from
feature_availability = {
    feat: {"present": 0, "missing": 0, "nan": 0} for feat in INPUT_FEATURES
}
feature_availability["bruggeman_derived"] = {"present": 0, "missing": 0, "nan": 0}
feature_availability["capacity_trend_ah"] = {"present": 0, "missing": 0, "nan": 0}


def _percent(count, total):
    """Return ``count`` as a percentage of ``total``, or 0.0 when ``total`` is 0."""
    return count / total * 100 if total else 0.0


total_checked = 0

for sample_id in range(TOTAL_SAMPLES):
    parquet_file = PYBAMM_DIR / f"results_rank_{sample_id}.parquet"
    if not parquet_file.exists():
        continue

    df = pd.read_parquet(parquet_file)
    n_rows = len(df)
    total_checked += n_rows

    # Whether a column exists is the same for every row of a file, so the
    # counters are updated once per column instead of once per row.
    for feat, stats in feature_availability.items():
        if feat not in df.columns:
            stats["missing"] += n_rows
        elif feat == "capacity_trend_ah":
            # Checked for presence only; its values are never tested for NaN.
            stats["present"] += n_rows
        else:
            n_nan = int(df[feat].isna().sum())
            stats["nan"] += n_nan
            stats["present"] += n_rows - n_nan

print(f"\nTotal rows checked: {total_checked}")
if total_checked == 0:
    # No parquet file was found; _percent avoids dividing by zero below.
    print("No parquet rows were found; percentages are reported as 0.0%.")
print("\nFeature availability:")
print("-" * 80)

for feat, stats in feature_availability.items():
    print(f"\n{feat}:")
    print(
        f"   Present: {stats['present']} "
        f"({_percent(stats['present'], total_checked):.1f}%)"
    )
    print(f"   NaN: {stats['nan']} ({_percent(stats['nan'], total_checked):.1f}%)")
    print(
        f"   Missing: {stats['missing']} "
        f"({_percent(stats['missing'], total_checked):.1f}%)"
    )

print("\n" + "=" * 80)
print("‚úÖ Feature availability check complete")
print("=" * 80)


üîç Checking input feature availability across all parquet files...

Total rows checked: 6000

Feature availability:
--------------------------------------------------------------------------------

input_SEI kinetic rate constant [m.s-1]:
   Present: 6000 (100.0%)
   NaN: 0 (0.0%)
   Missing: 0 (0.0%)

input_Electrolyte diffusivity [m2.s-1]:
   Present: 6000 (100.0%)
   NaN: 0 (0.0%)
   Missing: 0 (0.0%)

input_Initial concentration in electrolyte [mol.m-3]:
   Present: 6000 (100.0%)
   NaN: 0 (0.0%)
   Missing: 0 (0.0%)

input_Separator porosity:
   Present: 6000 (100.0%)
   NaN: 0 (0.0%)
   Missing: 0 (0.0%)

input_Separator Bruggeman coefficient (electrolyte):
   Present: 6000 (100.0%)
   NaN: 0 (0.0%)
   Missing: 0 (0.0%)

input_Separator Bruggeman coefficient:
   Present: 6000 (100.0%)
   NaN: 0 (0.0%)
   Missing: 0 (0.0%)

input_Positive particle radius [m]:
   Present: 6000 (100.0%)
   NaN: 0 (0.0%)
   Missing: 0 (0.0%)

input_Negative particle radius [m]:
   Present: 6000 (1