# Battery Data Analysis & Validation

This notebook performs batch analysis on the standardized battery data. 

**Objectives:**
1.  **Validate Fields:** Ensure every file contains all of the fields requested by the user.
2.  **Visual Inspection:** Generate plots for Voltage, Current, and Capacity to distinguish between datasets.
3.  **Data Quality:** Identify inconsistencies or data gaps.

In [None]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List

# Make the directory two levels above the working directory importable so the
# `src` package resolves. The membership check keeps sys.path free of
# duplicate entries when this cell is executed more than once.
_project_root = os.path.abspath('../../')
if _project_root not in sys.path:
    sys.path.append(_project_root)

from src.access.data_loader import DataLoader

# Shared DataLoader instance used by the cells below, configured with the
# standardized data directory.
loader = DataLoader(data_dir="../../data/standardized/")

## 1. Load Data Index

In [None]:
# Load the data index through the shared loader and report how many entries
# it contains.
index = loader.load_index()
print(f"Total Files Processed: {len(index)}")
# Render the result of index.head() as a quick preview of the index.
display(index.head())

## 2. Field Validation

We verify that every file contains the exact list of fields requested:
`Data_Point, Test_Time, Date_Time, Step_Time, Step_Index, Cycle_Index, Current, Voltage, Charge_Capacity, Discharge_Capacity, Charge_Energy, Discharge_Energy, dV/dt, Internal_Resistance, Is_FC_Data, AC_Impedance, ACI_Phase_Angle`

In [None]:
# Columns that every file is checked for. Keep this list in sync with the
# field list given in the markdown cell above.
REQUIRED_FIELDS = [
    'Data_Point', 'Test_Time', 'Date_Time', 'Step_Time',
    'Step_Index', 'Cycle_Index', 'Current', 'Voltage',
    'Charge_Capacity', 'Discharge_Capacity', 'Charge_Energy', 'Discharge_Energy',
    'dV/dt', 'Internal_Resistance', 'Is_FC_Data', 'AC_Impedance', 'ACI_Phase_Angle'
]

files = loader.get_files()
validation_results = []

# Build one summary record per file: which required columns are absent and
# how many rows the file holds.
for file in files:
    df = loader.load_data(file)
    # NOTE(review): only the base name is recorded, and later cells pass this
    # value back to loader.load_data -- confirm the loader resolves bare names.
    file_name = os.path.basename(file)

    missing = [f for f in REQUIRED_FIELDS if f not in df.columns]

    status = "PASS" if not missing else "FAIL"
    validation_results.append({
        'file': file_name,
        'status': status,
        'missing_fields': missing,
        'total_rows': len(df)
    })

# Name the columns explicitly so the summary table has its full schema even
# when no files were found; without this an empty result list yields a
# column-less DataFrame and the `status` lookups below raise KeyError.
validation_df = pd.DataFrame(
    validation_results,
    columns=['file', 'status', 'missing_fields', 'total_rows'],
)
print("Validation Summary:")
display(validation_df['status'].value_counts())

# Show failures if any
failures = validation_df[validation_df['status'] == 'FAIL']
if not failures.empty:
    print("\nfailed Files:")
    display(failures)

## 3. Visualization

We plot key metrics for the five largest files (by row count) to inspect data quality and differences between datasets.

In [None]:
# Global plotting defaults: seaborn grid theme and a large default figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (15, 10)

# Names of the five files with the most rows; each one gets its own
# detailed figure below.
top_files = (
    validation_df
    .sort_values('total_rows', ascending=False)
    ['file']
    .head(5)
    .tolist()
)

def plot_file_data(file_name):
    """Draw a 2x2 diagnostic figure for a single data file.

    The file is loaded through the module-level ``loader`` and four panels
    are produced: voltage vs. test time, current vs. test time, per-cycle
    maximum capacity, and dV/dt vs. voltage. The capacity and dV/dt panels
    are only drawn when the columns they need are present; otherwise the
    corresponding panel is left empty.

    Args:
        file_name: Identifier passed to ``loader.load_data``; also used in
            the figure title.
    """
    df = loader.load_data(file_name)

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f"Analysis for {file_name}", fontsize=16)

    # Voltage vs Time
    sns.lineplot(data=df, x='Test_Time', y='Voltage', ax=axes[0, 0])
    axes[0, 0].set_title('Voltage Profile')

    # Current vs Time
    sns.lineplot(data=df, x='Test_Time', y='Current', ax=axes[0, 1])
    axes[0, 1].set_title('Current Profile')

    # Capacity vs Cycle: per-cycle maximum of each capacity column.
    # Only the capacity columns actually present are used, so a file that is
    # missing one of them still gets the remaining curve instead of raising
    # KeyError in the groupby column selection.
    capacity_series = [
        (column, label)
        for column, label in (
            ('Discharge_Capacity', 'Discharge'),
            ('Charge_Capacity', 'Charge'),
        )
        if column in df.columns
    ]
    if 'Cycle_Index' in df.columns and capacity_series:
        capacity_columns = [column for column, _ in capacity_series]
        cycle_data = df.groupby('Cycle_Index')[capacity_columns].max().reset_index()
        for column, label in capacity_series:
            sns.lineplot(data=cycle_data, x='Cycle_Index', y=column, ax=axes[1, 0], label=label)
        axes[1, 0].set_title('Capacity Fade')
        axes[1, 0].set_ylabel('Capacity (Ah)')

    # dV/dt vs Voltage (Differential Voltage)
    if 'dV/dt' in df.columns:
        sns.scatterplot(data=df, x='Voltage', y='dV/dt', ax=axes[1, 1], alpha=0.1, s=10)
        axes[1, 1].set_title('Differential Voltage (dV/dt vs V)')
        axes[1, 1].set_ylim(-0.1, 0.1)  # Zoom in to relevant range

    plt.tight_layout()
    plt.show()

# Produce one detailed figure per selected file.
for f in top_files:
    plot_file_data(file_name=f)

## 4. Combined Comparison

Here we overlay the voltage profiles of the same files on a single plot to spot identical or divergent datasets.

In [None]:
# Overlay the voltage trace of every selected file on one shared figure.
plt.figure(figsize=(15, 8))
for f in top_files:
    df = loader.load_data(f)
    # Files with more than 10000 rows are thinned to every 100th row so the
    # overlay stays readable; shorter files are plotted in full.
    df_plot = df.iloc[::100] if len(df) > 10000 else df
    plt.plot(df_plot['Test_Time'], df_plot['Voltage'], label=f, alpha=0.7)

# Titles, axis labels and legend, then render.
plt.title('Combined Voltage Profiles')
plt.xlabel('Test Time (s)')
plt.ylabel('Voltage (V)')
plt.legend()
plt.show()