# Battery Analytics Lab - Data Familiarization

## Phase 1: Data Ingestion & Standardization

This notebook provides initial exploration and familiarization with the processed battery data.

**Objectives:**
- Explore standardized data structure and quality
- Generate summary statistics and visualizations
- Document data quality findings and anomalies
- Review processing logs and metadata

**Generated:** 2025-12-29
**Author:** Battery Analytics Lab Team

## 1. Setup and Configuration

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Configuration: the feature schema is read once here and reused by later cells
# (the value-range validation looks up config['raw_data_schema']['value_ranges']).
config_path = "../config/feature_schema.yaml"
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding (which differs between Windows and Linux/macOS).
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

print("‚úì Libraries imported successfully")
print(f"‚úì Configuration loaded from: {config_path}")

## 2. Data Discovery and Overview

In [None]:
# Discover processed data files
standardized_dir = Path("../data/standardized/")
validated_passed_dir = Path("../data/validated/passed/")
validated_failed_dir = Path("../data/validated/failed/")


def list_parquet_files(directory):
    """Return the ``*.parquet`` files directly inside *directory*, sorted by path.

    A missing directory yields an empty list instead of raising. Sorting makes
    the inventory (and therefore the "first file" picked as the sample later in
    the notebook) reproducible, since directory listing order is not guaranteed.
    """
    if not directory.exists():
        return []
    return sorted(directory.glob("*.parquet"))


# List available files
standardized_files = list_parquet_files(standardized_dir)
passed_files = list_parquet_files(validated_passed_dir)
failed_files = list_parquet_files(validated_failed_dir)

print("üìÅ Data File Inventory:")
print(f"   Standardized files: {len(standardized_files)}")
print(f"   Validation passed: {len(passed_files)}")
print(f"   Validation failed: {len(failed_files)}")
print(f"   Total processed: {len(standardized_files)}")

## 3. Load and Examine Standardized Data

In [None]:
# Peek at one standardized file (the first in the inventory) to learn its layout
if not standardized_files:
    print("‚ùå No standardized files found. Please run the data ingestion pipeline first.")
else:
    sample_file = standardized_files[0]
    print(f"Loading sample file: {sample_file.name}")

    df_sample = pd.read_parquet(sample_file)
    print(f"\nüìä Dataset Shape: {df_sample.shape}")
    print(f"üìä Columns: {list(df_sample.columns)}")

    # Column dtypes are printed as plain text
    print("\nüîç Data Types:")
    print(df_sample.dtypes)

    # Rich (notebook-rendered) views: each heading is printed right before
    # its table is built and displayed.
    for heading, build_view in (
        ("\nüìà First 5 rows:", df_sample.head),
        ("\nüìä Basic Statistics:", df_sample.describe),
    ):
        print(heading)
        display(build_view())

## 4. Data Quality Assessment

In [None]:
# Data-quality pass over the sample frame: missing values first, then the
# per-column range checks driven by the loaded schema config.
if standardized_files:
    print("üîç Data Quality Assessment:")

    # Null count per column and its share of all rows, as a percentage
    missing_data = df_sample.isnull().sum()
    missing_percentage = (missing_data / len(df_sample)) * 100
    missing_summary = pd.DataFrame({
        'Missing_Count': missing_data,
        'Missing_Percentage': missing_percentage
    })

    print("\nüìâ Missing Data Summary:")
    has_missing = missing_summary['Missing_Count'] > 0
    display(missing_summary[has_missing])

    # Compare observed extremes with the expected bounds from the config
    value_ranges = config['raw_data_schema']['value_ranges']
    print("\nüéØ Value Range Validation:")

    for col, range_info in value_ranges.items():
        # Columns named in the schema but absent from the data are skipped
        if col not in df_sample.columns:
            continue

        observed = df_sample[col].dropna()
        if len(observed) == 0:
            continue

        min_val = observed.min()
        max_val = observed.max()
        if min_val >= range_info['min'] and max_val <= range_info['max']:
            status = "‚úì PASS"
        else:
            status = "‚ùå FAIL"

        observed_span = f"[{min_val:.3f}, {max_val:.3f}]"
        expected_span = f"[{range_info['min']}-{range_info['max']}]"
        print(f"   {col}: {observed_span} {range_info['unit']} "
              f"(Expected: {expected_span}) {status}")

## 5. Data Visualizations

In [None]:
# Four-panel overview figure of the sample data; each panel is drawn only
# when the columns it needs are present.
if not standardized_files or len(df_sample) == 0:
    print("‚ùå No data available for visualization")
else:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Battery Data Overview', fontsize=16)

    # Name the panels instead of indexing the 2x2 grid repeatedly
    (voltage_ax, current_ax), (hist_ax, phase_ax) = axes

    has_time = 'timestamp' in df_sample.columns
    has_voltage = 'voltage_v' in df_sample.columns
    has_current = 'current_a' in df_sample.columns

    # Top-left: voltage over time
    if has_time and has_voltage:
        voltage_ax.plot(df_sample['timestamp'], df_sample['voltage_v'], alpha=0.7)
        voltage_ax.set_title('Voltage vs Time')
        voltage_ax.set_xlabel('Time (s)')
        voltage_ax.set_ylabel('Voltage (V)')

    # Top-right: current over time
    if has_time and has_current:
        current_ax.plot(df_sample['timestamp'], df_sample['current_a'], alpha=0.7, color='orange')
        current_ax.set_title('Current vs Time')
        current_ax.set_xlabel('Time (s)')
        current_ax.set_ylabel('Current (A)')

    # Bottom-left: histogram of non-null voltage readings
    if has_voltage:
        hist_ax.hist(df_sample['voltage_v'].dropna(), bins=30, alpha=0.7, color='green')
        hist_ax.set_title('Voltage Distribution')
        hist_ax.set_xlabel('Voltage (V)')
        hist_ax.set_ylabel('Frequency')

    # Bottom-right: share of rows per phase type
    if 'phase_type' in df_sample.columns:
        phase_counts = df_sample['phase_type'].value_counts()
        phase_ax.pie(phase_counts.values, labels=phase_counts.index, autopct='%1.1f%%')
        phase_ax.set_title('Phase Type Distribution')

    plt.tight_layout()
    plt.show()

## 6. Processing Logs Review

In [None]:
# Review processing logs
def read_log_tail(log_path, max_entries=5):
    """Return ``(total_lines, tail)`` for the text file at *log_path*.

    ``tail`` holds the last ``max_entries`` lines (fewer when the file is
    shorter), each stripped of surrounding whitespace. Undecodable bytes are
    replaced rather than raised, so one bad byte cannot hide a whole log.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
        lines = f.readlines()
    # Guard max_entries <= 0 explicitly: lines[-0:] would return every line.
    tail = lines[-max_entries:] if max_entries > 0 else []
    return len(lines), [line.strip() for line in tail]


logs_dir = Path("../logs/")
# Sorted so the review order is stable between runs.
logs_files = sorted(logs_dir.glob("*.log")) if logs_dir.exists() else []

print("üìã Processing Logs Review:")
print(f"   Log files found: {len(logs_files)}")

for log_file in logs_files:
    print(f"\nüìÑ {log_file.name}:")
    try:
        total_lines, tail = read_log_tail(log_file)
    except OSError as e:
        print(f"   ‚ùå Error reading log: {e}")
        continue

    print(f"   Total lines: {total_lines}")

    # Show the tail as a summary, including for logs shorter than 5 lines
    # (previously those printed no entries at all).
    if tail:
        print(f"   Last {len(tail)} entries:")
        for entry in tail:
            print(f"   {entry}")

## 7. Metadata Analysis

In [None]:
# Inspect the metadata CSVs that accompany the processed data
metadata_files = dict(
    cell_registry=Path("../metadata/cell_registry.csv"),
    experiment_log=Path("../metadata/experiment_log.csv"),
)

print("üìä Metadata Analysis:")

for name, file_path in metadata_files.items():
    # Missing files are reported and skipped; the loop continues with the rest
    if not file_path.exists():
        print(f"\n‚ö†Ô∏è  {name}: File not found")
        continue

    print(f"\nüìÑ {name}:")
    try:
        df_meta = pd.read_csv(file_path)
        print(f"   Records: {len(df_meta)}")
        print(f"   Columns: {list(df_meta.columns)}")

        # Preview only when there is at least one record
        if len(df_meta) > 0:
            display(df_meta.head())
    except Exception as e:
        # Any failure while reading or previewing is reported per file
        print(f"   ‚ùå Error reading {name}: {e}")

## 8. Summary and Findings

In [None]:
# Generate summary report for Phase 1, based on the file inventory and the
# sample frame loaded earlier in the notebook.
print("üìã PHASE 1 DATA FAMILIARIZATION SUMMARY")
print("=" * 50)

if standardized_files:
    print(f"‚úì Data Files Processed: {len(standardized_files)}")
    print(f"‚úì Validation Success Rate: {len(passed_files)}/{len(standardized_files)} files")
    print(f"‚úì Sample Dataset Shape: {df_sample.shape}")

    # Completeness = share of non-null cells across the whole sample frame
    total_cells = df_sample.shape[0] * df_sample.shape[1]
    missing_cells = df_sample.isnull().sum().sum()
    completeness = (total_cells - missing_cells) / total_cells * 100

    print(f"‚úì Data Completeness: {completeness:.1f}%")

    print("\nüìà Key Findings:")
    print(f"   ‚Ä¢ Dataset contains {len(df_sample):,} data points")

    # The fallback lines run when a column is ABSENT, so they must say the data
    # is not available (they previously claimed the opposite).
    if 'timestamp' in df_sample.columns:
        # NOTE(review): the .1f format assumes a numeric timestamp column — confirm
        time_span = df_sample['timestamp'].max() - df_sample['timestamp'].min()
        print(f"   ‚Ä¢ Data spans {time_span:.1f} seconds")
    else:
        print("   ‚Ä¢ Timestamp data not available")

    if 'voltage_v' in df_sample.columns:
        print(f"   ‚Ä¢ Voltage range: {df_sample['voltage_v'].min():.2f} - {df_sample['voltage_v'].max():.2f} V")
    else:
        print("   ‚Ä¢ Voltage data not available")

    if 'phase_type' in df_sample.columns:
        phase_dist = df_sample['phase_type'].value_counts()
        print(f"   ‚Ä¢ Phase distribution: {dict(phase_dist)}")

    print("\n‚úÖ RECOMMENDATIONS:")
    print("   ‚Ä¢ Data appears suitable for downstream analysis")
    print("   ‚Ä¢ Consider investigating any validation failures")
    print("   ‚Ä¢ Proceed to Phase 2 (Cycle Analysis)")

else:
    print("‚ùå No processed data found")
    print("üîß Next Steps:")
    # These three lines had lost their print(" prefix, which made the whole
    # cell a SyntaxError.
    print("   1. Run data ingestion pipeline")
    print("   2. Check processing logs for errors")
    print("   3. Verify source data availability")

---

**Generated by Battery Analytics Lab - Phase 1 Data Familiarization**  
**Date:** 2025-12-29  
**Status:** Ready for Phase 2 (Cycle Analysis)