# Battery Management System Dataset - Exploratory Data Analysis (EDA)

This notebook performs a comprehensive exploratory data analysis on the Battery Management System (BMS) dataset from the `cleaned_dataset/data/` directory. The BMS dataset contains multiple CSV files with battery cycle data including current, voltage, and temperature measurements.


In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn's white grid theme and a 12x6 inch default figure.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")


## Load Data Files
Loading all CSV files from the cleaned_dataset/data directory and combining them.


In [None]:
# Load all data files from cleaned_dataset/data directory
data_dir = Path('/home/harshit/Documents/bms/cleaned_dataset/data')
# Sorted so the files are always processed in the same (name) order.
csv_files = sorted(data_dir.glob('*.csv'))

print(f"Found {len(csv_files)} CSV files in the data directory")

# Fail with an explicit message rather than an IndexError on csv_files[0] below.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {data_dir}")

# Preview at most five file names, and only mention a remainder when there is
# one, so a short listing never reports a negative "more" count.
preview_names = [f.name for f in csv_files[:5]]
remaining = len(csv_files) - len(preview_names)
if remaining > 0:
    print(f"Files: {preview_names} ... and {remaining} more")
else:
    print(f"Files: {preview_names}")

# Load first file to understand structure
first_df = pd.read_csv(csv_files[0])
print(f"\nFirst file ({csv_files[0].name}) shape: {first_df.shape}")
print(f"Columns: {first_df.columns.tolist()}")
print("\nFirst few rows:")
print(first_df.head())


In [None]:
# Read every CSV and tag its rows with the originating file's name (without
# the extension) in a 'file_id' column, collecting the frames as we go.
dfs = []
for csv_file in csv_files:
    df = pd.read_csv(csv_file).assign(file_id=csv_file.stem)
    dfs.append(df)

# Stack the per-file frames into one frame with a fresh 0..N-1 index.
combined_df = pd.concat(dfs, ignore_index=True)
total_rows, total_columns = combined_df.shape
print(f"Combined dataset shape: {combined_df.shape}")
print(f"Total rows: {total_rows:,}")
print(f"Total columns: {total_columns}")


## Display Basic Data Information
Examining the structure, data types, and content of the dataset.


In [None]:
# Display dataset information
print("=" * 80)
print("DATASET INFORMATION")
print("=" * 80)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
combined_df.info()
print("\n" + "=" * 80)
print("FIRST FEW ROWS")
print("=" * 80)
print(combined_df.head(10))


## Generate Descriptive Statistics
Statistical summary of numerical features including mean, median, std, and quantiles.


In [None]:
# Summary statistics for the numerical columns, transposed so that each
# column of the dataset becomes one row of the printed table.
print("=" * 80, "DESCRIPTIVE STATISTICS", "=" * 80, sep="\n")
print(combined_df.describe().T)

# How many columns there are of each dtype.
print("\n" + "=" * 80, "DATA TYPES COUNT", "=" * 80, sep="\n")
print(combined_df.dtypes.value_counts())


## Check for Missing Values
Identifying missing values in the dataset and visualizing their distribution.


In [None]:
# Check for missing values
print("=" * 80)
print("MISSING VALUES ANALYSIS")
print("=" * 80)

# Count nulls per column once and derive every other figure from that Series
# instead of rescanning the whole frame for each statistic.
missing_values = combined_df.isnull().sum()
missing_percent = (missing_values / len(combined_df)) * 100
missing_df = pd.DataFrame({
    'Column': combined_df.columns,
    'Missing Count': missing_values.values,
    'Missing Percentage': missing_percent.values
}).sort_values('Missing Count', ascending=False)
total_missing = missing_values.sum()

print(missing_df)
print(f"\nTotal missing values: {total_missing:,}")

# Visualize missing values (only columns that actually have gaps)
if total_missing > 0:
    plt.figure(figsize=(12, 6))
    missing_percent[missing_percent > 0].sort_values(ascending=False).plot(kind='barh')
    plt.xlabel('Percentage Missing (%)')
    plt.title('Missing Values Distribution')
    plt.tight_layout()
    plt.show()
else:
    # The check mark was previously stored as mis-decoded UTF-8 bytes.
    print("\n✓ No missing values detected in the dataset!")


## Visualize Data Distributions
Creating histograms and box plots for numerical features.


In [None]:
# Keep only the numeric columns; non-numeric ones (such as the string
# file_id tag) are left out by select_dtypes.
numerical_cols = combined_df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numerical columns: {numerical_cols}")

# One histogram per numeric column, laid out on a grid three panels wide.
if numerical_cols:
    n_cols = 3
    n_rows = -(-len(numerical_cols) // n_cols)  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows))
    axes = axes.flatten()

    # zip stops at the last column, so surplus panels are left untouched here.
    for ax, col in zip(axes, numerical_cols):
        ax.hist(combined_df[col].dropna(), bins=50, edgecolor='black', alpha=0.7)
        ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.grid(axis='y', alpha=0.3)

    # Blank out the grid cells that have no column to show.
    for ax in axes[len(numerical_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()


In [None]:
# Draw one box plot per numeric column, side by side in a single row.
if numerical_cols:
    column_count = len(numerical_cols)
    fig, axes = plt.subplots(1, column_count, figsize=(15, 5))
    # A single panel comes back as a bare Axes; wrap it so it can be iterated.
    if column_count == 1:
        axes = [axes]

    for ax, col in zip(axes, numerical_cols):
        ax.boxplot(combined_df[col].dropna())
        ax.set_title(f'Box Plot of {col}', fontsize=12, fontweight='bold')
        ax.set_ylabel(col)
        ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()


## Explore Correlations and Relationships
Creating correlation matrices and heatmaps to identify relationships between variables.


In [None]:
# Pairwise correlations between the numeric columns (non-numeric columns are
# dropped by select_dtypes).
numerical_df = combined_df.select_dtypes(include=[np.number])

if len(numerical_df.columns) <= 1:
    print("Not enough numerical columns to calculate correlation")
else:
    correlation_matrix = numerical_df.corr()

    # Annotated heatmap with the colour scale centred on zero.
    heatmap_options = dict(
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, **heatmap_options)
    plt.title('Correlation Matrix Heatmap', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    print("\nCorrelation Matrix:")
    print(correlation_matrix)

    # List the strongly correlated pairs.
    print("\n" + "=" * 80)
    print("STRONG CORRELATIONS (|r| > 0.7)")
    print("=" * 80)

    # Walk the upper triangle only, so each pair is visited once and the
    # diagonal (a column against itself) is skipped.
    features = correlation_matrix.columns
    strong_corr = []
    for row_pos, first_feature in enumerate(features):
        for col_pos in range(row_pos + 1, len(features)):
            r = correlation_matrix.iloc[row_pos, col_pos]
            if abs(r) > 0.7:
                strong_corr.append({
                    'Feature 1': first_feature,
                    'Feature 2': features[col_pos],
                    'Correlation': r
                })

    if not strong_corr:
        print("No strong correlations found (|r| > 0.7)")
    else:
        # Order by magnitude so strong negative pairs rank beside positive ones.
        strong_corr_df = pd.DataFrame(strong_corr).sort_values('Correlation', key=abs, ascending=False)
        print(strong_corr_df)


## Data Quality Summary
Summary of key data quality metrics and insights.
