In [11]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Number of samples
n = 1000

# Battery status: 0 = Good, 1 = Mid, 2 = Risk
battery_status = np.random.choice([0, 1, 2], size=n, p=[0.35, 0.35, 0.3])

# Simulate features based on battery status
soc = np.where(battery_status == 0, np.random.uniform(70, 100, size=n),  # Good
               np.where(battery_status == 1, np.random.uniform(40, 70, size=n),  # Mid
                        np.random.uniform(10, 40, size=n)))  # Risk

cycle_count = np.where(battery_status == 0, np.random.randint(100, 500, size=n),  # Good
                       np.where(battery_status == 1, np.random.randint(500, 1000, size=n),  # Mid
                                np.random.randint(1000, 1500, size=n)))  # Risk

temperature = np.where(battery_status == 0, np.random.uniform(15, 30, size=n),  # Good
                       np.where(battery_status == 1, np.random.uniform(30, 40, size=n),  # Mid
                                np.random.uniform(40, 60, size=n)))  # Risk

voltage = np.where(battery_status == 0, np.random.uniform(3.5, 4.2, size=n),  # Good
                   np.where(battery_status == 1, np.random.uniform(3.2, 3.5, size=n),  # Mid
                            np.random.uniform(2.8, 3.2, size=n)))  # Risk

current = np.where(battery_status == 0, np.random.uniform(1, 10, size=n),  # Good
                   np.where(battery_status == 1, np.random.uniform(10, 20, size=n),  # Mid
                            np.random.uniform(20, 30, size=n)))  # Risk

# Introduce outliers (randomly multiply some values by extreme factors)
outlier_indices = np.random.choice(n, size=int(n * 0.03), replace=False)  # 3% of the data
cycle_count[outlier_indices] = cycle_count[outlier_indices] * np.random.uniform(2, 5, size=len(outlier_indices))
temperature[outlier_indices] = temperature[outlier_indices] * np.random.uniform(2, 3, size=len(outlier_indices))

# Introduce some false correlations (swapping values between good and risk categories)
false_relation_indices = np.random.choice(n, size=int(n * 0.1), replace=False)  # 10% of the data
voltage[false_relation_indices] = np.where(battery_status[false_relation_indices] == 0, 
                                           np.random.uniform(2.5, 3.0, size=len(false_relation_indices)),  # Bad voltage for good
                                           np.random.uniform(4.0, 4.5, size=len(false_relation_indices)))  # Good voltage for risk

# Introduce missing values randomly (5% of data for each feature)
missing_value_indices = np.random.choice(n, size=int(n * 0.05), replace=False)

soc[missing_value_indices]=soc[missing_value_indices].astype('float')
cycle_count[missing_value_indices]=cycle_count[missing_value_indices].astype('float')
temperature[missing_value_indices]=temperature[missing_value_indices].astype('float')
voltage[missing_value_indices]=voltage[missing_value_indices].astype('float')
current[missing_value_indices]=current[missing_value_indices].astype('float')


soc[missing_value_indices] = np.nan
temperature[missing_value_indices] = np.nan
voltage[missing_value_indices] = np.nan
current[missing_value_indices] = np.nan

# Health Status (0 = Good, 1 = Mid, 2 = Risk)



# Create a DataFrame
df = pd.DataFrame({
    'State_of_Charge (%)': np.round(soc, 2),
    'Cycle_Count': cycle_count,
    'Temperature (°C)': np.round(temperature, 2),
    'Voltage (V)': np.round(voltage, 2),
    'Current (A)': np.round(current, 2),
    'Health_Status':battery_status  # 0 = Good, 1 = Mid, 2 = Risk
})

df['Health_Status']=df['Health_Status'].map({0:"good",1:"mid",2:"risk"})

# Save the dataset to a CSV file
df.to_csv('ev_battery_data.csv', index=False)



In [12]:
df.head()

Unnamed: 0,State_of_Charge (%),Cycle_Count,Temperature (°C),Voltage (V),Current (A),Health_Status
0,47.85,884,37.44,3.27,17.58,mid
1,33.9,1262,57.42,3.08,28.96,risk
2,17.51,1039,46.85,2.84,28.62,risk
3,47.49,699,33.46,3.34,10.72,mid
4,94.2,126,19.29,2.91,2.3,good


In [13]:
df.isna().sum()

State_of_Charge (%)    50
Cycle_Count             0
Temperature (°C)       50
Voltage (V)            50
Current (A)            50
Health_Status           0
dtype: int64

In [14]:
df['Health_Status'].value_counts()

Health_Status
good    365
mid     347
risk    288
Name: count, dtype: int64