In [1]:
import os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

In [2]:
#Sets a random state for reproducibility (match prior scripts' style)
RANDOM_STATE = 230


In [3]:
#Columns to remove (as provided)
DROP_COLS = [
    'Ambient Temperature',
    'Ambient Humidity',
    'Load Weight',
    'Driving Speed',
    'Distance Traveled',
    'Idle Time',
    'Route Roughness',
    'Timestamp',
]


In [4]:
df = pd.read_csv('EV_Predictive_Maintenance_Dataset_15min.csv')


In [5]:
df.head()

Unnamed: 0,Timestamp,SoC,SoH,Battery_Voltage,Battery_Current,Battery_Temperature,Charge_Cycles,Motor_Temperature,Motor_Vibration,Motor_Torque,...,Load_Weight,Driving_Speed,Distance_Traveled,Idle_Time,Route_Roughness,RUL,Failure_Probability,Maintenance_Type,TTF,Component_Health_Score
0,2020-01-01 00:00:00,0.826099,0.941338,210.163881,-22.753095,27.149201,149.19093,48.496049,0.369095,113.435589,...,741.754518,103.421162,66.232383,0.520922,0.22597,260.503381,0,1,111.116697,0.852745
1,2020-01-01 00:15:00,0.064728,0.916059,364.000102,-27.70112,53.655101,171.702388,57.829492,1.449195,105.58716,...,769.134035,46.041935,3.146238,0.844005,0.20435,212.813954,0,2,179.229425,0.827616
2,2020-01-01 00:30:00,0.873643,0.90802,388.855089,-36.646406,29.55909,191.617645,46.518363,1.859045,119.610302,...,917.262931,59.588422,79.909148,0.992405,0.175125,273.394511,0,1,171.852663,0.876887
3,2020-01-01 00:45:00,0.853009,0.916476,370.570602,-37.609429,29.690283,111.881817,54.163681,0.3815,182.535625,...,600.598736,44.222285,0.774,0.007615,0.213264,229.508442,0,0,165.221328,0.81629
4,2020-01-01 01:00:00,0.94754,0.913206,390.011904,-14.275808,28.864338,163.774377,42.075978,0.433927,173.298044,...,613.153029,41.374684,2.872124,0.771938,0.770257,257.302631,1,0,176.890659,0.74426


In [6]:
orig_shape = df.shape

In [7]:
#Normalize header spacing to align with provided column names (keep values unchanged)
df.columns = [c.strip() for c in df.columns]

In [8]:
#Drops irrelevant columns from the dataframe (ignore if any are absent)
to_drop = []
normalized = {c.replace('_', '').replace(' ', '').lower(): c for c in df.columns}
for col in DROP_COLS:
    key = col.replace('_', '').replace(' ', '').lower()
    if key in normalized:
        to_drop.append(normalized[key])

if to_drop:
    df.drop(columns=to_drop, inplace=True, errors='ignore')
print(f"Dropped columns: {to_drop if to_drop else 'None'}")

Dropped columns: ['Ambient_Temperature', 'Ambient_Humidity', 'Load_Weight', 'Driving_Speed', 'Distance_Traveled', 'Idle_Time', 'Route_Roughness', 'Timestamp']


In [9]:
#Randomize the row order with a fixed seed and reset the index
df = df.sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)


In [10]:
# Saves the modified dataframe to a new CSV file
df.to_csv(str('EV_cleaned.csv'), index=False)

In [11]:
# Perform 80/10/10 split on the cleaned dataframe
EV_train, remaining_df = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)
EV_val, EV_test = train_test_split(remaining_df, test_size=0.5, random_state=RANDOM_STATE)


In [12]:
# Build split file paths based on the cleaned output path



EV_train.to_csv()
EV_val.to_csv()
EV_test.to_csv()

',SoC,SoH,Battery_Voltage,Battery_Current,Battery_Temperature,Charge_Cycles,Motor_Temperature,Motor_Vibration,Motor_Torque,Motor_RPM,Power_Consumption,Brake_Pad_Wear,Brake_Pressure,Reg_Brake_Efficiency,Tire_Pressure,Tire_Temperature,Suspension_Load,RUL,Failure_Probability,Maintenance_Type,TTF,Component_Health_Score\r\n99271,0.9815413046612204,0.953818910697846,357.9453485005933,-12.368996888218303,53.27691568008468,143.51090819309482,98.39131642360702,0.4377262780185897,187.3498247836961,1953.7676883556123,23.480628090848025,0.1729700265514576,49.17115383642083,0.822105478410096,21.562892772646485,34.59082265021839,490.4912429521699,218.82134647262905,0,0,159.48844673879094,0.0104723251218909\r\n33125,0.1782246615265053,0.9171671490078,359.0022132252168,-49.33774843404535,34.19724407936843,670.9624329419207,42.96099107885807,0.380955202777943,118.19620576579202,1818.10508451471,23.60961881658536,0.2793646062763891,89.28460565173263,0.8887219817990595,34.901766236547715,31.2830542910378

In [13]:

print("Saved splits:")
print(f"  Train (80%): {EV_train.shape}")
print(f"  Validation (10%): {EV_val.shape}")
print(f"  Test (10%): {EV_test.shape}")

Saved splits:
  Train (80%): (140314, 22)
  Validation (10%): (17539, 22)
  Test (10%): (17540, 22)
