In [28]:
import warnings
warnings.filterwarnings('ignore')

In [29]:
import os

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, LSTM, Masking
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import shap


In [30]:
# Print DL Model Results
def dl_model_results(model, X_test, y_test, threshold=0.5):
    """Evaluate a binary classifier on held-out data and print the results.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X_test)`` that returns one score per sample.
    X_test : array-like
        Test features; handed to ``model.predict`` unchanged.
    y_test : array-like
        True binary labels for ``X_test``.
    threshold : float, default 0.5
        Scores strictly greater than this value are counted as class 1.

    Returns
    -------
    tuple
        ``(accuracy, classification_report, confusion_matrix)`` exactly as
        returned by the scikit-learn metric functions of the same names.
    """
    scores = model.predict(X_test)
    # Turn scores into hard 0/1 predictions at the chosen cut-off.
    y_pred = (scores > threshold).astype(int)

    model_accuracy = accuracy_score(y_test, y_pred)
    model_classification_report = classification_report(y_test, y_pred)
    model_confusion_matrix = confusion_matrix(y_test, y_pred)

    print(f"Accuracy: {model_accuracy:.4f}")
    print("Classification Report:")
    print(model_classification_report)
    print("Confusion Matrix:")
    print(model_confusion_matrix)
    return model_accuracy, model_classification_report, model_confusion_matrix

In [3]:
# Set the path to the dataset.
# The location can be overridden with the CAPSTONE_DATA_DIR environment variable
# so the notebook runs on other machines; the original local path is the default.
datasetspath = os.environ.get("CAPSTONE_DATA_DIR", "/Users/saransathy/WalshDBA/Capstone")

In [4]:
# Load the two pre-normalised pickles from the dataset directory.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
mv_dataset3 = pd.read_pickle("{}/battery_dataset3_normalised.pkl".format(datasetspath))
etron_df = pd.read_pickle("{}/Charge_normalised.pkl".format(datasetspath))

In [5]:
# Peek at one random row of the battery dataset
mv_dataset3.sample(n=1)

Unnamed: 0,label,mileage,capacity,car,charge_segment,Curr,SoC,Temp,Volt,MaxVolt,MinVolt,MinTemp
7048363,0,17642.592,0.0,506,76,0.544361,-2.726636,-2.382223,0.818902,0.931489,0.790114,0.94679


In [6]:
# The modelling set combines the battery dataset from the 3rd manufacturer with
# the Audi eTron data. The eTron frame therefore needs the same bookkeeping
# columns (charge segment, car, mileage, capacity, label); its battery is
# assumed to be healthy.

# Number the calendar dates of the index in order of first appearance (from 0):
# every row recorded on the same date shares one charge segment.
row_dates = etron_df.index.date
cs_map = {day: segment for segment, day in enumerate(dict.fromkeys(row_dates))}
etron_df['charge_segment'] = [cs_map[day] for day in row_dates]

# Constant columns: a car id well above every id in the battery dataset,
# zero mileage, a capacity of 95 and the '00' label.
etron_df['car'] = mv_dataset3['car'].max() + 100
etron_df['mileage'] = 0
etron_df['capacity'] = 95
etron_df['label'] = '00'

# Release Memory
del row_dates

In [7]:
# Peek at one random eTron row to confirm the new columns are present
etron_df.sample(n=1)

Unnamed: 0,Curr,Volt,SoC,Temp,charge_segment,car,mileage,capacity,label
2019-12-22 05:01:30,0.334963,0.717473,0.704492,-0.633075,5,649,0,95,0


In [8]:
# Stack the battery rows (minus the three columns the eTron data does not have)
# on top of the eTron rows, renumbering the index from 0.
df = pd.concat(
    [mv_dataset3.drop(columns=['MaxVolt', 'MinVolt', 'MinTemp']), etron_df],
    ignore_index=True,
)
df.sample(n=1)

Unnamed: 0,label,mileage,capacity,car,charge_segment,Curr,SoC,Temp,Volt
5883367,0,51771.456,38.807835,503,314,0.254016,-1.604704,-1.331636,-1.109419


In [9]:
# Spot-check one row of the appended eTron vehicle. Its id was set to
# max(battery car id) + 100, so it is the largest car id in the merged frame;
# look it up instead of hard-coding the number.
df[df['car'] == df['car'].max()].sample()

Unnamed: 0,label,mileage,capacity,car,charge_segment,Curr,SoC,Temp,Volt
22637238,0,0.0,95.0,649,43,0.334963,-0.493092,1.018867,-0.595558


In [10]:
# Inspect the column dtypes of the merged frame
df.dtypes

label              object
mileage           float64
capacity          float64
car                 int64
charge_segment     object
Curr              float64
SoC               float64
Temp              float64
Volt              float64
dtype: object

In [11]:
# Z-score standardise the four signal columns of the merged data:
# subtract the column mean and divide by the population std (ddof=0).
# NOTE(review): the statistics are computed over all rows, before the
# train/test split — confirm this is intended.
cols = ["Curr", "Volt", "SoC", "Temp"]
df[cols] = df[cols].sub(df[cols].mean()).div(df[cols].std(ddof=0))

In [12]:
# Feature-column sets, built up from the four raw signals.
cols2 = ['Curr', 'SoC', 'Temp', 'Volt']                  # signals only
cols4 = cols2 + [f'{signal}_lag' for signal in cols2]    # signals + their lagged versions
cols1 = ['mileage', 'capacity'] + cols2                  # vehicle info + signals
cols3 = ['mileage', 'capacity'] + cols4                  # vehicle info + signals + lags

In [14]:
# Prepare Data: one sequence (rows x signal columns) per (car, charge segment)
sequences, labels = [], []
for name, group in df.groupby(['car', 'charge_segment']):
    sequences.append(group[cols2].to_numpy())
    # The first row's label stands for the whole group: '10' -> 1, anything else -> 0
    labels.append(int(group['label'].iloc[0] == '10'))

In [15]:
# Sanity check: there must be exactly one label per sequence
tuple(len(part) for part in (sequences, labels))

(12430, 12430)

In [16]:
# Bring every sequence to the length of the longest one by appending
# zero-filled timesteps at the end.
X = pad_sequences(sequences, padding='post', value=0.0, dtype='float32')
y = np.asarray(labels)

# Hold out 30% of the sequences for testing, keeping the class ratio (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Release memory: only the four split arrays are needed from here on
del mv_dataset3, etron_df, df, X, y, sequences, labels

In [18]:
# Model
# Seed Python, NumPy and the Keras backend before the layers are created so
# that the initial weights are the same on every run.
keras.utils.set_random_seed(42)

model = Sequential([
    # Declare the (timesteps, features) input explicitly; Keras 3 discourages
    # passing `input_shape` to the first layer of a Sequential model.
    keras.Input(shape=(X_train.shape[1], X_train.shape[2])),
    Masking(mask_value=0.0),  # Mask the pads
    LSTM(32),
    Dense(1, activation='sigmoid')  # single probability for the binary label
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


In [19]:
# Train for 5 epochs with one sequence per gradient step.
# NOTE(review): with batch_size=1 the recorded run below needed roughly
# 70,000 s per epoch; a larger batch would be much faster but changes the
# optimisation, so it is left as is here.
model.fit(x=X_train, y=y_train, batch_size=1, epochs=5)

Epoch 1/5
8701/8701 ━━━━━━━━━━━━━━━━━━━━ 71050s 8s/step - accuracy: 0.6582 - loss: 0.6206
Epoch 2/5
8701/8701 ━━━━━━━━━━━━━━━━━━━━ 72035s 8s/step - accuracy: 0.6752 - loss: 0.6009
Epoch 3/5
8701/8701 ━━━━━━━━━━━━━━━━━━━━ 70600s 8s/step - accuracy: 0.6938 - loss: 0.5822
Epoch 4/5
8701/8701 ━━━━━━━━━━━━━━━━━━━━ 71294s 8s/step - accuracy: 0.7053 - loss: 0.5730
Epoch 5/5
8701/8701 ━━━━━━━━━━━━━━━━━━━━ 68995s 8s/step - accuracy: 0.7061 - loss: 0.5793


<keras.src.callbacks.history.History at 0x1652de5a0>

In [21]:
# Persist the trained model next to the datasets
model.save("{}/lstm_model.keras".format(datasetspath))

In [31]:
# Evaluate on the held-out split; the returned tuple is also echoed as the cell output
dl_model_results(model=model, X_test=X_test, y_test=y_test)

[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 3s/step
Accuracy: 0.7112
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.49      0.60      1619
           1       0.69      0.88      0.77      2110

    accuracy                           0.71      3729
   macro avg       0.73      0.69      0.69      3729
weighted avg       0.72      0.71      0.70      3729

Confusion Matrix:
[[ 798  821]
 [ 256 1854]]


(0.7111826226870475,
 '              precision    recall  f1-score   support\n\n           0       0.76      0.49      0.60      1619\n           1       0.69      0.88      0.77      2110\n\n    accuracy                           0.71      3729\n   macro avg       0.73      0.69      0.69      3729\nweighted avg       0.72      0.71      0.70      3729\n',
 array([[ 798,  821],
        [ 256, 1854]]))