<a href="https://colab.research.google.com/github/shruti-sriniv/Battery-Failure-Modelling/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [49]:
# Load in the data
df = pd.read_csv('device_data.csv')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 17 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   tape_id          100000 non-null  object 
 1   ts               100000 non-null  int64  
 2   e0               13516 non-null   float64
 3   l0               13516 non-null   float64
 4   t0               100000 non-null  float64
 5   rssi0            100000 non-null  int64  
 6   clat             100000 non-null  float64
 7   clon             100000 non-null  float64
 8   timeoutperiod    100000 non-null  int64  
 9   nlps             100000 non-null  int64  
 10  facility         99067 non-null   object 
 11  activation_date  99775 non-null   object 
 12  bat              100000 non-null  float64
 13  hw_version       84979 non-null   object 
 14  fw_version       84979 non-null   object 
 15  otaflag          100000 non-null  bool   
 16  application_id   99986 non-null   objec

In [50]:
# Converting ts (numeric) and activation_date (string) to datetime
df['timestamp_utc'] = pd.to_datetime(df['ts'], unit='s', utc=True)
df['activation_date'] = pd.to_datetime(df['activation_date']).dt.tz_localize(None)
df['timestamp_utc'] = pd.to_datetime(df['timestamp_utc']).dt.tz_localize(None)

In [51]:
# Some of the ts values are negative or are in the 1970s. These values need to be dropped from the dataframe as the devices were only activated after 2023
df = df[df['timestamp_utc'] >= '2023-01-01']
print(df.head())

        tape_id          ts   e0     l0       t0  rssi0       clat       clon  \
0  CBBC86FB37E4  1750292082  0.0 -255.0    23.73   -100   0.000000   0.000000   
1  E76D64301481  1750292165  NaN    NaN  9999.00    -87  40.706474 -74.010361   
2  E76D64301481  1750292165  NaN    NaN  9999.00    -87  40.706474 -74.010361   
3  E563E46C4F0D  1750292167  NaN    NaN  9999.00    -81  40.706474 -74.010361   
4  E563E46C4F0D  1750292167  NaN    NaN  9999.00    -81  40.706474 -74.010361   

   timeoutperiod  nlps                              facility  \
0           1800     1  6735A55C-B4C2-4193-A2C2-2AC18C23ED36   
1           3600    12  0F028EE2-3BF1-45CF-85ED-297A337F4991   
2           3600    12  0F028EE2-3BF1-45CF-85ED-297A337F4991   
3           3600    12  0F028EE2-3BF1-45CF-85ED-297A337F4991   
4           3600    12  0F028EE2-3BF1-45CF-85ED-297A337F4991   

      activation_date    bat   hw_version  fw_version  otaflag  \
0 2025-06-04 17:59:15  3.011  ONYX0208071  GBT01010B0     True

In [52]:
# Sort the data by tape_id and then by ts_utc
df = df.sort_values(by=['tape_id', 'timestamp_utc'])

In [53]:
# Define early life battery failure
# Calculate the time difference from activation for each measurement
df['time_since_activation_days'] = (df['timestamp_utc'] - df['activation_date']).dt.total_seconds() / (24 * 3600)

# Identify potential failure points (voltage < 2.6)
df['is_low_voltage'] = (df['bat'] < 2.6).astype(int)

In [54]:
# Group by tape_id and check for early life failure
early_failure_status = df.groupby('tape_id').apply(
    lambda x: ((x['is_low_voltage'] == 1) & (x['time_since_activation_days'] <= 14)).any()
).astype(int)

  early_failure_status = df.groupby('tape_id').apply(


In [55]:
# Correct the dataframe by adding in the new column and removing the interim ones
# .map() is ideal here as it aligns values based on the 'tape_id' column
df['early_life_failure'] = df['tape_id'].map(early_failure_status)

# You can drop the intermediate columns if not needed
df = df.drop(columns=['time_since_activation_days', 'is_low_voltage'])

Prepping the data for the LSTM Model