<a href="https://colab.research.google.com/github/shruti-sriniv/Battery-Failure-Modelling/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the device data CSV into a DataFrame.
df = pd.read_csv('device_data.csv')
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 17 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   tape_id          100000 non-null  object 
 1   ts               100000 non-null  int64  
 2   e0               13516 non-null   float64
 3   l0               13516 non-null   float64
 4   t0               100000 non-null  float64
 5   rssi0            100000 non-null  int64  
 6   clat             100000 non-null  float64
 7   clon             100000 non-null  float64
 8   timeoutperiod    100000 non-null  int64  
 9   nlps             100000 non-null  int64  
 10  facility         99067 non-null   object 
 11  activation_date  99775 non-null   object 
 12  bat              100000 non-null  float64
 13  hw_version       84979 non-null   object 
 14  fw_version       84979 non-null   object 
 15  otaflag          100000 non-null  bool   
 16  application_id   99986 non-null   objec

In [None]:
# Parse the two time columns into pandas datetimes.
# `ts` is numeric and is read as seconds since the Unix epoch; the parsed,
# UTC-aware values are stored in a new column, `ts_utc`.
epoch_seconds = df['ts']
df['ts_utc'] = pd.to_datetime(epoch_seconds, unit='s', utc=True)

# `activation_date` holds strings; the column is replaced by its parsed form.
# NOTE(review): no `utc=True` is passed here, so this column presumably stays
# timezone-naive while `ts_utc` is timezone-aware -- confirm before comparing them.
activation_raw = df['activation_date']
df['activation_date'] = pd.to_datetime(activation_raw)

In [None]:
# Mark where each row's early-life window stops: the activation timestamp of
# the record's device shifted forward by a fixed two-week span.
early_life_span = timedelta(weeks=2)
df['early_life_window_end'] = df['activation_date'].add(early_life_span)

In [None]:
# Preview the first 100 rows. Leaving the frame as the cell's final expression
# lets the notebook render it as a table; print() falls back to plain text,
# which wraps the columns across several backslash-continued chunks.
df.head(100)

         tape_id          ts   e0     l0       t0  rssi0       clat  \
0   CBBC86FB37E4  1750292082  0.0 -255.0    23.73   -100   0.000000   
1   E76D64301481  1750292165  NaN    NaN  9999.00    -87  40.706474   
2   E76D64301481  1750292165  NaN    NaN  9999.00    -87  40.706474   
3   E563E46C4F0D  1750292167  NaN    NaN  9999.00    -81  40.706474   
4   E563E46C4F0D  1750292167  NaN    NaN  9999.00    -81  40.706474   
..           ...         ...  ...    ...      ...    ...        ...   
95  E76D64301481  1750292411  NaN    NaN  9999.00    -85  40.706474   
96  E76D64301481  1750292411  NaN    NaN  9999.00    -85  40.706474   
97  C86BA1BBD7A3  1750292413  NaN    NaN  9999.00    -87  40.706474   
98  CD37CDC4684E  1750292414  NaN    NaN  9999.00    -86  40.706474   
99  E4DCEBAC5ED9  1750292414  NaN    NaN  9999.00    -85  40.706474   

         clon  timeoutperiod  nlps                              facility  \
0    0.000000           1800     1  6735A55C-B4C2-4193-A2C2-2AC18C23ED3

In [None]:
# Some of the `ts` values are negative or fall in the 1970s. These rows are
# dropped because the devices were only activated after 2023
# (cutoff used: 2023-01-01 00:00 UTC, matching the UTC-aware `ts_utc` column).
min_valid_ts = pd.Timestamp('2023-01-01', tz='UTC')
# .copy() makes the filtered frame independent of the original, so adding
# columns to it in later cells does not trigger SettingWithCopyWarning.
df = df[df['ts_utc'] >= min_valid_ts].copy()
# Final expression of the cell: rendered by the notebook as a table.
df.head()

        tape_id          ts   e0     l0       t0  rssi0       clat       clon  \
0  CBBC86FB37E4  1750292082  0.0 -255.0    23.73   -100   0.000000   0.000000   
1  E76D64301481  1750292165  NaN    NaN  9999.00    -87  40.706474 -74.010361   
2  E76D64301481  1750292165  NaN    NaN  9999.00    -87  40.706474 -74.010361   
3  E563E46C4F0D  1750292167  NaN    NaN  9999.00    -81  40.706474 -74.010361   
4  E563E46C4F0D  1750292167  NaN    NaN  9999.00    -81  40.706474 -74.010361   

   timeoutperiod  nlps                              facility  \
0           1800     1  6735A55C-B4C2-4193-A2C2-2AC18C23ED36   
1           3600    12  0F028EE2-3BF1-45CF-85ED-297A337F4991   
2           3600    12  0F028EE2-3BF1-45CF-85ED-297A337F4991   
3           3600    12  0F028EE2-3BF1-45CF-85ED-297A337F4991   
4           3600    12  0F028EE2-3BF1-45CF-85ED-297A337F4991   

      activation_date    bat   hw_version  fw_version  otaflag  \
0 2025-06-04 17:59:15  3.011  ONYX0208071  GBT01010B0     True

In [None]:
# Create a dataframe with the most recent timestamp and battery, as well as the aggregate of the other features
