# Data format refinement for phase 1 data
## Primary refinement of columns, data etc.
### Phase 1 data has all the voltage columns together; the data is not divided module-wise
The column names of the input dataframe are in `CANID.<column_name>` format.<br>
They are renamed, additional necessary columns are added, and so on.<br>

Note for writing: implementation for phase 1 data type

In [1]:
%reload_ext autoreload
# for auto reloading modules without kernel restart
# If this does not work to import custom modules, then restart kernel

In [2]:
# Increase the default window size for the notebook (page container uses 95% of the width).
# display/HTML are imported from IPython.display: importing them from
# IPython.core.display is deprecated and emits a DeprecationWarning.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

  from IPython.core.display import display, HTML


In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import timeit
from datetime import timedelta
from dataclasses import dataclass       # C like structure
import glob                             # finds all the pathnames matching specified pattern
import datetime as dt
import random
# Record the interpreter and library versions used for this run.
# NOTE(review): "!python" is executed by the shell, so it reports whichever python the
# shell resolves — this may differ from the interpreter running the kernel; confirm.
!python --version
print('pandas version: ' + pd.__version__)
print('numpy version: ' + np.__version__)

Python 3.9.7
pandas version: 1.4.1
numpy version: 1.21.2


In [24]:
import sys, os
# sys.path[0] is expected to be the directory of the ipynb file.
# NOTE(review): this depends on how the kernel was started — confirm for other setups.
cwd = sys.path[0]
modules_dir = os.path.join(cwd, 'my_modules')
# Only register the directory once so that re-running this cell stays idempotent
# and does not stack duplicate entries on sys.path.
if modules_dir not in sys.path:
    sys.path.append(modules_dir)
# Project-local modules living in my_modules/
import custom_plot
import data_clean
import data_preprocess
import constants

In [25]:
%matplotlib notebook
# Plot related setup: %matplotlib notebook makes plots in jupyter interactive
# Constants for plotting: integer selectors for the kind of x-axis label
# NOTE(review): presumably consumed by the custom_plot helpers; they are not
# referenced in the cells visible here — confirm.
x_label_elapsedtime = 0
x_label_datetime = 1

In [27]:
# Load the phase 1 dataframe and normalise its timestamp column
dir_path = os.path.join(cwd, 'csv')
src_path = os.path.join(dir_path, 'df_phase1.pkl')

# NOTE: unpickling can execute arbitrary code; only load pickles created by this project
df = pd.read_pickle(src_path)

# Parse the timestamps (stored as python objects) into pandas datetimes
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')

# Keep the first row seen for every timestamp, then order the rows chronologically
df = (
    df.drop_duplicates(subset=['timestamp'], keep='first')
      .sort_values(by="timestamp")
)

# Drop any timezone information so the timestamps are timezone-naive
df['timestamp'] = df['timestamp'].dt.tz_localize(None)

In [28]:
# Sanity check: timestamps must be sorted AND free of duplicates.
# is_monotonic_increasing alone also accepts repeated values, so uniqueness
# is verified explicitly as well.
timestamps = df['timestamp']
print(timestamps.is_monotonic_increasing and timestamps.is_unique)
print('rows, cols', df.shape)

True
rows, cols (1027851, 259)


In [29]:
# Change column names: remove the CAN id from the beginning (and the "_bms01" suffix)
# and use short names. Columns not listed here keep their original name.

df.rename(columns = {
    # individually named signals
    '10.current_bms01':'current',
    '20.stringvoltage_bms01':'string_V',
    '740.minsoc_bms01':'min_SOC',
    '40.minsoh_bms01':'min_SOH',
    '3c0.currentmodule1integral_bms01':'cur_integral_module1',
    '3c0.currentmodule1_bms01':'cur_module1',
    '6c0.contactorstate_bms01':'contactor_state',
    '740.ctrofftime_bms01':'contactor_off_time',
    # brick voltages: 7c0.brickvoltage001_bms01 ... 7c0.brickvoltage096_bms01 -> V1 ... V96
    **{f'7c0.brickvoltage{i:03d}_bms01': f'V{i}' for i in range(1, 97)},
    # module temperatures: 7c0.moduletemperature01_bms01 ... 7c0.moduletemperature32_bms01 -> T1 ... T32
    **{f'7c0.moduletemperature{i:02d}_bms01': f'T{i}' for i in range(1, 33)},
    }, inplace = True)



In [30]:
# Check that the column names have changed: the short names (e.g. 'current',
# 'string_V', 'T1'...) should now show up in the column index
print(df.columns)

Index(['timestamp', '0.stringstatereq_bms01_bms', 'current',
       '10.linkvoltage_bms01', '10.stringstate_bms01', '20.meansoc_bms01',
       'string_V', '30.maxsoc_bms01', '30.minsoc_bms01',
       '40.dchgintresistance_bms01',
       ...
       'T23', 'T24', 'T25', 'T26', 'T27', 'T28', 'T29', 'T30', 'T31', 'T32'],
      dtype='object', length=259)


In [31]:
# Only the average temperature per module is needed.
# The temperature columns T1..T32 are averaged in consecutive groups of four:
# T1-T4 -> Temp_mean_module1, T5-T8 -> Temp_mean_module2, ..., T29-T32 -> Temp_mean_module8
sensors_per_module = 4
temp_cols = ['T' + str(i) for i in range(1, 33)]

for first in range(0, len(temp_cols), sensors_per_module):
    module_no = first // sensors_per_module + 1
    module_cols = temp_cols[first:first + sensors_per_module]
    df['Temp_mean_module' + str(module_no)] = df[module_cols].mean(axis=1)

# The individual sensor columns are not needed once the per-module means exist
df = df.drop(columns=temp_cols)


In [32]:
# Keep only the necessary columns, in this order:
# general signals, cell voltages V1..V96, mean temperature of modules 1..8
signal_cols = ['timestamp', 'current', 'min_SOC', 'cur_integral_module1',
               'contactor_state', 'contactor_off_time']
voltage_cols = ['V' + str(i) for i in range(1, 97)]
module_temp_cols = ['Temp_mean_module' + str(i) for i in range(1, 9)]

df = df[signal_cols + voltage_cols + module_temp_cols]



In [33]:
# Test columns: show the first rows and the shape of the reduced dataframe
print(df.head(10))
print('rows, cols: ', df.shape)

                 timestamp  current  min_SOC  cur_integral_module1  \
732498 2022-01-15 20:59:16      7.6    94.92               27200.0   
732499 2022-01-15 20:59:17      7.6    94.92               27200.0   
732500 2022-01-15 20:59:18      7.6    94.92               27200.0   
735207 2022-01-15 20:59:19      7.6    94.93               27200.0   
735208 2022-01-15 20:59:20      7.7    94.93               27200.0   
732503 2022-01-15 20:59:21      7.6    94.93               27200.0   
732504 2022-01-15 20:59:22      7.6    94.93               27200.0   
732505 2022-01-15 20:59:23      7.6    94.93               27200.0   
732506 2022-01-15 20:59:24      7.6    94.94               27200.0   
735213 2022-01-15 20:59:25      7.8    94.94               27300.0   

        contactor_state  contactor_off_time      V1      V2      V3      V4  \
732498              2.0                 0.0  4.1014  4.1011  4.1013  4.1014   
732499              2.0                 0.0  4.1011  4.1010  4.1011  4.

In [34]:
# Remove SNA voltages, i.e. interpolate voltages where they are out of limit.
# Each column is passed to data_preprocess.interpolate_beyond_limit together with its
# lower/upper limit from constants; the helper returns the updated dataframe.

for i in range(1, constants.NUM_CELLS+1):   # The voltage columns are V1, V2, ..., V96
    df = data_preprocess.interpolate_beyond_limit(df, 'V'+str(i), constants.VOLTAGE_LOWER_LIMIT, constants.VOLTAGE_UPPER_LIMIT)

# Also interpolate SNA current. This does not depend on the cell index, so it runs
# exactly once after the loop instead of being repeated on every iteration.
df = data_preprocess.interpolate_beyond_limit(df, 'current', constants.CURRENT_LOWER_LIMIT, constants.CURRENT_UPPER_LIMIT)

In [35]:
# Test whether the SNA removal was successful: print the smallest and the largest
# current left in the dataframe (read-only access, so no copy of the column is needed)
current_series = df["current"]
min_cur, max_cur = current_series.min(), current_series.max()
print(min_cur, max_cur)

-267.2 117.9


In [36]:
# Print the row and column count of the dataframe after the SNA interpolation
print('rows, cols: ', df.shape)

rows, cols:  (1027851, 110)


In [37]:
# Test max and min voltage over all cell voltage columns (V1, V2, ...)
voltage_cols = ['V' + str(i) for i in range(1, constants.NUM_CELLS + 1)]
max_volts = [df[col].max() for col in voltage_cols]     # per-cell maxima
min_volts = [df[col].min() for col in voltage_cols]     # per-cell minima

print('maximum voltage: ', max(max_volts))
print('minimum voltage: ', min(min_volts))

maximum voltage:  4.1514
maximum voltage:  3.3515


In [38]:
# Fill NaN values by carrying the last valid value of each column forward.
# The fill is applied unconditionally so that gaps in any column are covered;
# it is a no-op when nothing is missing.
# Note: rows before the first valid value of a column remain NaN after a forward fill.
df = df.ffill()

In [39]:
# Save the refined dataframe as a pickle in the csv directory (df checkpoint)
dir_path = os.path.join(cwd, 'csv')
# NOTE: despite its name, src_path is the destination file that is written below
src_path = os.path.join(dir_path, 'df_phase1_refined.pkl')

df.to_pickle(src_path)