# Preprocess phase 2 data: module-wise collected data

In [1]:
%reload_ext autoreload
# for auto reloading modules without kernel restart
# If this does not work to import custom modules, then restart kernel

In [2]:
# increase default window size for notebook
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

  from IPython.core.display import display, HTML


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import timeit
from dataclasses import dataclass       # C like structure
import glob                             # finds all the pathnames matching specified pattern
import datetime as dt
import random
!python --version
print('pandas version: ' + pd.__version__)
print('numpy version: ' + np.__version__)

Python 3.9.7
pandas version: 1.4.2
numpy version: 1.21.5


In [4]:
import sys, os
cwd = sys.path[0]
sys.path.append(os.path.join(cwd, 'my_modules'))                # sys.path[0] is dir of the ipynb file
import custom_plot
import data_clean
import data_preprocess
import constants

imported custom_plot.py
data_clean imported
data_preprocess imported


In [5]:
%matplotlib notebook
# Plot related packages,%matplotlib notebook makes plots in jupyter interactive

In [8]:
import glob                             # finds all the pathnames matching specified pattern

path = r'csv/module_1' # path for module 1
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    frame = data_clean.get_clean_data(df)
    li.append(frame)

frame = pd.concat(li, axis=0, ignore_index=True)

In [9]:
# remove duplicate entries
frame.drop_duplicates(subset=['timestamp'], inplace=True, keep='first')


### Save joined module 1 data

In [23]:
# save df as pkl file
dir_path = os.path.join(cwd, 'csv', 'module_1')
src_path = os.path.join(dir_path, 'module1_intact.pkl')

frame.to_pickle(src_path)

### Load joined module 1 data

In [49]:
# load from pkl and convert timestamp
dir_path = os.path.join(cwd, 'csv', 'module_1')
src_path = os.path.join(dir_path, 'module1_intact.pkl')

df = pd.read_pickle(src_path)

# Convert timestamp from python obj to datetime and sort
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')
df.drop_duplicates(subset=['timestamp'], inplace=True, keep='first')     # remove duplicate entries
df = df.sort_values(by='timestamp')
df['timestamp'] = df['timestamp'].dt.tz_localize(None)          # remove UTC parts

In [50]:
# Test if no duplicates and increasing
print(df['timestamp'].is_monotonic_increasing)
print('rows, cols', df.shape)
print(df.columns)

True
rows, cols (4257220, 37)
Index(['timestamp', '10.current_bms01', '20.stringvoltage_bms01',
       '30.minsoc_bms01', '40.minsoh_bms01',
       '3c0.currentmodule1integral_bms01', '3c0.currentmodule1_bms01',
       '3c0.currentmodule2integral_bms01', '3c0.currentmodule2_bms01',
       '3c0.currentmodule3integral_bms01', '3c0.currentmodule3_bms01',
       '6c0.contactorstate_bms01', '740.sococvinit_bms01',
       '740.ctrofftime_bms01', '740.lifetime_bms01', '740.minsoc_bms01',
       '740.sleeptime_bms01', '780.bmsonboardtemperature_bms01',
       '780.cmbonboardtemperature01_bms01', '7c0.brickvoltage001_bms01',
       '7c0.brickvoltage002_bms01', '7c0.brickvoltage003_bms01',
       '7c0.brickvoltage004_bms01', '7c0.brickvoltage005_bms01',
       '7c0.brickvoltage006_bms01', '7c0.brickvoltage007_bms01',
       '7c0.brickvoltage008_bms01', '7c0.brickvoltage009_bms01',
       '7c0.brickvoltage010_bms01', '7c0.brickvoltage011_bms01',
       '7c0.brickvoltage012_bms01', '7c0.moduletemp

In [51]:
# Change column names, remove the can id from the begining
# print(df.columns)

df.rename(columns = {
    '10.current_bms01':'current',
    '20.stringvoltage_bms01':'string_V',
    '740.minsoc_bms01':'min_SOC',
    '40.minsoh_bms01':'min_SOH',
    '3c0.currentmodule1integral_bms01':'cur_integral_module1',
    '3c0.currentmodule1_bms01':'cur_module1',
    '6c0.contactorstate_bms01':'contactor_state',
    '740.ctrofftime_bms01':'contactor_off_time',
    '7c0.brickvoltage001_bms01':'V1',
    '7c0.brickvoltage002_bms01':'V2',
    '7c0.brickvoltage003_bms01':'V3',
    '7c0.brickvoltage004_bms01':'V4',
    '7c0.brickvoltage005_bms01':'V5',
    '7c0.brickvoltage006_bms01':'V6',
    '7c0.brickvoltage007_bms01':'V7',
    '7c0.brickvoltage008_bms01':'V8',
    '7c0.brickvoltage009_bms01':'V9',
    '7c0.brickvoltage010_bms01':'V10',
    '7c0.brickvoltage011_bms01':'V11',
    '7c0.brickvoltage012_bms01':'V12',
    '7c0.moduletemperature01_bms01':'T1',
    '7c0.moduletemperature02_bms01':'T2',
    '7c0.moduletemperature03_bms01':'T3',
    '7c0.moduletemperature04_bms01':'T4',
    }, inplace = True)



In [52]:
# average module temp is only necessary
df['Temp_mean'] = df[['T1', 'T2', 'T3', 'T4']].mean(axis=1)

### Save renamed module 1 data

In [54]:
# Save df with average temperature
dir_path = os.path.join(cwd, 'csv', 'module_1')
src_path = os.path.join(dir_path, 'module1_updated_cols.pkl')

df.to_pickle(src_path)

### Load data

In [56]:
# load last saved df and convert time stamp and sort
path = os.path.join(sys.path[0], 'csv/module_1')
df = pd.read_pickle(path + '/module1_updated_cols.pkl')
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')
df = df.sort_values(by='timestamp')
df['timestamp'] = df['timestamp'].dt.tz_localize(None)          # remove UTC parts

In [57]:
# remove all unnecessary columns
df.drop(['T1', 'T2', 'T3', 'T4', 'string_V', '30.minsoc_bms01', 'min_SOH', 'cur_integral_module1',
        'cur_module1', '3c0.currentmodule2integral_bms01', '3c0.currentmodule2_bms01', 
         '3c0.currentmodule3_bms01', '740.sococvinit_bms01', 'contactor_off_time',
         '740.lifetime_bms01', '740.sleeptime_bms01', '780.bmsonboardtemperature_bms01',
         '780.cmbonboardtemperature01_bms01', '740.invalidmoduletempflags_bms01', 
         '740.invalidmodulevoltageflags_bms01', '3c0.currentmodule3integral_bms01'], axis=1, inplace=True)

In [58]:
# Remove SNA voltages i.e. interpolate voltages where they are out of limit, also for current

NUM_CELLS_MODULE_1 = 12
for i in range(1, NUM_CELLS_MODULE_1 + 1):   # The voltage columns are V1, V2, ..., V96
    df = data_preprocess.interpolate_beyond_limit(df, 'V'+str(i), constants.VOLTAGE_LOWER_LIMIT, constants.VOLTAGE_UPPER_LIMIT)
df = data_preprocess.interpolate_beyond_limit(df, 'current', constants.CURRENT_LOWER_LIMIT, constants.CURRENT_UPPER_LIMIT)

In [61]:
# test whether SNA removal was successful
col_cur = df["current"].copy()
col_volt = df["V1"].copy()
max_cur = col_cur.max()
min_cur = col_cur.min()
max_volt = col_volt.max()
min_volt = col_volt.min()
print(min_cur, max_cur, max_volt, min_volt)
print('rows, cols: ', df.shape)

-274.2 139.3 4.1526 3.2431
rows, cols:  (4257220, 17)


In [62]:
df.columns

Index(['timestamp', 'current', 'contactor_state', 'min_SOC', 'V1', 'V2', 'V3',
       'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'Temp_mean'],
      dtype='object')

In [64]:
# Fill NAN values
if (df['contactor_state'].isnull().sum() > 0) or (df['current'].isnull().sum() > 0) or (df['V1'].isnull().sum() > 0):
    df.fillna(method="ffill", inplace=True)

### Save refined data for module 1

In [67]:
# Save valid voltage pkl (df checkpoint)
dir_path = os.path.join(cwd, 'csv', 'module_1')
src_path = os.path.join(dir_path, 'df_phase2_refined_mod1.pkl')

df.to_pickle(src_path)

In [70]:
# load last saved df and convert time stamp and sort
dir_path = os.path.join(cwd, 'csv', 'module_1')
src_path = os.path.join(dir_path, 'df_phase2_refined_mod1.pkl')
# print(src_path)

df = pd.read_pickle(src_path)
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')
df = df.sort_values(by="timestamp")
df['timestamp'] = df['timestamp'].dt.tz_localize(None)          # remove UTC parts, i.e. remove all parts after seconds
df = add_elapsed_sec(df, 'timestamp')

NameError: name 'add_elapsed_sec' is not defined

In [326]:
# print(df['elapsed_sec'])
# print(df['elapsed_sec'].dtype)

In [262]:
# print(df.head(5))
print(df['timestamp'].duplicated().sum())   # print num of duplicates if any

seconds = df.groupby(['timestamp']).cumcount()
for s in seconds:
    if (s > 0) :
        print(seconds)

0


In [343]:
# test split_on_time_gap (10 second)
  = get_ts_records_by_timegap(df, 'timestamp', 1) # rets tuple of dfs
print(len(tu_time_divided_ts))

1259


In [260]:
# li_continuous_series contains many chunks of time series as df
# Each df in li_continuous_series has 1 Hz data collection rate
# As my supervised learning is sequential, it is necessary to separate if there is missing data


In [345]:
# Static voltage algorithm will be applied to each df of list

li_valid_cycles = []

for i_list, df_x in enumerate(tu_time_divided_ts):
    df_cntr_off = df_x.loc[df_x['contactor_state'].astype('int64') == constants.CONTACTOR_OFF]
    df_cntr_on = df_x.loc[df_x['contactor_state'].astype('int64') == constants.CONTACTOR_ON]

    if ((df_cntr_off.shape[0] > 0) and (df_cntr_on.shape[0] > 0)):
        li_valid_cycles.append(df_x.copy())
        
#     if(df_cntr_off.shape[0] > 0):
#         df_const_V = get_const_volt_df(df_cntr_off, 'V1', 'timestamp')
        
print("num_valid_dfs: " + str(len(li_valid_cycles)))

# df_test = tu_time_divided_ts[25].copy()           # must copy, otherwise the original df will change
# print(li_valid_cycles)

num_valid_dfs: 212


In [25]:
# Testing

df_test['timestamp'] = df_test['timestamp'].sort_values()

df_test['delta'] = (df['timestamp'] - df['timestamp'].iloc[0]).dt.seconds
df_test['cur_del'] = (df['current'] - df['current'].iloc[0])

# test time diff
# start_time = df['timestamp'].iloc[0]
# end_time = df['timestamp'].iloc[1]


NameError: name 'df_test' is not defined

In [348]:
# add a column to valid cycles that has time in sec (starts from zero sec)
li_valid_cycles = [add_elapsed_sec(each_df, 'timestamp') for each_df in li_valid_cycles]

In [349]:
import random

# Take randomly 10 dfs from list and plot them
len_valid_cycle = len(li_valid_cycles)
random.seed(35)           # seed for regenerating, any number for seed
li_rand1_range = random.sample(range(len_valid_cycle), 10)

random.seed(2021)
li_rand2_range = random.sample(range(len_valid_cycle), 10)

random.seed(100)
li_rand3_range = random.sample(range(len_valid_cycle), 10)

print(li_rand1_range)
print(li_rand2_range)
print(li_rand3_range)

[140, 85, 192, 33, 191, 87, 39, 73, 110, 64]
[103, 161, 139, 70, 63, 162, 8, 113, 121, 147]
[37, 117, 116, 197, 44, 180, 100, 187, 89, 110]


In [311]:
# print(li_valid_cycles[162])

In [320]:
# Shows double values along y axis

%matplotlib notebook

df_temp = li_valid_cycles[li_rand2_range[4]].copy()

df_limit = df_temp[(df_temp['elapsed_sec'] > 110 * 60) & (df_temp['elapsed_sec'] < 120 * 60)].copy()

# print(df_limit ['V1'])

print(df_limit['V1'].isnull().sum())

# plot_xy((df_limit['elapsed_sec'], "Time (s)"), (df_limit['V1'].to_numpy(dtype=np.float64), "V1"))

0


In [322]:
%matplotlib notebook
df_temp = li_valid_cycles[li_rand3_range[9]].copy()
# plot_xy((df_temp['elapsed_sec'], "Time (s)"), (df_temp['V1'], "V1"))

In [342]:
# print(df['timestamp'].is_monotonic_increasing)
# print(df['elapsed_sec'].is_monotonic_increasing)

In [356]:
%matplotlib notebook
import matplotlib
df_plot = df.copy()
# print(df_plot.head(100))
# print(df.head(100))
plot_xy((df['elapsed_sec'], "Time (s)"), (df['V1'], "V1"))

<IPython.core.display.Javascript object>

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


In [357]:
print(df.head(100))

                 timestamp  current  contactor_state  min_SOC      V1      V2  \
751358 2021-11-11 18:21:43     -0.6              2.0    64.18  3.7680  3.7698   
751359 2021-11-11 18:21:44     -0.6              2.0    64.18  3.7679  3.7698   
751360 2021-11-11 18:21:45     -0.6              2.0    64.18  3.7680  3.7698   
751361 2021-11-11 18:21:46     -0.6              2.0    64.18  3.7680  3.7698   
751362 2021-11-11 18:21:47     -0.6              2.0    64.18  3.7680  3.7698   
...                    ...      ...              ...      ...     ...     ...   
751453 2021-11-11 18:23:18     -0.6              2.0    64.17  3.7690  3.7708   
751454 2021-11-11 18:23:19     -0.6              2.0    64.17  3.7690  3.7707   
751455 2021-11-11 18:23:20     -0.6              2.0    64.17  3.7690  3.7707   
751456 2021-11-11 18:23:21     -0.5              2.0    64.17  3.7690  3.7708   
751457 2021-11-11 18:23:22     -0.6              2.0    64.16  3.7690  3.7708   

            V3      V4     

In [359]:
print(constants.STATIC_VOLTAGE_THRESHOLD)
print(constants.STATIC_VOLTAGE_MIN_DURATION_SEC)

df_temp = li_valid_cycles[li_rand3_range[7]].copy()
df_const_V = get_const_volt_df(df_temp, 'V1', 'timestamp')

# print(df_temp.head(10))
print(df_const_V)

0.003
600
get_groups_gt_static_duration called
                   begin_time            end_time   min_V   max_V  Consecutive
value_grp                                                                     
0         2022-03-02 16:20:11 2022-03-02 16:51:05  3.8454  3.8470         1855
1         2022-03-02 16:51:06 2022-03-02 17:59:01  3.8542  3.8563         4076
19        2022-03-02 18:31:14 2022-03-02 20:03:04  3.8528  3.8556         5511
18        2022-03-02 17:59:58 2022-03-02 18:31:13  3.8489  3.8518         1876


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  li_input_dfs[i_list].drop(li_input_dfs[i_list].index[0:split_iloc_id], axis=0, inplace=True)


In [None]:


# remember the index of list where contactor were both on and off

# Take randomly 10 cntr off dfs to see after how long stops

In [None]:
# Code Sayan

plt.savefig('/content/drive/MyDrive/MA/Plots/Fan_Good_bad.pdf',bbox_inches='tight', format='pdf', dpi=1000)
zuk-cyuv-taa

In [None]:
# Sayan broadband stat
def calc_feature(signal,feature):
    if feature == "power":
        coefficient = 20 * np.log10(np.std(signal) / 2e-5)
    elif feature == "crest":
        rms = np.sqrt(np.mean(np.square(signal)))
        coefficient = max(signal) / rms
    elif feature == "shape":
        rms = np.sqrt(np.mean(np.square(signal)))
        coefficient = rms / np.mean(abs(signal))
    elif feature == "rms":
        coefficient = np.sqrt(np.mean(np.square(signal)))
    elif feature == "var":
        coefficient = np.var(signal)
    elif feature == "skewness":
        coefficient =  skew(signal)
    elif feature == "kurtosis":
        coefficient = kurtosis(signal)
    return coefficient


In [70]:
%matplotlib notebook
plot_xy((df_cntr_on['timestamp'], "Time (s)"), (df_cntr_on['V1'], "V1"))

<IPython.core.display.Javascript object>

In [28]:

df_cntr_off = df

Unnamed: 0,timestamp,current,contactor_state,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,Temp_mean
0,2021-11-13 11:18:52+00:00,-0.5,2.0,3.8489,3.8498,3.8491,3.8496,3.85,3.8491,3.8491,3.8493,3.8499,3.8497,3.8487,3.8488,9.0
1,2021-11-13 11:18:53+00:00,-0.7,2.0,3.849,3.8499,3.8492,3.8497,3.85,3.8492,3.8491,3.8493,3.85,3.8498,3.8488,3.8488,9.0
2,2021-11-13 11:18:54+00:00,-1.2,2.0,3.8482,3.8492,3.8485,3.8489,3.8492,3.8485,3.8483,3.8486,3.8493,3.849,3.848,3.8482,9.0
3,2021-11-13 11:18:55+00:00,-1.3,2.0,3.848,3.849,3.8484,3.8488,3.8491,3.8484,3.8482,3.8484,3.8491,3.8488,3.8478,3.848,9.0
4,2021-11-13 11:18:56+00:00,-1.4,2.0,3.8478,3.8487,3.8481,3.8485,3.8488,3.8481,3.8479,3.8482,3.8489,3.8486,3.8476,3.8477,9.0


In [197]:
# This is just for test purpose

df1 = pd.DataFrame([[10, 20], [30, 40]], columns=list('AB'), index=['x', 'y'])
df2 = pd.DataFrame([["1", "2"], ["3", "4"]], columns=list('AB'), index=[1, 2])

df3 = pd.concat([df1, df2])

# li2 = (np.where(df['A']  < 5, True, False))
print(df1)

print(df3)

    A   B
x  10  20
y  30  40
    A   B
x  10  20
y  30  40
1   1   2
2   3   4


In [198]:
# test code, time series shift to supervised learning

df = pd.DataFrame()
df['t'] = [x for x in range(10)]
df['t-1'] = df['t'].shift(-1)
print(df)

   t  t-1
0  0  1.0
1  1  2.0
2  2  3.0
3  3  4.0
4  4  5.0
5  5  6.0
6  6  7.0
7  7  8.0
8  8  9.0
9  9  NaN


In [31]:
# test
df = pd.DataFrame({'A': [1, 1, 2, 2],
                    'B': [1, 2, 3, 4],
                    'C': np.random.randn(4)})

print(df)
df.groupby('A').agg(list)

   A  B         C
0  1  1 -1.399397
1  1  2 -0.868961
2  2  3 -0.482514
3  2  4  0.608612


Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[1, 2]","[-1.3993970984797715, -0.8689606905779259]"
2,"[3, 4]","[-0.48251423343760474, 0.6086123263838642]"
