In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd

from scipy import signal
import pywt

import os
import time
import datetime
import random
import h5py

from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
keras = tf.keras
from tensorflow.keras import datasets, layers, models
import tensorflow_io as tfio
from tensorflow.keras.callbacks import History

from platform import python_version
# Record the interpreter version next to the results so the run can be reproduced.
print(python_version())

3.8.8


In [2]:
def readData(accDir, annotFile):
    """Load the per-subject accelerometer CSV files and the annotation workbook.

    Parameters
    ----------
    accDir : str
        Directory containing the accelerometer CSV files. A trailing path
        separator is accepted but no longer required.
    annotFile : str
        Path of the Excel workbook; every sheet is read (``sheet_name=None``).

    Returns
    -------
    empatica_dict : dict
        Maps the integer parsed from each CSV file name to the 2-D array
        returned by ``np.genfromtxt`` for that file.
    annot_dict : dict
        Maps each sheet name to its DataFrame with every row that contains a
        NaN removed (some subjects are missing the second ladder task).
    """
    empatica_dict = dict()
    for f in os.listdir(accDir):
        # Same filter as before: anything whose name ends in 'csv'.
        if f[-3:] != 'csv':
            continue
        # os.path.join works with or without a trailing separator on accDir,
        # whereas plain string concatenation silently built a wrong path.
        data = np.genfromtxt(os.path.join(accDir, f), delimiter=',')
        # NOTE: str.strip treats "ACC.csv" as a *set of characters* removed from
        # both ends, not as a suffix. This is safe only because the remaining
        # subject id consists of digits, none of which are in that set.
        key = int(float(f.strip("ACC.csv")))
        empatica_dict[key] = data
    tmp = pd.read_excel(annotFile, sheet_name=None)
    annot_dict = {sheet: frame.dropna() for sheet, frame in tmp.items()}
    return empatica_dict, annot_dict

def getLabeledDict(empatica_dict, annot_dict, subject_ids):
    """Attach a task label to every accelerometer sample of each subject.

    For each subject the value at row 0, column 0 of its array is used as the
    recording start time and rows 2 onward as the samples. Annotation task
    names have their ``_start`` / ``_end`` suffixes removed; annotation
    timestamps (two-digit year, expanded by inserting ``'20'``) are converted
    with ``time.mktime``, i.e. interpreted in the machine's local time zone.
    Consecutive (even, odd) annotation rows delimit one labelled span.

    Relies on the notebook-level global ``SR`` to turn seconds into sample
    indices.

    Parameters
    ----------
    empatica_dict : dict
        Subject id -> 2-D accelerometer array.
    annot_dict : dict
        ``'P<id>'`` -> DataFrame with ``taskName`` and ``startTime_global``.
    subject_ids : iterable
        Subject ids to process.

    Returns
    -------
    labeled_dict : dict
        Subject id -> DataFrame with columns X, Y, Z, Mag, label; ``label`` is
        None for samples outside every annotated span.
    taskInd_dict : dict
        Subject id -> list of sample indices, one per annotation row.
    """
    labeled_dict = {}
    taskInd_dict = {}
    for subject in subject_ids:
        recording = empatica_dict[subject]
        start_time = int(recording[0, 0])
        acc = recording[2:, :]

        annotations = annot_dict['P' + str(subject)]
        task_names = [
            name.replace("_end", "").replace("_start", "")
            for name in annotations.taskName.tolist()
        ]

        task_epochs = []
        for stamp in annotations.startTime_global.tolist():
            # Expand the two-digit year so it matches the %Y directive.
            parsed = datetime.datetime.strptime(stamp[:6] + '20' + stamp[6:], "%m/%d/%Y %H:%M:%S")
            task_epochs.append(time.mktime(parsed.timetuple()))

        # Whole seconds since recording start, scaled to a sample index.
        task_ind = [int(epoch - start_time) * SR for epoch in task_epochs]
        taskInd_dict[subject] = task_ind

        # Object array starts out filled with None (= unlabelled).
        sample_labels = np.empty(acc.shape[0], dtype=object)
        spans = zip(task_names[0::2], task_ind[0::2], task_ind[1::2])
        for task, first, last in spans:
            sample_labels[first:last] = task

        acc_mag = np.sqrt((acc ** 2).sum(axis=1)).reshape(-1, 1)
        accel = np.concatenate((acc, acc_mag), axis=1)
        table = np.concatenate((accel, sample_labels.reshape(-1, 1)), axis=1)
        labeled_dict[subject] = pd.DataFrame(table, columns=['X', 'Y', 'Z', 'Mag', 'label'])
    return labeled_dict, taskInd_dict

In [3]:
# Load every subject's accelerometer CSV plus the per-subject annotation sheets.
sepAccDict, sepAnnotDict = readData(accDir='./Data/50_subs/Acc Data/separate/', annotFile='./Data/50_subs/Annotation Data/separate.xlsx')
# Sampling rate: read from row 1, column 0 of subject 8's array and reused for all
# subjects (getLabeledDict treats row 0 as the start time and rows 2+ as samples).
SR=int(sepAccDict[8][1,0])

# Subject ids 8..44 inclusive.
sepSubIDs = list(range(8,45))
# sepSubIDs.remove(27) # does not have lift
# Per-sample labelled frames and the annotation sample indices for each subject.
sepLabeledDict_, sepTaskIndDict = getLabeledDict(sepAccDict, sepAnnotDict, sepSubIDs)

## Apply Low Pass Filter

In [4]:
# Apply Filter on All Subjects
n=4; fc=2; w=fc/(SR/2)
b, a = signal.butter(n, w, 'low')
sepLabeledDict_filtered = dict(map(lambda key: (key, signal.filtfilt(b, a, x=sepLabeledDict_[key].drop(columns='label'), axis=0)), sepLabeledDict_.keys()))
# back to DF and add label
sepLabeledDict_filtered_dfs = dict(map(lambda key: (
                                                        key, pd.DataFrame(sepLabeledDict_filtered[key],columns=['X', 'Y', 'Z', 'Mag']).assign(label=sepLabeledDict_[key].label)
                                                    ), sepLabeledDict_filtered.keys()))
# Remove data without label
filt_noNA_dict = dict(map(lambda key: (key, sepLabeledDict_filtered_dfs[key].dropna()), sepLabeledDict_filtered_dfs.keys()))

In [5]:
# Cut each subject's labelled signal into non-overlapping windows of winLen
# samples, separately per label; a trailing partial window is discarded.
winLen = 320
window_dict = {}
label_dict = {}
for key, subject_df in filt_noNA_dict.items():
    window_list = []
    labels = []
    for g1, df1 in subject_df.groupby('label'):
        # Only the X/Y/Z columns are kept as model input.
        features = df1.drop(columns=['Mag', 'label'])
        # Start offsets of every complete window within this label's rows.
        for start in range(0, df1.shape[0] - winLen + 1, winLen):
            window_list.append(features.iloc[start:start + winLen])
            labels.append(g1)
    window_dict[key] = np.array(window_list)
    label_dict[key] = labels

In [6]:
# Sanity check: container type and array shape of subject 8's windows.
type(window_dict[8]), window_dict[8].shape

(numpy.ndarray, (187, 320, 3))

# Train Test Split

In [10]:
# Subject-wise split: whole subjects go either to train or to test, so no
# subject contributes windows to both sets. Seeded for repeatability.
random.seed(2021)
percentTrain = 80
all_subs = list(label_dict)
train_subs = random.sample(all_subs, k=int(len(all_subs)*(percentTrain/100)))
test_subs = list(set(all_subs) - set(train_subs))

# Stack the per-subject window arrays along the sample axis.
train_array_list = [window_dict[sub] for sub in train_subs]
test_array_list = [window_dict[sub] for sub in test_subs]
train_np = np.concatenate(train_array_list)
test_np = np.concatenate(test_array_list)

# Labels: per-subject lists -> one flat list -> '1'/'2' characters removed
# (so that e.g. 'ladder1' and 'ladder2' collapse into 'ladder').
train_label__ = [label_dict[sub] for sub in train_subs]
train_label_ = [lab for sub_labels in train_label__ for lab in sub_labels]
train_label = [lab.replace('1', '').replace('2', '') for lab in train_label_]

test_label__ = [label_dict[sub] for sub in test_subs]
test_label_ = [lab for sub_labels in test_label__ for lab in sub_labels]
test_label = [lab.replace('1', '').replace('2', '') for lab in test_label_]

In [8]:
# Number of subjects in the train split, the test split and overall.
len(train_subs), len(test_subs), len(all_subs)

(29, 8, 37)

In [9]:
# List the subject ids of each split (sorted for easy visual comparison).
print(sorted(train_subs))
print(sorted(test_subs))
print(sorted(all_subs))

[9, 10, 11, 12, 13, 15, 16, 17, 18, 22, 23, 24, 25, 26, 27, 28, 30, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]
[8, 14, 19, 20, 21, 29, 31, 32]
[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]


In [7]:
# random.seed(2021)
# percent_split = 80
# all_subs = list(label_dict.keys())
# non_test_subs = random.sample(all_subs, k=int(len(all_subs)*(percent_split/100)))
# test_subs = list(set(all_subs) - set(non_test_subs))
# train_subs = random.sample(non_test_subs, k=int(len(non_test_subs)*(percent_split/100)))
# val_subs = list(set(non_test_subs) - set(train_subs))

# with open('test_subs.pickle', 'wb') as outfile:
#     pickle.dump(test_subs, outfile)

# train_array_list = [window_dict[key] for key in train_subs]
# val_array_list = [window_dict[key] for key in val_subs]
# train_np = np.concatenate(train_array_list)
# val_np = np.concatenate(val_array_list)

# train_label__ = [label_dict[key] for key in train_subs]
# train_label_ = [item for sublist in train_label__ for item in sublist]
# train_label = [item.replace('1', '').replace('2', '') for item in train_label_]
# val_label__ = [label_dict[key] for key in val_subs]
# val_label_ = [item for sublist in val_label__ for item in sublist]
# val_label = [item.replace('1', '').replace('2', '') for item in val_label_]

In [12]:
# Nested label list: outer type/length (one entry per training subject), then
# type/length of the first subject's label list.
type(train_label__), len(train_label__), type(train_label__[0]), len(train_label__[0])

(list, 28, list, 232)

In [14]:
# Flattened label list: one entry per training window.
type(train_label_), len(train_label_)

(list, 6070)

In [15]:
# Distinct training labels before the '1'/'2' characters are removed.
set(train_label_)

{'electricPanel',
 'hoist',
 'ladder1',
 'ladder2',
 'lift',
 'overhead',
 'push',
 'sit',
 'stand',
 'type',
 'walk'}

In [16]:
# Distinct training labels after the '1'/'2' characters are removed.
set(train_label)

{'electricPanel',
 'hoist',
 'ladder',
 'lift',
 'overhead',
 'push',
 'sit',
 'stand',
 'type',
 'walk'}

In [8]:
# Shape of the stacked training windows.
train_np.shape

(6070, 320, 3)

# Extract Continuous Wavelet Features

In [7]:
# start_time = time.time()

# SR = 1/32
# scales = range(1,winLen)
# waveletname = 'morl'

# with h5py.File('data1.hdf5', 'w') as hf:
#     hf.create_dataset('data_train', (train_np.shape[0], winLen-1, winLen-1, train_np.shape[2]), np.float)
#     for i in range(train_np.shape[0]):
#         if i % 1000 == 0:
#             print(i)
#         for j in range(train_np.shape[2]):
#             sig = train_np[i,:,j]
#             coeff, freq = pywt.cwt(sig, scales, waveletname, SR)
#             hf["data_train"][i, :, :, j] = coeff[:, :-1]
            
#     hf.create_dataset('data_test', (test_np.shape[0], winLen-1, winLen-1, test_np.shape[2]), np.float)
#     for i in range(test_np.shape[0]):
#         if i % 1000 == 0:
#             print(i)
#         for j in range(test_np.shape[2]):
#             sig = test_np[i,:,j]
#             coeff, freq = pywt.cwt(sig, scales, waveletname, SR)
#             hf["data_test"][i, :, :, j] = coeff[:, :-1]
            
# print('elapsed time = {}'.format(time.time() - start_time))

0
1000
2000
3000
4000
5000
6000
0
1000
elapsed time = 1542.5610084533691


In [21]:
# Fit ONE encoder on the training labels and reuse it for the test labels so both
# splits share the same label -> integer mapping. Fitting a second encoder on the
# test labels (as before) only yields matching codes when both splits happen to
# contain exactly the same set of labels; otherwise the test targets are silently
# shifted. With a shared encoder an unseen test label raises ValueError instead.
label_encoder = LabelEncoder()
y_train_integer_encoded = label_encoder.fit_transform(train_label)
y_test_integer_encoded = label_encoder.transform(test_label)

# One-hot width taken from the fitted encoder rather than a hard-coded 10.
n_label_classes = len(label_encoder.classes_)
y_train = keras.utils.to_categorical(y_train_integer_encoded, n_label_classes)
y_test = keras.utils.to_categorical(y_test_integer_encoded, n_label_classes)
# with h5py.File('data_label.hdf5', 'w') as hf:
#     hf['data_train'] = y_train
#     hf['data_test'] = y_test

In [53]:
# Compare the distinct labels present in the train and test splits.
print(set(train_label))
print(set(test_label))

{'lift', 'stand', 'type', 'walk', 'overhead', 'push', 'electricPanel', 'ladder', 'hoist', 'sit'}
{'lift', 'stand', 'type', 'walk', 'overhead', 'push', 'electricPanel', 'ladder', 'hoist', 'sit'}


In [35]:
# Scratch check of LabelEncoder: equal values get the same integer code and the
# codes are consecutive starting at 0.
aa = ['1', '2', '3','3', '4', '5']

aa_encoded = LabelEncoder().fit_transform(aa)
aa_encoded

array([0, 1, 2, 2, 3, 4], dtype=int64)

In [41]:
# Scratch check with mixed str/int entries: in the displayed result '3' and 3
# (and '1' and 1) end up with the same code.
bb = ['3', '4', '5', '1', 'blah', 'a', '0', 'blah2', 23, 3, 1]
bb_encoded = LabelEncoder().fit_transform(bb)
bb_encoded

array([3, 4, 5, 1, 7, 6, 0, 8, 2, 3, 1], dtype=int64)

In [50]:
# One-hot encode the scratch codes. The width passed here is the number of
# samples (len(bb_encoded)), not the number of distinct codes.
keras.utils.to_categorical(bb_encoded, len(bb_encoded))

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [49]:
# Number of distinct codes in the scratch example.
len(np.unique(bb_encoded))

9

In [23]:
# The one-hot matrix must have one row per training label.
y_train.shape, len(train_label)

((6070, 10), 6070)

In [26]:
# Only the last expression of a cell is displayed, so the (type, shape) tuple on
# the first line is evaluated but not shown; the integer codes themselves are.
type(y_train_integer_encoded), y_train_integer_encoded.shape
y_train_integer_encoded

array([0, 0, 0, ..., 9, 9, 9], dtype=int64)

In [28]:
# View the one-hot training targets as a table (one column per class index).
pd. DataFrame(y_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
6065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [66]:
# Open the scalogram file in a context manager so it is closed even if a lookup
# or the read below raises (previously the handle leaked in that case).
with h5py.File('data1.hdf5', 'r') as h:
    aa_ = h.get('data_train')
    # Full slice: copies the whole training dataset into memory as a NumPy array.
    aa = aa_[:,:,:,:]
    # train_h5 = h['data_train']
    test_h5 = h.get('data_test')
    # test_h5 = h['data_test']
# NOTE: aa_ and test_h5 are handles into the (now closed) file and can no longer
# be read from; only aa, the in-memory copy, remains usable after this cell.

In [69]:
# aa_ was obtained from a file that has since been closed, so reading its shape
# raises ValueError ("Not a dataset").
type(aa_), aa_.shape

ValueError: Not a dataset (not a dataset)

In [68]:
# Drop the name bound to the large in-memory array so its memory can be reclaimed.
del aa

In [63]:
# Spot-check a single element of the 4-D array.
aa[0,0,10,2]

-0.08871858628964163

In [35]:
# Shape of the training dataset on disk. Requires train_h5 to come from a file
# that is still open.
train_h5.shape

(6070, 319, 319, 3)

In [38]:
# Pre-allocate an array with the same layout as the training scalograms.
# np.float was merely an alias of the builtin float (deprecated in NumPy 1.20,
# removed in 1.24); np.float64 produces the identical dtype on every version.
aa = np.zeros(shape=(train_np.shape[0], winLen-1, winLen-1, train_np.shape[2]), dtype=np.float64)

In [40]:
# Confirm the element type of the pre-allocated array.
aa.dtype

dtype('float64')

In [None]:
# f, ax = plt.subplots(1,figsize=(7,7))
# ax.imshow(train_h5[0,:,:,:])#, cmap=plt.cm.seismic)

# CNN Tensorflow

In [41]:
# Open the scalogram file and leave it open: train_h5 is a handle into the file
# and is only readable while h stays open (hence close() is commented out below).
h = h5py.File('data1.hdf5', 'r')
train_h5 = h.get('data_train')
# Per-sample dimensions are axes 1..3 of the dataset; axis 0 indexes the samples.
img_x = train_h5.shape[1]
img_y = train_h5.shape[2]
img_z = train_h5.shape[3]
input_shape = (img_x, img_y, img_z)
# h.close()

In [46]:
# Same per-sample shape obtained in one step by slicing off the sample axis.
inputshape = train_h5.shape[1:]
inputshape

(319, 319, 3)

In [47]:
# Per-sample shape assembled from img_x, img_y and img_z.
input_shape

(319, 319, 3)

In [14]:
history = History()

# Read only the per-sample scalogram shape. The context manager guarantees the
# file is closed even if the dataset lookup or the shape access raises.
with h5py.File('data1.hdf5', 'r') as h:
    train_h5 = h.get('data_train')
    img_x = train_h5.shape[1]
    img_y = train_h5.shape[2]
    img_z = train_h5.shape[3]
input_shape = (img_x, img_y, img_z)

num_classes = len(set(train_label))
batch_size = 20
epochs = 10

# Stream the scalograms from disk instead of loading them into memory.
x_train = tfio.IODataset.from_hdf5('data1.hdf5', dataset='/data_train')
x_test = tfio.IODataset.from_hdf5('data1.hdf5', dataset='/data_test')

# NOTE: this rebinds y_train / y_test from the in-memory one-hot arrays to
# HDF5-backed datasets. 'data_label.hdf5' must already exist on disk; the code
# that writes it is commented out in the label-encoding cell.
y_train = tfio.IODataset.from_hdf5('data_label.hdf5', dataset='/data_train')
y_test = tfio.IODataset.from_hdf5('data_label.hdf5', dataset='/data_test')

# Pair inputs with targets and batch them; incomplete final batches are dropped.
# NOTE(review): there is no shuffle step, so samples are consumed in file order.
train = tf.data.Dataset.zip((x_train,y_train)).batch(batch_size, drop_remainder=True)#.prefetch(tf.data.experimental.AUTOTUNE)
val = tf.data.Dataset.zip((x_test,y_test)).batch(batch_size, drop_remainder=True)#.prefetch(tf.data.experimental.AUTOTUNE)

# Two convolution + max-pooling stages followed by a dense classifier head.
model = models.Sequential()
model.add(layers.Conv2D(32, kernel_size=(5, 5), strides=(1, 1), activation='relu', input_shape=input_shape))
model.add(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(layers.Conv2D(64, (5, 5), activation='relu'))
model.add(layers.MaxPooling2D(pool_size=(2, 2)))

model.add(layers.Flatten())
model.add(layers.Dense(100, activation='relu'))
# Softmax output with one unit per class, matching the one-hot targets.
model.add(layers.Dense(num_classes, activation='softmax'))

In [15]:
# Categorical cross-entropy pairs with the one-hot targets and softmax output;
# accuracy is tracked as the reporting metric.
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(), metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 315, 315, 32)      2432      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 157, 157, 32)      0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 153, 153, 64)      51264     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 76, 76, 64)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 369664)            0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               36966500  
_________________________________________________________________
dense_3 (Dense)              (None, 10)               

In [16]:
# Train for `epochs` passes, evaluating on the held-out subjects after each one.
model.fit(train, epochs=epochs, validation_data=val, verbose=1, callbacks=[history])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x199e4c4f8b0>

## Try to Avert Overfitting 1

In [17]:
history = History()

# Read only the per-sample scalogram shape. The context manager guarantees the
# file is closed even if the dataset lookup or the shape access raises.
with h5py.File('data1.hdf5', 'r') as h:
    train_h5 = h.get('data_train')
    img_x = train_h5.shape[1]
    img_y = train_h5.shape[2]
    img_z = train_h5.shape[3]
input_shape = (img_x, img_y, img_z)

num_classes = len(set(train_label))
batch_size = 20
epochs = 10

# Stream the scalograms and their one-hot targets from disk.
x_train = tfio.IODataset.from_hdf5('data1.hdf5', dataset='/data_train')
x_test = tfio.IODataset.from_hdf5('data1.hdf5', dataset='/data_test')

y_train = tfio.IODataset.from_hdf5('data_label.hdf5', dataset='/data_train')
y_test = tfio.IODataset.from_hdf5('data_label.hdf5', dataset='/data_test')

# Same pipeline as the first CNN cell, with prefetching enabled. Prefetching
# changes input throughput only; it is not a regularisation measure.
# NOTE(review): there is no shuffle step, so samples are consumed in file order.
train = tf.data.Dataset.zip((x_train,y_train)).batch(batch_size, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)
val = tf.data.Dataset.zip((x_test,y_test)).batch(batch_size, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)

# Architecture unchanged from the first CNN cell: two convolution + max-pooling
# stages followed by a dense classifier head (no dropout or weight penalties).
model = models.Sequential()
model.add(layers.Conv2D(32, kernel_size=(5, 5), strides=(1, 1), activation='relu', input_shape=input_shape))
model.add(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(layers.Conv2D(64, (5, 5), activation='relu'))
model.add(layers.MaxPooling2D(pool_size=(2, 2)))

model.add(layers.Flatten())
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dense(num_classes, activation='softmax'))

In [None]:
# Categorical cross-entropy pairs with the one-hot targets and softmax output;
# accuracy is tracked as the reporting metric.
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(), metrics=['accuracy'])
model.summary()

In [19]:
# Train with the prefetching pipeline, validating on the held-out subjects.
model.fit(train, epochs=epochs, validation_data=val, verbose=1, callbacks=[history])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x199e7f95f10>

## Try to Avert Overfitting 2

In [20]:
# NOTE: the fit call that follows this cell has its callbacks argument commented
# out, so this History instance is not attached to that run.
history = History()

# Read only the per-sample scalogram shape. The context manager guarantees the
# file is closed even if the dataset lookup or the shape access raises.
with h5py.File('data1.hdf5', 'r') as h:
    train_h5 = h.get('data_train')
    img_x = train_h5.shape[1]
    img_y = train_h5.shape[2]
    img_z = train_h5.shape[3]
input_shape = (img_x, img_y, img_z)

num_classes = len(set(train_label))
batch_size = 20
epochs = 10

# Stream the scalograms and their one-hot targets from disk.
x_train = tfio.IODataset.from_hdf5('data1.hdf5', dataset='/data_train')
x_test = tfio.IODataset.from_hdf5('data1.hdf5', dataset='/data_test')

y_train = tfio.IODataset.from_hdf5('data_label.hdf5', dataset='/data_train')
y_test = tfio.IODataset.from_hdf5('data_label.hdf5', dataset='/data_test')

# Batched, prefetching input pipelines; incomplete final batches are dropped.
# NOTE(review): there is no shuffle step, so samples are consumed in file order.
train = tf.data.Dataset.zip((x_train,y_train)).batch(batch_size, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)
val = tf.data.Dataset.zip((x_test,y_test)).batch(batch_size, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)

# Two convolution + max-pooling stages followed by a dense classifier head.
model = models.Sequential()
model.add(layers.Conv2D(32, kernel_size=(5, 5), strides=(1, 1), activation='relu', input_shape=input_shape))
model.add(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(layers.Conv2D(64, (5, 5), activation='relu'))
model.add(layers.MaxPooling2D(pool_size=(2, 2)))

model.add(layers.Flatten())
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dense(num_classes, activation='softmax'))

In [71]:
# Same loss and optimiser as the earlier runs, but AUC is tracked instead of accuracy.
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(), metrics=['AUC'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 315, 315, 32)      2432      
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 157, 157, 32)      0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 153, 153, 64)      51264     
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 76, 76, 64)        0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 369664)            0         
_________________________________________________________________
dense_6 (Dense)              (None, 100)               36966500  
_________________________________________________________________
dense_7 (Dense)              (None, 10)               

In [72]:
# Train without the explicit History callback (left commented out at the end of the line).
model.fit(train, epochs=epochs, validation_data=val, verbose=1)#, callbacks=[history])

Epoch 1/10
      8/Unknown - 21s 2s/step - loss: 0.1996 - auc: 0.9970

KeyboardInterrupt: 

In [32]:
# Persist whichever model is currently bound to `model` under ./tf_model/.
model.save('./tf_model/')

INFO:tensorflow:Assets written to: ./tf_model/assets
