In [1]:
import warnings
import os
import glob
import numpy as np
import pandas as pd

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# function to check the missing value
def findMissingData(df):
    total = df.isnull().sum()
    percent = total/df.shape[0]
    new_df = pd.concat([total,percent], axis = 1, keys = ['total','percent'])
    t = []
    types = df.dtypes
    for i in types:
        t.append(i)
    new_df['type'] = t
    new_df = np.transpose(new_df)
    return new_df

In [4]:
target_dir = os.path.join(os.getcwd(), 'Raw_data') # path for the raw data file folder
tag = 'tag'
ibi = 'IBI'
l = {}
col = [['ACC_X','ACC_Y', 'ACC_Z'], ['BVP'], ['EDA'], ['HR'], ['Time_Interval', 'IBI'], ['TEMP']]
n2 = 2
freq = [32, 64, 4, 1,'unknown', 4]
for i in os.listdir(target_dir):
    n1 = 1
    data_folder = os.path.join(target_dir, i) #  path for each participant folder
    if os.path.isdir(data_folder):
        target_file = glob.glob(os.path.join(data_folder, '*.csv')) # find csv file
        target_file = [file for file in target_file if tag not in os.path.basename(file)] # ignore the tag csv
        for j in target_file:
            if ibi in os.path.basename(j):
                data = pd.read_csv(j, skiprows = 0)
            else:
                data = pd.read_csv(j, skiprows = 1)
                data = data.groupby(data.index//freq[n1-1])[data.columns].mean() # avg the data
            data.columns = col[n1-1]
            if n2 in l:
                l[n2].append(data)
            else:
                l[n2] = []
                l[n2].append(data)
            n1 += 1
        n2 += 1

In [5]:
def structure(n):
    k = 0
    for i in l[n]:
        print(col[k],i.shape)
        k += 1

In [6]:
# Find the pattern for based on HR by looking at the data structure for 3 participants
# The data in raw data has been averaged, which means they are in the same frequency(1 hz)
# total = end - start + 1 
# null = n
# formula = total - null
print(f'The data structure in averaged raw data for participant {2}:')
structure(3)
print(f'The number of HR data in preprocessed data for participant {2}: {6867-3559 + 1 - 1}') # contains 1 missing value
print()
print(f'The data structure in averaged raw data for participant {19}:')
structure(19)
print(f'The number of HR data in preprocessed data for participant {19}: {59988-56804 + 1 - 2}') # contains 2 missing value
print()
print(f'The data structure in averaged raw data for participant {20}:')
structure(20)
print(f'The number of HR data in preprocessed data for participant {20}: {63376-59989 + 1 - 2}') # contains 2 missing value

The data structure in averaged raw data for participant 2:
['ACC_X', 'ACC_Y', 'ACC_Z'] (3319, 3)
['BVP'] (3318, 1)
['EDA'] (3317, 1)
['HR'] (3308, 1)
['Time_Interval', 'IBI'] (1290, 2)
['TEMP'] (3320, 1)
The number of HR data in preprocessed data for participant 2: 3308

The data structure in averaged raw data for participant 19:
['ACC_X', 'ACC_Y', 'ACC_Z'] (3194, 3)
['BVP'] (3194, 1)
['EDA'] (3194, 1)
['HR'] (3183, 1)
['Time_Interval', 'IBI'] (1530, 2)
['TEMP'] (3192, 1)
The number of HR data in preprocessed data for participant 19: 3183

The data structure in averaged raw data for participant 20:
['ACC_X', 'ACC_Y', 'ACC_Z'] (3396, 3)
['BVP'] (3397, 1)
['EDA'] (3398, 1)
['HR'] (3386, 1)
['Time_Interval', 'IBI'] (1200, 2)
['TEMP'] (3394, 1)
The number of HR data in preprocessed data for participant 20: 3386


In [7]:
preprocess_data = pd.read_csv('Improved_All_Combined_hr_rsp_binary.csv')

In [8]:
findMissingData(preprocess_data)

Unnamed: 0,Participant,HR,respr,Time(sec),Label
total,0,44,0,0,0
percent,0.0,0.000391,0.0,0.0,0.0
type,int64,float64,float64,int64,int64


In [9]:
# null value index
preprocess_data[preprocess_data.isnull().any(axis=1)].index

Index([  3555,   3556,   6865,  10395,  10396,  13665,  16975,  20346,  23480,
        23481,  23482,  26659,  26660,  29689,  32959,  36292,  39629,  43196,
        46646,  50083,  50084,  53387,  56800,  56801,  59985,  59986,  63373,
        63374,  66592,  69647,  72727,  75996,  79200,  82431,  85801,  89099,
        92385,  95713,  99011,  99012, 102211, 105438, 105439, 109000],
      dtype='int64')

In [10]:
preprocess_data.shape

(112516, 5)

In [11]:
preprocess_data.dropna(inplace = True)
preprocess_data.reset_index(drop = True, inplace = True)

In [12]:
preprocess_data.shape

(112472, 5)

In [13]:
# remove the extra rows in dataframe
for i in l:
    for j in range(6):
        if j < 3 or j > 4:
            l[i][j] = l[i][j].iloc[:len(l[i][3])]

In [14]:
structure(19)

['ACC_X', 'ACC_Y', 'ACC_Z'] (3183, 3)
['BVP'] (3183, 1)
['EDA'] (3183, 1)
['HR'] (3183, 1)
['Time_Interval', 'IBI'] (1530, 2)
['TEMP'] (3183, 1)


In [15]:
# remove IBI data
for i in l:
    del l[i][4]

In [16]:
structure(19)

['ACC_X', 'ACC_Y', 'ACC_Z'] (3183, 3)
['BVP'] (3183, 1)
['EDA'] (3183, 1)
['HR'] (3183, 1)
['Time_Interval', 'IBI'] (3183, 1)


In [17]:
# concat, axis = 1
concat_data = []
for i in l:
    concat_data.append(pd.concat(l[i], axis = 1))

In [18]:
len(concat_data)

34

In [19]:
row_number = 0
for i in concat_data:
    row_number += i.shape[0]

In [20]:
row_number # the total row number of averaged raw data

112470

In [21]:
preprocess_data.shape[0] # the total row number of preprocess data

112472

In [22]:
# there are 2 more data in preprocess data
# by comparing(HR) the head and tail of them, I found that preprocess_data contains two more extra data, other HR data is consistent

In [23]:
concat_data[0].head(10)

Unnamed: 0,ACC_X,ACC_Y,ACC_Z,BVP,EDA,HR,TEMP
0,2.59375,4.0625,61.28125,15.76,0.566449,118.0,34.79
1,-4.46875,6.5,63.15625,-41.960313,0.654746,113.5,34.79
2,-4.125,5.28125,63.65625,26.097031,0.664677,93.0,34.79
3,-7.625,5.46875,63.15625,4.685781,0.677812,93.25,34.68
4,-14.59375,5.875,62.28125,-7.058125,0.6855,86.4,34.66
5,-16.46875,6.34375,61.875,4.653594,0.697994,81.83,34.61
6,-14.03125,-0.53125,58.75,2.974531,0.692548,79.71,34.47
7,-33.15625,9.84375,58.3125,19.223125,0.616945,78.12,34.41
8,-28.46875,6.09375,57.1875,-5.9,0.617585,76.67,34.45
9,-29.5,5.65625,56.9375,-29.6425,0.618867,75.6,34.75


In [24]:
preprocess_data.head(10)

Unnamed: 0,Participant,HR,respr,Time(sec),Label
0,2,118.0,12.127693,1644227583,0
1,2,113.5,12.127693,1644227584,0
2,2,93.0,12.127693,1644227585,0
3,2,93.25,12.127693,1644227586,0
4,2,86.4,12.127693,1644227587,0
5,2,81.83,12.127693,1644227588,0
6,2,79.71,12.127693,1644227589,0
7,2,78.12,12.127693,1644227590,0
8,2,76.67,12.127693,1644227591,0
9,2,75.6,12.127693,1644227592,0


In [25]:
concat_data[33].tail(10)

Unnamed: 0,ACC_X,ACC_Y,ACC_Z,BVP,EDA,HR,TEMP
3503,-32.625,-1.5625,55.0625,9.633281,0.239876,66.22,32.27
3504,-32.96875,-1.875,54.84375,-35.441875,0.237953,65.38,32.25
3505,-32.59375,-1.15625,55.0,11.952969,0.237313,65.32,32.23
3506,-32.15625,-0.4375,55.25,6.491094,0.235391,65.22,32.25
3507,-33.5625,-4.65625,54.25,-9.8,0.235391,65.28,32.23
3508,-32.21875,-1.5625,55.25,3.337656,0.232188,65.37,32.25
3509,-32.28125,-1.8125,55.125,7.774687,0.230906,65.42,32.23
3510,-31.59375,0.1875,55.5,-3.985625,0.228664,65.45,32.23
3511,-36.0,1.1875,53.3125,-28.008594,0.228344,65.47,32.23
3512,-31.5,-2.6875,55.8125,35.968281,0.224179,65.5,32.21


In [26]:
preprocess_data.tail(10)

Unnamed: 0,Participant,HR,respr,Time(sec),Label
112462,35,65.32,11.674524,1646842238,0
112463,35,65.22,11.648924,1646842239,0
112464,35,65.28,11.623324,1646842240,0
112465,35,65.37,11.597724,1646842241,0
112466,35,65.42,11.572124,1646842242,0
112467,35,65.45,11.546524,1646842243,0
112468,35,65.47,11.520924,1646842244,0
112469,35,65.5,11.495324,1646842245,0
112470,35,65.5,11.466879,1646842246,0
112471,35,65.5,11.441279,1646842247,0


In [27]:
preprocess_data = preprocess_data.iloc[:row_number]

In [28]:
preprocess_data.tail(10)

Unnamed: 0,Participant,HR,respr,Time(sec),Label
112460,35,66.22,11.725724,1646842236,0
112461,35,65.38,11.700124,1646842237,0
112462,35,65.32,11.674524,1646842238,0
112463,35,65.22,11.648924,1646842239,0
112464,35,65.28,11.623324,1646842240,0
112465,35,65.37,11.597724,1646842241,0
112466,35,65.42,11.572124,1646842242,0
112467,35,65.45,11.546524,1646842243,0
112468,35,65.47,11.520924,1646842244,0
112469,35,65.5,11.495324,1646842245,0


In [29]:
# concat axis = 0
#combine_data = pd.concat(concat_data, axis = 0)

In [30]:
# add label
z = 2
for i in concat_data:
    i['respr'] = preprocess_data[preprocess_data['Participant'] == z].reset_index(drop = True)['respr']
    i['label'] = preprocess_data[preprocess_data['Participant'] == z].reset_index(drop = True)['Label']
    i['ID'] = preprocess_data[preprocess_data['Participant'] == z].reset_index(drop = True)['Participant']
    i['Time'] = preprocess_data[preprocess_data['Participant'] == z].reset_index(drop = True)['Time(sec)']
    z += 1

In [31]:
# concat axis = 0
final_data = pd.concat(concat_data, axis = 0)
final_data.reset_index(drop = True, inplace = True)

In [32]:
final_data.shape

(112470, 11)

In [33]:
findMissingData(final_data)

Unnamed: 0,ACC_X,ACC_Y,ACC_Z,BVP,EDA,HR,TEMP,respr,label,ID,Time
total,0,0,0,0,0,0,0,0,0,0,0
percent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
type,float64,float64,float64,float64,float64,float64,float64,float64,int64,int64,int64


In [34]:
final_data.head()

Unnamed: 0,ACC_X,ACC_Y,ACC_Z,BVP,EDA,HR,TEMP,respr,label,ID,Time
0,2.59375,4.0625,61.28125,15.76,0.566449,118.0,34.79,12.127693,0,2,1644227583
1,-4.46875,6.5,63.15625,-41.960313,0.654746,113.5,34.79,12.127693,0,2,1644227584
2,-4.125,5.28125,63.65625,26.097031,0.664677,93.0,34.79,12.127693,0,2,1644227585
3,-7.625,5.46875,63.15625,4.685781,0.677812,93.25,34.68,12.127693,0,2,1644227586
4,-14.59375,5.875,62.28125,-7.058125,0.6855,86.4,34.66,12.127693,0,2,1644227587


In [35]:
final_data.tail()

Unnamed: 0,ACC_X,ACC_Y,ACC_Z,BVP,EDA,HR,TEMP,respr,label,ID,Time
112465,-32.21875,-1.5625,55.25,3.337656,0.232188,65.37,32.25,11.597724,0,35,1646842241
112466,-32.28125,-1.8125,55.125,7.774687,0.230906,65.42,32.23,11.572124,0,35,1646842242
112467,-31.59375,0.1875,55.5,-3.985625,0.228664,65.45,32.23,11.546524,0,35,1646842243
112468,-36.0,1.1875,53.3125,-28.008594,0.228344,65.47,32.23,11.520924,0,35,1646842244
112469,-31.5,-2.6875,55.8125,35.968281,0.224179,65.5,32.21,11.495324,0,35,1646842245


In [36]:
#final_data.to_csv(os.path.join(os.getcwd(), 'Raw_data\\final_data.csv'))

In [37]:
# check whether the value of each HR data in the final data is same as the HR data in original preprocess data 
check_correspond = True
for i in range(final_data.shape[0]):
    if final_data['HR'][i] != preprocess_data['HR'][i]:
        check_correspond = False
check_correspond

True