In [1]:
import warnings
import os
import glob
import numpy as np
import pandas as pd

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# function to check the missing value
def findMissingData(df):
    total = df.isnull().sum()
    percent = total/df.shape[0]
    new_df = pd.concat([total,percent], axis = 1, keys = ['total','percent'])
    t = []
    types = df.dtypes
    for i in types:
        t.append(i)
    new_df['type'] = t
    new_df = np.transpose(new_df)
    return new_df

In [4]:
target_dir = os.path.join(os.getcwd(), 'Physiological signals') # path for the raw data file folder
tag = 'tags'
ibi = 'IBI'
l = {}
n2 = 1
col = [['ACC_X','ACC_Y', 'ACC_Z'], ['BVP'], ['EDA'], ['HR'], ['TEMP']]
freq = [32, 64, 4, 1, 4]
for i in os.listdir(target_dir):
    n1 = 1
    data_folder = os.path.join(target_dir, i) #  path for each participant folder
    if os.path.isdir(data_folder):
        target_file = glob.glob(os.path.join(data_folder, '*.csv'))
        target_file = [file for file in target_file if tag not in os.path.basename(file) and ibi not in os.path.basename(file)]
        for j in target_file:
            data = pd.read_csv(j, skiprows = 1)
            data = data.groupby(data.index//freq[n1-1])[data.columns].mean() # avg the data
            data.columns = col[n1-1]
            if n2 in l:
                l[n2].append(data)
            else:
                l[n2] = []
                l[n2].append(data)
            n1 += 1
        n2 += 1

In [5]:
def structure(n):
    k = 0
    for i in l[n]:
        print(col[k],i.shape)
        k += 1

In [6]:
structure(1)

['ACC_X', 'ACC_Y', 'ACC_Z'] (936, 3)
['BVP'] (936, 1)
['EDA'] (936, 1)
['HR'] (926, 1)
['TEMP'] (936, 1)


In [7]:
structure(5)

['ACC_X', 'ACC_Y', 'ACC_Z'] (4313, 3)
['BVP'] (4312, 1)
['EDA'] (4313, 1)
['HR'] (4302, 1)
['TEMP'] (4312, 1)


In [8]:
structure(10)

['ACC_X', 'ACC_Y', 'ACC_Z'] (574, 3)
['BVP'] (574, 1)
['EDA'] (573, 1)
['HR'] (564, 1)
['TEMP'] (574, 1)


In [9]:
processed_data = pd.read_csv('downsample.csv')

In [10]:
findMissingData(processed_data)

Unnamed: 0,time(s),bvp,eda,hr,x,y,z,LABEL
total,0,0,0,0,0,0,0,0
percent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
type,float64,float64,float64,float64,float64,float64,float64,int64


In [11]:
processed_data[processed_data['time(s)'].diff()!=0.25].shape[0] # find the total participant in the original processed__data

24

In [12]:
index = processed_data[processed_data['time(s)'].diff()!=0.25]['time(s)'].index
index

Index([    0,  3701,  5406,  7099,  9072, 11006, 14056, 15563, 17227, 20487,
       23519, 27925, 30129, 33069, 35321, 38714, 41067, 43716, 46581, 48965,
       51098, 63225, 76018, 86607],
      dtype='int64')

In [13]:
# split the processed data so that each dataframe represents one participant
participant_df = []
i = 0
while i <= len(index)-1:
    if i == len(index)-1:
        split_df = processed_data.iloc[index[i]:].reset_index(drop = True)
    else:
        split_df = processed_data.iloc[index[i]: index[i+1]].reset_index(drop = True)
    participant_df.append(split_df)
    i += 1

In [14]:
len(participant_df)

24

In [15]:
participant_df[-1]

Unnamed: 0,time(s),bvp,eda,hr,x,y,z,LABEL
0,10.25,-25.44,4.289751,116.00,23.875,-15.000,-57.000,0
1,10.50,24.91,4.288470,116.00,23.625,-14.750,-56.750,0
2,10.75,56.77,4.285910,116.00,23.250,-13.875,-57.750,0
3,11.00,-10.30,4.287190,116.00,21.625,-13.750,-58.125,0
4,11.25,15.38,4.265423,116.00,22.250,-14.000,-58.000,0
...,...,...,...,...,...,...,...,...
17200,4310.25,-94.73,0.520112,86.93,60.500,-14.750,7.750,1
17201,4310.50,-56.31,0.517551,86.93,65.375,-15.125,7.625,1
17202,4310.75,24.15,0.517551,86.93,64.625,-12.375,6.000,1
17203,4311.00,5.48,0.509868,86.93,62.875,-14.625,5.625,1


In [16]:
participant_df[-2]

Unnamed: 0,time(s),bvp,eda,hr,x,y,z,LABEL
0,10.25,-8.20,0.157480,83.00,46.250,-26.875,41.250,1
1,10.50,12.27,0.158761,83.00,56.250,-15.625,44.375,1
2,10.75,14.72,0.153639,83.00,40.875,-5.750,44.500,1
3,11.00,-3.17,0.153639,83.00,35.875,-0.250,51.000,1
4,11.25,6.84,0.158761,83.00,29.875,6.875,56.875,1
...,...,...,...,...,...,...,...,...
10584,2656.25,0.67,0.348249,81.25,56.375,-32.750,-4.625,0
10585,2656.50,-12.30,0.348249,81.25,56.875,-31.875,-4.625,0
10586,2656.75,5.37,0.345689,81.25,56.750,-32.000,-4.875,0
10587,2657.00,-9.57,0.344408,81.25,56.875,-32.500,-4.750,0


In [17]:
# change the frequency from 4hz to 1hz
participant_df_rescale = []
for i in participant_df:
    averaged_data = i.groupby(i.index//4)[i.columns].mean()
    participant_df_rescale.append(averaged_data)

In [18]:
for i in l:
    print(i, l[i][3].shape[0])

1 926
2 3033
3 3200
4 2648
5 4302
6 1103
7 1501
8 1523
9 552
10 564
11 849
12 589
13 663
14 717
15 597
16 534
17 427
18 424
19 494
20 484
21 764
22 377
23 128
24 417
25 816
26 759
27 751
28 671


In [19]:
k = 101
print('Number', 'Size')
for i in participant_df_rescale:
    print(k, '     ',i.shape[0])
    k += 1

Number Size
101       926
102       427
103       424
104       494
105       484
106       763
107       377
108       416
109       815
110       758
111       1102
112       551
113       735
114       563
115       849
116       589
117       663
118       717
119       596
120       534
121       3032
122       3199
123       2648
124       4302


In [20]:
l[6][3]

Unnamed: 0,HR
0,85.00
1,85.00
2,73.67
3,67.50
4,72.00
...,...
1098,95.63
1099,95.73
1100,95.85
1101,95.93


In [21]:
participant_df_rescale[10]['hr']

0       85.00
1       85.00
2       73.67
3       67.50
4       72.00
        ...  
1097    95.47
1098    95.63
1099    95.73
1100    95.85
1101    95.93
Name: hr, Length: 1102, dtype: float64

### The above example shows some participants' data in the raw data has one more extra data compared with the same participant infromation in the original preprocessed data. The extra data seems to be the last one, which is pretty simlilar to the data structure for another dataset that has been preprocessed.
### Also one thing has to be noticed, the result shows that the number of participant in raw data and original preprocessed data is actually different, which means some raw data(1-28) is not included in the original preprocessed data(101-124).

In [22]:
# first to concat raw data axis = 1, but for the raw data, we have to make other features have the same shape as hr
# (It seems that hr feature has the least number of data for all participant in the raw data)

# reshape
for i in l:
    for j in range(5):
        l[i][j] = l[i][j].iloc[:len(l[i][3])]
        
# concat axis = 1
concat_data = []
for i in l:
    concat_data.append(pd.concat(l[i], axis = 1))

In [23]:
concat_data[0]

Unnamed: 0,ACC_X,ACC_Y,ACC_Z,BVP,EDA,HR,TEMP
0,13.81250,4.03125,62.84375,15.760000,0.853272,52.00,33.75
1,15.62500,4.96875,62.12500,-41.960313,1.383257,52.00,33.75
2,24.37500,2.31250,59.46875,26.299375,1.441569,55.00,33.75
3,25.81250,3.56250,58.93750,4.384375,1.458265,56.25,33.77
4,26.00000,4.03125,58.93750,-14.118594,1.456984,55.60,33.77
...,...,...,...,...,...,...,...
921,63.68750,10.03125,12.43750,1.656094,7.050238,107.25,32.65
922,54.31250,19.43750,23.40625,-1.693750,7.146595,107.13,32.65
923,8.21875,-2.43750,62.90625,17.555625,7.117464,107.02,32.66
924,2.40625,-2.03125,64.28125,-12.464531,7.054720,106.97,32.66


### concat_data: obtained from raw data
### participant_df_rescale: obtained from the original preprocessed data

In [24]:
# start from 0 now
k = 0
for i in concat_data:
    print(k, i.shape[0])
    k+=1

0 926
1 3033
2 3200
3 2648
4 4302
5 1103
6 1501
7 1523
8 552
9 564
10 849
11 589
12 663
13 717
14 597
15 534
16 427
17 424
18 494
19 484
20 764
21 377
22 128
23 417
24 816
25 759
26 751
27 671


In [25]:
k = 100
print('Number', 'Size')
for i in participant_df_rescale:
    print(k, '     ',i.shape[0])
    k += 1

Number Size
100       926
101       427
102       424
103       494
104       484
105       763
106       377
107       416
108       815
109       758
110       1102
111       551
112       735
113       563
114       849
115       589
116       663
117       717
118       596
119       534
120       3032
121       3199
122       2648
123       4302


### After checking them one by one, I find the the 12th(index:112, size:735) participant data in the original preprocessed dataframe has no corresponding data to the raw one. In that case, this one will be removed from the original preprocessed data
### ***For the reason why I also remove 4th data, it's because later while I checked if all HR data for 4th data is same as the correponding data in the raw data. Plenty of data is not same. So I decide to remove it.

In [26]:
# remove 12th participant data in the original preprocessed data
participant_df_rescale.pop(12)
participant_df_rescale.pop(4)

Unnamed: 0,time(s),bvp,eda,hr,x,y,z,LABEL
0,10.375,23.8250,1.668942,0.00,28.28125,25.68750,32.71875,1.0
1,11.375,-151.6600,1.657742,64.00,20.46875,17.12500,45.56250,1.0
2,12.375,287.4675,1.688462,64.00,58.46875,27.93750,8.50000,1.0
3,13.375,-145.9800,1.699982,58.67,58.84375,26.65625,8.84375,1.0
4,14.375,-11.0800,1.728782,60.50,58.81250,25.68750,10.12500,1.0
...,...,...,...,...,...,...,...,...
479,489.375,-30.4550,3.542696,94.53,36.34375,-37.09375,39.25000,0.0
480,490.375,2.6875,3.622057,94.63,38.34375,-31.37500,41.87500,0.0
481,491.375,26.5750,3.691178,94.72,40.06250,-25.93750,44.09375,0.0
482,492.375,-6.7600,3.563816,94.80,39.71875,-25.65625,44.53125,0.0


In [27]:
len(participant_df_rescale)

22

In [28]:
# remove the one extra data for some participants' raw data 
index_removed = [1, 2, 5, 8, 9, 14, 20, 23, 24, 25]
for i in index_removed:
    concat_data[i] = concat_data[i].iloc[:-1]

In [29]:
# check if the extra data is removed
k = 0
for i in concat_data:
    print(k, i.shape[0])
    k+=1

0 926
1 3032
2 3199
3 2648
4 4302
5 1102
6 1501
7 1523
8 551
9 563
10 849
11 589
12 663
13 717
14 596
15 534
16 427
17 424
18 494
19 484
20 763
21 377
22 128
23 416
24 815
25 758
26 751
27 671


In [30]:
# only keep the needed raw data
index_kept = [0, ]
new_index =[0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 23, 24, 25]# 6, 7, 19 and 22 are removed
new_concat_data = [concat_data[i] for i in new_index] 

In [31]:
len(new_concat_data) == len(participant_df_rescale) # double check the number of participant in both data is same

True

In [32]:
# Check if each participant data in both same in the raw data(new_concat_data) and the original processed data(participant_df_rescale)
# before that, reorder the list so that the dataframe from two different sources can be corresponding
# use dataframe size as index
d1 = {}
d2 = {}
for i in new_concat_data:
    d1[i.shape[0]] = i
for i in participant_df_rescale:
    d2[i.shape[0]] = i

In [33]:
d1.keys() == d2.keys()

True

In [34]:
sorted_d1 = dict(sorted(d1.items()))
sorted_d2 = dict(sorted(d2.items()))

In [35]:
sorted_d1.keys() == sorted_d2.keys()

True

In [36]:
# check all hr data from raw data and preprocessed data is the same or not
# True if same
# False if not same
# The 4th data I mentioned previously was removed due to this reason.
check = True
z = 0
for i in sorted_d1:
    for j in range(d1[i].shape[0]):
        if sorted_d1 [i]['HR'][j]!= sorted_d2 [i]['hr'][j]:
            #print(sorted_d1[i]['HR'][j], sorted_d2[i]['hr'][j])
            check = False
            #print(i)
check

True

In [37]:
d1[377].tail(20)

Unnamed: 0,ACC_X,ACC_Y,ACC_Z,BVP,EDA,HR,TEMP
357,-64.25,-11.0,14.09375,2.976562,0.559164,75.22,36.95
358,-64.21875,-11.0,14.1875,-11.159687,0.559484,75.37,36.97
359,-64.34375,-11.0,14.375,-12.503594,0.560764,75.52,36.97
360,-64.25,-11.03125,14.65625,12.212969,0.556283,75.65,36.97
361,-64.15625,-10.96875,14.53125,9.622187,0.557563,75.75,36.99
362,-64.28125,-11.125,14.4375,-10.979688,0.554042,75.82,36.97
363,-64.21875,-11.03125,14.375,0.520938,0.555643,75.85,36.97
364,-64.09375,-11.0,14.46875,6.647188,0.552442,75.85,36.95
365,-64.09375,-11.0,14.59375,-12.80375,0.550521,75.87,36.95
366,-64.0625,-11.0,14.625,7.657812,0.550201,75.85,36.95


In [38]:
d2[377].tail(20)

Unnamed: 0,time(s),bvp,eda,hr,x,y,z,LABEL
357,367.375,12.7725,0.547961,75.22,-64.21875,-11.0,14.46875,0.0
358,368.375,-12.635,0.551482,75.37,-64.0625,-11.0,14.5625,0.0
359,369.375,22.1375,0.548601,75.52,-64.25,-11.0,14.375,0.0
360,370.375,5.5775,0.550201,75.65,-64.34375,-11.03125,14.28125,0.0
361,371.375,2.3175,0.54764,75.75,-64.34375,-11.03125,14.0625,0.0
362,372.375,4.5075,0.548921,75.82,-64.375,-11.09375,14.0625,0.0
363,373.375,-27.545,0.549561,75.85,-64.34375,-10.96875,14.15625,0.0
364,374.375,22.4075,0.550841,75.85,-64.3125,-11.0,14.09375,0.0
365,375.375,10.47,0.549881,75.87,-64.34375,-11.03125,14.0625,0.0
366,376.375,-8.2375,0.548601,75.85,-64.4375,-11.0,14.0625,0.0


### Now the biggest problem is that even though all hr data is corresponding, but when I checked the other features, it obviously different. This is really weird, so I decide not use BVP, EDA, and ACC feature data from raw data. Only TEMP will be used and added into preprocessed data based on the corresponding HR data.

In [39]:
n = 1
for i in sorted_d2:
    sorted_d2[i]['id'] = n
    sorted_d2[i]['temp'] = sorted_d1[i]['TEMP']
    n+=1

In [40]:
# concat axis = 0
data1 = [sorted_d2[i] for i in sorted_d2]
final_data = pd.concat(data1, axis = 0).reset_index(drop = True)

In [41]:
reset_col = ['id', 'time(s)', 'bvp', 'eda', 'hr', 'x', 'y', 'z', 'temp', 'LABEL']
final_data = final_data[reset_col]

In [42]:
final_data.shape

(24745, 10)

In [43]:
final_data.head()

Unnamed: 0,id,time(s),bvp,eda,hr,x,y,z,temp,LABEL
0,1,10.375,-10.43,0.40749,83.0,-38.4375,-8.5625,53.09375,38.27,0.0
1,1,11.375,6.535,0.40813,83.0,-39.03125,-9.0,52.5625,38.27,0.0
2,1,12.375,27.2425,0.40941,76.33,-38.96875,-9.0,52.65625,38.27,0.0
3,1,13.375,8.135,0.411331,85.5,-39.0,-9.0,52.71875,38.27,0.0
4,1,14.375,-3.4575,0.41165,78.8,-38.96875,-9.03125,52.875,38.23,0.0


In [44]:
final_data.to_csv(os.path.join(os.getcwd(), 'Raw_data\\final_data2.csv'))

In [45]:
print(f'{int(processed_data.shape[0]/4 - final_data.shape[0])} data is removed from the original preprocessed data(1hz not 4hz)')

1208 data is removed from the original preprocessed data(1hz not 4hz)
