# Data Pre-processing steps

In [26]:
# _importing required libraries
import pandas as pd
import os
import glob

In [27]:
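# Columns of the dataset considered here (column names are assigned in load_data):
# timestamp (s)
# activityID
# heart rate (bpm) -- NOTE(review): col_names in load_data has no heart-rate entry; confirm it is intentionally not loaded
# (IMU hand) 3D-acceleration data (ms-2), scale: ±16g, resolution: 13-bit
# (IMU hand) 3D-gyroscope data (rad/s)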
#subject_id = ['subject101','subject102','subject103','subject104','subject105','subject106','subject107','subject108','subject109']
def load_data(subject_id):
    """Load the selected columns of each subject's ``.dat`` file into one DataFrame.

    For every entry of ``subject_id`` the file
    ``<cwd>/../data/input_dat/<entry>.dat`` is read as whitespace-separated
    text without a header row. Only the columns at positions 0, 1 and 4-12 are
    kept; they are named ``timestamp (s)``, ``activityID``, ``X1`` ... ``Z3``.
    A ``subject_id`` column holding the last three characters of the entry
    (``'101'`` for ``'subject101'``) is inserted at position 1.

    Parameters
    ----------
    subject_id : iterable of str
        File stems to load, e.g. ``['subject101', 'subject102']``.

    Returns
    -------
    pandas.DataFrame
        The per-subject frames stacked in input order. Each frame keeps its
        own row index, so index labels repeat across subjects. An empty
        DataFrame is returned when ``subject_id`` is empty.
    """
    # NOTE(review): presumably X1..Z1, X2..Z2 and X3..Z3 are three 3-axis
    # hand-IMU channels -- confirm the mapping against the dataset description.
    col_names = ['timestamp (s)', 'activityID', 'X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2','X3', 'Y3', 'Z3']
    required_cols = [0,1,4,5,6,7,8,9,10,11,12]

    # Collect one frame per subject and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame inside
    # the loop re-copies all previously loaded rows on every iteration.
    frames = []
    for idx in subject_id:
        sub_id = idx[-3:]
        input_file_path = os.getcwd() + f'/../data/input_dat/{idx}.dat'
        # Raw string: '\s' in a plain string literal is an invalid escape sequence.
        df = pd.read_csv(input_file_path, header=None, names=col_names, sep=r'\s+', usecols=required_cols, engine='python')
        df.insert(1, 'subject_id', sub_id)
        frames.append(df)

    # pd.concat raises on an empty list; keep the original "empty frame" result.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [28]:
# Subjects to load. Only subject101 is processed here; to process every
# subject, swap in the full list:
# ['subject101','subject102','subject103','subject104','subject105','subject106','subject107','subject108','subject109']
subject_id = ['subject101']
main_df = load_data(subject_id=subject_id)

In [29]:
# Display a copy of main_df with the current index moved into an 'index'
# column. The result is not assigned, so main_df itself is left unchanged.
main_df.reset_index(drop=False)

Unnamed: 0,index,timestamp (s),subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2,X3,Y3,Z3
0,0,8.38,101,0,2.37223,8.60074,3.51048,2.43954,8.76165,3.35465,-0.092217,0.056812,-0.015845
1,1,8.39,101,0,2.18837,8.56560,3.66179,2.39494,8.55081,3.64207,-0.024413,0.047758,0.006474
2,2,8.40,101,0,2.37357,8.60107,3.54898,2.30514,8.53644,3.73280,-0.057976,0.032574,-0.006988
3,3,8.41,101,0,2.07473,8.52853,3.66021,2.33528,8.53622,3.73277,-0.002352,0.032810,-0.003747
4,4,8.42,101,0,2.22936,8.83122,3.70000,2.23055,8.59741,3.76295,0.012269,0.018305,-0.053325
...,...,...,...,...,...,...,...,...,...,...,...,...,...
376412,376412,3772.50,101,0,2.02477,7.29553,5.74194,2.06573,6.57692,5.92695,1.516160,-0.044713,-0.085853
376413,376413,3772.51,101,0,2.10836,7.86504,5.85674,2.08754,7.42244,5.87977,1.542230,-0.023898,-0.075649
376414,376414,3772.52,101,0,2.07163,8.39581,5.77742,2.13833,8.05640,5.90853,1.502390,-0.029592,-0.074682
376415,376415,3772.53,101,0,2.19569,8.77634,6.00892,2.11251,8.53989,5.93770,1.332220,-0.052372,-0.034189


In [30]:
# Number of missing (NaN) entries in each column of main_df.
main_df.isnull().sum()

timestamp (s)       0
subject_id          0
activityID          0
X1               1454
Y1               1454
Z1               1454
X2               1454
Y2               1454
Z2               1454
X3               1454
Y3               1454
Z3               1454
dtype: int64

In [31]:
# Heart rate is not dropped here; the statement that did so is kept for reference:
#del main_df['heart rate (bpm)']
# Drop, in place, every row in which X1, Y1, Z1, X2, Y2 and Z2 are all NaN.
# NOTE(review): X3, Y3 and Z3 are not part of this check -- confirm that is intended.
main_df.dropna(
    subset=['X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2'],
    how='all',
    inplace=True,
)
# Show the rows that still contain a NaN in any column.
main_df.loc[main_df.isna().any(axis='columns')]

Unnamed: 0,timestamp (s),subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2,X3,Y3,Z3


In [32]:
# Re-count the missing (NaN) entries per column after the rows were dropped.
main_df.isnull().sum(axis=0)

timestamp (s)    0
subject_id       0
activityID       0
X1               0
Y1               0
Z1               0
X2               0
Y2               0
Z2               0
X3               0
Y3               0
Z3               0
dtype: int64

In [34]:
# activityID 0 represents transient activities, which are not useful in this project.
# Rows are kept only when activityID is 16 or 17 (a reduced set for basic testing),
# which also removes the activityID 0 rows.
# Wider selection, currently disabled:
#main_df = main_df[main_df['activityID'].isin([1,2,3,4,6,7,16,17])]
main_df = main_df.loc[lambda frame: frame['activityID'].isin([16, 17])]
# Per-activity count of non-null values in every remaining column.
main_df.groupby(by=['activityID']).aggregate(['count'])

Unnamed: 0_level_0,timestamp (s),subject_id,X1,Y1,Z1,X2,Y2,Z2,X3,Y3,Z3
Unnamed: 0_level_1,count,count,count,count,count,count,count,count,count,count,count
activityID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
16,22930,22930,22930,22930,22930,22930,22930,22930,22930,22930,22930
17,23553,23553,23553,23553,23553,23553,23553,23553,23553,23553,23553


In [35]:
# Write the processed rows to the output CSV, without the index column and
# without a header row.
output_file_path = ''.join([os.getcwd(), '/../data/output_csv/processed_data_IMUwrist.csv'])
main_df.to_csv(
    path_or_buf=output_file_path,
    index=False,
    header=False,
    encoding='utf-8',
)