# Data Pre-processing steps

In [1]:
# _importing required libraries
import pandas as pd
import os
import glob

In [2]:
# Columns read from each subject's .dat file and the names given to them:
# timestamp (s)
# activityID
# (IMU hand) 3D-acceleration data (ms-2), scale: ±16g, resolution: 13-bit
# (IMU hand) 3D-gyroscope data (rad/s)
# Note: heart rate (bpm) is not among the columns loaded below.
#subject_id = ['subject101','subject102','subject103','subject104','subject105','subject106','subject107','subject108','subject109']
def load_data(subject_id, input_dir=None):
    """Load selected columns of each subject's ``.dat`` file into one DataFrame.

    Parameters
    ----------
    subject_id : str or iterable of str
        File stems such as ``'subject101'``. A single string is treated as a
        one-element list. The last three characters of each stem are stored
        in the ``subject_id`` column of the result.
    input_dir : str or path-like, optional
        Directory that contains ``<stem>.dat``. When omitted, the directory
        ``<current working directory>/../data/input_dat`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``'timestamp (s)'``, ``'subject_id'``, ``'activityID'``,
        ``'X1'``, ``'Y1'``, ``'Z1'``, ``'X2'``, ``'Y2'``, ``'Z2'``.
        Each file keeps its own row index, so index labels repeat across
        subjects. An empty DataFrame is returned when no stems are given.
    """
    col_names = ['timestamp (s)', 'activityID', 'X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2']
    # Zero-based positions of the columns to read from the whitespace-separated file.
    required_cols = [0, 1, 4, 5, 6, 10, 11, 12]

    # A bare string would otherwise be iterated character by character.
    if isinstance(subject_id, str):
        subject_id = [subject_id]
    if input_dir is None:
        input_dir = os.path.join(os.getcwd(), '..', 'data', 'input_dat')

    # Collect the per-subject frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame on every call.
    frames = []
    for idx in subject_id:
        input_file_path = os.path.join(input_dir, f'{idx}.dat')
        df = pd.read_csv(
            input_file_path,
            header=None,
            names=col_names,
            sep=r'\s+',
            usecols=required_cols,
            engine='python',
        )
        df.insert(1, 'subject_id', idx[-3:])
        frames.append(df)

    # pd.concat raises on an empty list; keep the original empty-frame result.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [3]:
# File stems 'subject101' through 'subject109'; all of them are loaded into one frame.
# For a quick single-subject run, use ['subject101'] instead.
subject_id = [f'subject{number}' for number in range(101, 110)]
main_df = load_data(subject_id)

In [4]:
# Preview the frame with a fresh running index next to the old per-file one.
# The result is only displayed; main_df itself is left unchanged.
main_df.reset_index(drop=False)

Unnamed: 0,index,timestamp (s),subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2
0,0,8.38,101,0,2.37223,8.60074,3.51048,-0.092217,0.056812,-0.015845
1,1,8.39,101,0,2.18837,8.56560,3.66179,-0.024413,0.047758,0.006474
2,2,8.40,101,0,2.37357,8.60107,3.54898,-0.057976,0.032574,-0.006988
3,3,8.41,101,0,2.07473,8.52853,3.66021,-0.002352,0.032810,-0.003747
4,4,8.42,101,0,2.22936,8.83122,3.70000,0.012269,0.018305,-0.053325
...,...,...,...,...,...,...,...,...,...,...
2872528,8472,100.19,109,0,-4.71493,10.22250,4.66893,1.784060,0.087985,0.934673
2872529,8473,100.20,109,0,-4.95932,10.37130,4.12594,1.475210,-0.028917,0.806540
2872530,8474,100.21,109,0,-4.93997,9.83615,3.70468,1.290990,-0.155493,0.762446
2872531,8475,100.22,109,0,-4.64941,9.11129,3.51904,1.207940,-0.311214,0.748814


In [5]:
# Number of missing values in each column, before any rows are dropped.
main_df.isnull().sum()

timestamp (s)        0
subject_id           0
activityID           0
X1               13141
Y1               13141
Z1               13141
X2               13141
Y2               13141
Z2               13141
dtype: int64

In [6]:
# main_df has no heart-rate column, so there is nothing to delete for it here.
# Drop (in place) the rows in which every one of the six sensor columns is NaN.
main_df.dropna(
    subset=['X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2'],
    how='all',
    inplace=True,
)
# Show any rows that still contain at least one missing value.
main_df[main_df.isna().any(axis=1)]

Unnamed: 0,timestamp (s),subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2


In [7]:
# Re-check the per-column count of missing values after the drop.
main_df.isnull().sum(axis=0)

timestamp (s)    0
subject_id       0
activityID       0
X1               0
Y1               0
Z1               0
X2               0
Y2               0
Z2               0
dtype: int64

In [8]:
# activityID 0 represents transient activities, which are not useful in this project.
# Keep only the rows whose activityID is one of the eight IDs listed below;
# this also discards the activityID 0 rows.
main_df = main_df[
    main_df['activityID'].isin([1, 2, 3, 4, 6, 7, 16, 17])
]
# Row count per remaining activity, for every column.
main_df.groupby(by=['activityID']).agg(['count'])

Unnamed: 0_level_0,timestamp (s),subject_id,X1,Y1,Z1,X2,Y2,Z2
Unnamed: 0_level_1,count,count,count,count,count,count,count,count
activityID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,192498,192498,192498,192498,192498,192498,192498,192498
2,185025,185025,185025,185025,185025,185025,185025,185025
3,189777,189777,189777,189777,189777,189777,189777,189777
4,232454,232454,232454,232454,232454,232454,232454,232454
6,164530,164530,164530,164530,164530,164530,164530,164530
7,186114,186114,186114,186114,186114,186114,186114,186114
16,175282,175282,175282,175282,175282,175282,175282,175282
17,238462,238462,238462,238462,238462,238462,238462,238462


In [9]:
# Write the processed frame to the output CSV, without a header row or index column.
output_file_path = os.getcwd() + '/../data/output_csv/processed_data.csv'
main_df.to_csv(
    output_file_path,
    header=False,
    index=False,
    encoding='utf-8',
)