# Data Pre-processing steps

In [101]:
# _importing required libraries
import pandas as pd
import os
import glob

from sklearn import preprocessing

In [102]:
# _file paths of input and output data
subject_id = 'subject101'
input_file_path = os.getcwd() + f'/../../data/input_dat/{subject_id}.dat'
output_file_path = os.getcwd() + f'/../../data/output_csv/{subject_id}_processed.csv'

In [103]:
# _giving column names considered columns from dataset are
# timestamp (s)
# activityID  
# heart rate (bpm)
# (IMU hand) 3D-acceleration data (ms-2), scale: ±16g, resolution: 13-bit 
# (IMU hand) 3D-gyroscope data (rad/s)
col_names = ['timestamp (s)', 'activityID', 'heart rate (bpm)', 'X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2']
required_cols = [0,1,2,4,5,6,10,11,12]
# _loading data into pandas dataframe
main_df = pd.read_csv(input_file_path, header=None, names=col_names, sep='\s+', usecols=required_cols, engine='python')
# _printing few records of data
main_df

Unnamed: 0,timestamp (s),activityID,heart rate (bpm),X1,Y1,Z1,X2,Y2,Z2
0,8.38,0,104.0,2.37223,8.60074,3.51048,-0.092217,0.056812,-0.015845
1,8.39,0,,2.18837,8.56560,3.66179,-0.024413,0.047758,0.006474
2,8.40,0,,2.37357,8.60107,3.54898,-0.057976,0.032574,-0.006988
3,8.41,0,,2.07473,8.52853,3.66021,-0.002352,0.032810,-0.003747
4,8.42,0,,2.22936,8.83122,3.70000,0.012269,0.018305,-0.053325
...,...,...,...,...,...,...,...,...,...
376412,3772.50,0,,2.02477,7.29553,5.74194,1.516160,-0.044713,-0.085853
376413,3772.51,0,,2.10836,7.86504,5.85674,1.542230,-0.023898,-0.075649
376414,3772.52,0,,2.07163,8.39581,5.77742,1.502390,-0.029592,-0.074682
376415,3772.53,0,,2.19569,8.77634,6.00892,1.332220,-0.052372,-0.034189


In [104]:
# _showing null values count for each column
main_df.isna().sum()

timestamp (s)            0
activityID               0
heart rate (bpm)    342028
X1                    1454
Y1                    1454
Z1                    1454
X2                    1454
Y2                    1454
Z2                    1454
dtype: int64

In [105]:
# _dropping heart rate column
del main_df['heart rate (bpm)']
# _dropping rows which has all NaN values in all sensor fields
main_df.dropna(how='all', subset= ['X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2'], inplace= True)
main_df[pd.isnull(main_df).any(axis=1)]

Unnamed: 0,timestamp (s),activityID,X1,Y1,Z1,X2,Y2,Z2


In [106]:
# _showing null values count for each column
main_df.isna().sum()

timestamp (s)    0
activityID       0
X1               0
Y1               0
Z1               0
X2               0
Y2               0
Z2               0
dtype: int64

In [107]:
# _columns list for which we need to perform ouliers operation
cols = ['X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2']
# _calculating threshold values to remove outliers which are +3 or -3 std away from mean
lower = main_df[cols].quantile(0.01)
higher  = main_df[cols].quantile(0.99)
print(lower,higher)

X1   -19.910666
Y1    -4.166594
Z1    -8.126850
X2    -2.848508
Y2    -2.527997
Z2    -5.461220
Name: 0.01, dtype: float64 X1     7.836709
Y1    28.373268
Z1    10.646538
X2     3.587558
Y2     2.952530
Z2     4.959376
Name: 0.99, dtype: float64


In [108]:
#_checking and removing outliers
main_df = main_df[((main_df[cols] < higher) & (main_df[cols] > lower)).any(axis=1)]

In [109]:
# _activityID 0 represents trasient activities which are not useful in this project.
# _droping the rows whose activityID column is 0
# _considering only three activities for basic testing
main_df = main_df[main_df['activityID'].isin([1,2,3])] 
main_df.groupby(['activityID']).agg(['count'])

Unnamed: 0_level_0,timestamp (s),X1,Y1,Z1,X2,Y2,Z2
Unnamed: 0_level_1,count,count,count,count,count,count,count
activityID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,27184,27184,27184,27184,27184,27184,27184
2,23471,23471,23471,23471,23471,23471,23471
3,21681,21681,21681,21681,21681,21681,21681


In [110]:
# # _perform min-max normalization
# cols = ['X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2']
# for col in cols:
#     main_df[col] = preprocessing.MinMaxScaler().fit_transform(main_df[[col]])
# main_df

In [111]:
# _perform z-score standardization
cols = ['X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2']
for col in cols:
    main_df[col] = preprocessing.StandardScaler().fit_transform(main_df[[col]])
main_df

Unnamed: 0,timestamp (s),activityID,X1,Y1,Z1,X2,Y2,Z2
2928,37.66,1,0.418461,0.925518,0.472759,-0.035942,0.138344,-0.011430
2929,37.67,1,0.432534,0.752532,0.514698,-0.556016,0.093168,-0.006729
2930,37.68,1,0.432340,0.601171,0.535716,-0.763257,0.039907,0.023608
2931,37.69,1,0.418957,0.601528,0.555967,-0.622059,0.069178,0.060307
2932,37.70,1,0.434205,0.634318,0.607554,-0.239073,-0.070386,0.034582
...,...,...,...,...,...,...,...,...
75307,761.45,3,0.444840,0.676110,0.463848,-0.931057,0.355296,-0.051148
75308,761.46,3,0.446754,0.612022,0.556538,-1.051936,0.397559,-0.107791
75309,761.47,3,0.410890,0.514726,0.525248,-0.428415,0.275692,-0.130117
75310,761.48,3,0.410742,0.752398,0.473136,0.136853,0.130708,0.022278


In [112]:
# _write data to ouput file
main_df.to_csv(output_file_path, encoding='utf-8', index=False, header=True)

# Merging output csv files

In [113]:
def merge_csv_files(input_csv_file_path, merged_csv_file_path):
    
    # _get all csv files inside file_path
    csv_files = glob.glob(input_csv_file_path)

    df_list = []
    for csv in csv_files:
        df_list.append(pd.read_csv(csv))

    # _stack all the csv files into a single file
    result = pd.concat(df_list)
    
    # _write data to ouput file
    result.to_csv(merged_csv_file_path, encoding='utf-8', index=False, header=False)

In [115]:
# _file paths of input and output data
input_csv_file_path = os.getcwd() + f'/../../data/output_csv/*.csv'
merged_csv_file_path = os.getcwd() + f'/../../data/processed_data.csv'
merge_csv_files(input_csv_file_path, merged_csv_file_path)