#### Libraries

In [1]:
# Data manipulation libs
import pandas as pd
import numpy as np
import datetime

#### Settings

In [2]:
# Plot settings
line_width = 0.75

# Show every column when a pandas DataFrame is displayed
pd.options.display.max_columns = None

# Show every row when a pandas DataFrame is displayed (left disabled)
# pd.set_option('display.max_rows', None)

#### Subject Data

In [3]:
# Get subjects info (one row per subject)
## PATH MAY NEED TO CHANGE
subjects_information_df = pd.read_csv('../../CDL Usecases/data_subjects_info.csv')

# Column layout of the subject file:
#   Column  Attribute          [Unit]
#   code:   Subject ID         [1 to 24]
#   weight: Weight of subject  [Kg.]
#   height: Height of subject  [Cm.]
#   age:    Age of subject     [Years]
#   gender: Gender of subject  [0: F, 1: M]
subjects_information_df.head(3)

Unnamed: 0,code,weight,height,age,gender
0,1,102,188,46,1
1,2,72,180,28,1
2,3,48,161,28,0


#### Test Data Description

The test data was collected over 15 trials covering these 6 activity types:
- Downstairs $\;$[dws]
- Upstairs   $\;$[ups]
- Walking    $\;$[wlk]
- Jogging    $\;$[jog]
- Sitting    $\;$[sit]
- Standing   $\;$[std]

# Data Loading / Cleaning

#### Define parameters for data importing / cleaning

In [4]:
# Trial numbers recorded for each activity code, so that every data file
# of every trial can be located and loaded
trial_id_dict = dict(
    dws=[1, 2, 11],
    ups=[3, 4, 12],
    wlk=[7, 8, 15],
    jog=[9, 16],
    sit=[5, 13],
    std=[6, 14],
)

# Component column names for each measurement group,
# e.g. 'gravity' -> ['gravity.x', 'gravity.y', 'gravity.z']
measurement_dict = {
    group: [f'{group}.{component}' for component in components]
    for group, components in (
        ('attitude', ('roll', 'pitch', 'yaw')),
        ('gravity', ('x', 'y', 'z')),
        ('rotationRate', ('x', 'y', 'z')),
        ('userAcceleration', ('x', 'y', 'z')),
    )
}

# Number of subjects and the list of their IDs
# *Assumes the subject IDs go from 1 to n incremented by 1*
subject_number = 24
subject_id_lst = [*range(1, subject_number + 1)]

# Folder holding the raw test data
## PATH MAY NEED TO CHANGE
test_data = '../../CDL Usecases/A_DeviceMotion_data/'

# Folder the cleaned dataframe is written to
## PATH MAY NEED TO CHANGE
write_cleaned_data = '../../CDL Usecases/'

#### Import Data

In [5]:
# Init all_data_dictionary: one combined dataframe per trial type (key of trial_id_dict)
all_data_dictionary = {}

# Init start_date: the timestamp assigned to tick 0 of every recording
# when building the time_series_data column
start_date = '01/01/2022'
start_datetime = datetime.datetime.strptime(start_date, '%m/%d/%Y')

# Ticks per second: tick_num / SAMPLE_RATE_HZ is the time in seconds since
# the beginning of a recording
SAMPLE_RATE_HZ = 50

# Subject attributes indexed by subject code. Built once, since the lookup
# table is the same for every trial type.
subject_info_by_code = subjects_information_df.set_index('code')

# Fill all_data_dictionary with a dataframe for each trial type
for key in trial_id_dict:
    # Gather the per-file frames in a list and concatenate them once.
    # Calling pd.concat inside the loop would re-copy every previously
    # loaded row on each iteration.
    trial_frames = []
    for subject_id in subject_id_lst:
        for test_numbers in trial_id_dict[key]:
            temp_df = (
                pd.read_csv(f'{test_data}{key}_{test_numbers}/sub_{subject_id}.csv')
                # Drop the first CSV column (presumably the exported row
                # index -- confirm against the raw files)
                .iloc[:, 1:]
                # Tag every row with trial type, subject ID, and trial number
                .assign(
                    test_type=key,
                    subject_id=subject_id,
                    test_trial_number=test_numbers,
                )
            )
            trial_frames.append(temp_df)

    # Each file keeps its own 0..n-1 row labels through the concat, so after
    # reset_index the former index is the tick number within its recording.
    trial_df = (
        pd.concat(trial_frames)
        .reset_index()
        .rename(columns={'index': 'tick_num'})
    )

    # Time since the beginning of the recording in seconds, and the same
    # offset expressed as a timestamp
    trial_df['time_since_start'] = trial_df['tick_num'] / SAMPLE_RATE_HZ
    trial_df['time_series_data'] = start_datetime + pd.to_timedelta(trial_df['time_since_start'], unit='s')

    # Magnitude [(a^2+b^2+c^2)^(1/2)] of the three components of each
    # measurement group, stored in a column named after the group
    for measurement in measurement_dict:
        first_col, second_col, third_col = measurement_dict[measurement]
        squared_sum = (
            trial_df[first_col]**2
            + trial_df[second_col]**2
            + trial_df[third_col]**2
        )
        trial_df[measurement] = squared_sum**(1/2)

    # Attach the subject attributes to every row of that subject
    all_data_dictionary[key] = trial_df.join(subject_info_by_code, on='subject_id')

In [6]:
# A dataframe exists for each key in the all_data_dictionary dictionary;
# preview the first 4 rows of the one stored under 'dws'
all_data_dictionary['dws'].head(4)

Unnamed: 0,tick_num,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,test_type,subject_id,test_trial_number,time_since_start,time_series_data,attitude,gravity,rotationRate,userAcceleration,weight,height,age,gender
0,0,1.528132,-0.733896,0.696372,0.741895,0.669768,-0.031672,0.316738,0.77818,1.082764,0.294894,-0.184493,0.377542,dws,1,1,0.0,2022-01-01 00:00:00.000,1.832682,1.0,1.370498,0.51336,102,188,46,1
1,1,1.527992,-0.716987,0.677762,0.753099,0.657116,-0.032255,0.842032,0.424446,0.643574,0.219405,0.035846,0.114866,dws,1,1,0.02,2022-01-01 00:00:00.020,1.818843,1.0,1.141648,0.250235,102,188,46,1
2,2,1.527765,-0.706999,0.670951,0.759611,0.649555,-0.032707,-0.138143,-0.040741,0.343563,0.010714,0.134701,-0.167808,dws,1,1,0.04,2022-01-01 00:00:00.040,1.812205,1.0,0.37253,0.21545,102,188,46,1
3,3,1.516768,-0.704678,0.675735,0.760709,0.647788,-0.04114,-0.025005,-1.048717,0.03586,-0.008389,0.136788,0.094958,dws,1,1,0.06,2022-01-01 00:00:00.060,1.803822,1.0,1.049628,0.166728,102,188,46,1


#### Create cumulative dataframe

In [7]:
# Create dataframe with all data by stacking the per-trial-type frames in
# dictionary order. A single pd.concat call copies each frame once, whereas
# concatenating inside a loop re-copies the accumulated rows every iteration.
# Row labels are kept as-is, so index values repeat across trial types.
frames_to_stack = list(all_data_dictionary.values())
# pd.concat raises on an empty list; fall back to an empty dataframe instead
df_all_data = pd.concat(frames_to_stack) if frames_to_stack else pd.DataFrame()

# View data
df_all_data.head(4)

Unnamed: 0,tick_num,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,test_type,subject_id,test_trial_number,time_since_start,time_series_data,attitude,gravity,rotationRate,userAcceleration,weight,height,age,gender
0,0,1.528132,-0.733896,0.696372,0.741895,0.669768,-0.031672,0.316738,0.77818,1.082764,0.294894,-0.184493,0.377542,dws,1,1,0.0,2022-01-01 00:00:00.000,1.832682,1.0,1.370498,0.51336,102,188,46,1
1,1,1.527992,-0.716987,0.677762,0.753099,0.657116,-0.032255,0.842032,0.424446,0.643574,0.219405,0.035846,0.114866,dws,1,1,0.02,2022-01-01 00:00:00.020,1.818843,1.0,1.141648,0.250235,102,188,46,1
2,2,1.527765,-0.706999,0.670951,0.759611,0.649555,-0.032707,-0.138143,-0.040741,0.343563,0.010714,0.134701,-0.167808,dws,1,1,0.04,2022-01-01 00:00:00.040,1.812205,1.0,0.37253,0.21545,102,188,46,1
3,3,1.516768,-0.704678,0.675735,0.760709,0.647788,-0.04114,-0.025005,-1.048717,0.03586,-0.008389,0.136788,0.094958,dws,1,1,0.06,2022-01-01 00:00:00.060,1.803822,1.0,1.049628,0.166728,102,188,46,1


#### Write to .csv

In [8]:
# Write the combined dataframe to 'df_all_data.csv' inside the
# write_cleaned_data folder, comma-separated and without the row index
df_all_data.to_csv(f'{write_cleaned_data}df_all_data.csv', sep=',', index=False)