# 03b - Data Preparation - Features Creation - Run for All Subjects
* (1) Create df_acc files for all subjects
* (2) Create df_fea files for all subjects

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utils
from constants import Constants
from joblib import Parallel, delayed
import glob

In [None]:
'''
(1) Create feature file in a DataFrame and save to parquet and csv formats
'''

In [16]:
'''
(1a) Converting all df_acc files
'''

def _convert_df_acc(all_files, file_id):
    """Cut one subject's left-ankle accelerometer data into fixed-length events.

    The recording selected by ``file_id`` is loaded through
    ``utils.load_single_data``, split per annotated activity into windows of
    ``win_length`` (in the units of the ``time_s`` column, presumably seconds),
    and every window that passes the quality check is labelled and stacked
    into one long DataFrame. The result is written to
    ``../outputs/df_acc/df_acc_la_<subject_id>.parquet``.

    Parameters
    ----------
    all_files
        Collection of accelerometer files, passed straight to
        ``utils.load_single_data``.
    file_id
        Selector of the file to load, passed straight to
        ``utils.load_single_data``.

    Returns
    -------
    None
        The output is the parquet file written to disk.
    """
    # Load one accelerometer data file into dataframe
    file_id, df = utils.load_single_data(all_files, file_id)
    subject_id = file_id.split('/')[-1].split('.')[0]
    print('Begin to convert acc dataframe for Subject='+subject_id)

    # General setting
    location_id = ['la']
    win_length = 6  # length of each event window
    # NOTE(review): win_shift only determines how many iterations are attempted;
    # the windows themselves advance by win_length (see cut_begin below), so the
    # events produced do not overlap. If overlapping windows were intended,
    # cut_begin should advance by win_shift instead - confirm before changing,
    # because that would alter every downstream event.
    win_shift = 3
    acc_inputs = ['la_x', 'la_y', 'la_z']   # Only select Left-Ankle for now
    activities = [1, 2, 3, 4, 77, 99]       # All 6 activities with annotations
    activities_TP = [1, 2, 3]               # 1=walking; 2=descending stairs; 3=ascending stairs;
    activities_TN = [4, 77, 99]             # 4=driving; 77=clapping; 99=non-study activity;

    # Column layout of the output dataframe
    columns_features = ['subject_id', 'device_loc', 'act_id', 'act_name', 'event_num', 'walk_or_not', 'unique_id', 'time', 'acc_x', 'acc_y', 'acc_z']
    # Output acceleration columns paired with their source columns
    acc_outputs = dict(zip(['acc_x', 'acc_y', 'acc_z'], acc_inputs))

    device_loc = utils.get_device_location_for_df(location_id[0])

    # Accepted events are collected here and concatenated once at the end,
    # instead of growing a dataframe inside the loops.
    segments = []

    for activity in activities:
        df_act = df[df.activity == activity]
        if df_act.empty:
            # This subject has no samples for the activity; without this guard
            # the .iloc[0] / .iloc[-1] lookups below raise IndexError.
            continue

        act_name = utils.get_activity_type_for_df(activity)
        time_s = df_act.time_s
        t_first = time_s.iloc[0]
        t_latest = time_s.max()

        # Number of window positions to try (derived from win_shift)
        total_segments_sec = round((time_s.iloc[-1] - t_first) / (win_length - (win_length - win_shift)))

        for count, num in enumerate(range(0, total_segments_sec), start=1):
            cut_begin = t_first + num * win_length
            if cut_begin > t_latest:
                # Every later window starts even further past the data, so it
                # would be empty as well.
                break
            cut_end = cut_begin + win_length
            df_cut_tmp = df_act[(time_s >= cut_begin) & (time_s <= cut_end)]

            if df_cut_tmp.empty:
                continue
            if (df_cut_tmp.time_s.iloc[-1] - df_cut_tmp.time_s.iloc[0]) < win_length * 0.9:
                # Quality control: a window covering less than 90% of
                # win_length has missing data, so it is discarded.
                continue

            unique_id_template = subject_id+'_'+device_loc+'_'+str(activity)+'_'+act_name+'_'+str(count)
            if activity in activities_TP:
                walk_or_not, unique_id = 1, unique_id_template + '_1'
            elif activity in activities_TN:
                walk_or_not, unique_id = 0, unique_id_template + '_0'
            else:
                # Not reachable with the lists above; left unlabelled
                walk_or_not, unique_id = np.nan, np.nan

            segment = {
                'subject_id': subject_id,
                'device_loc': device_loc,
                'act_id': activity,
                'act_name': act_name,
                'event_num': count,
                'walk_or_not': walk_or_not,
                'unique_id': unique_id,
                'time': df_cut_tmp['time_s'].to_numpy(),
            }
            for out_col, src_col in acc_outputs.items():
                segment[out_col] = df_cut_tmp[src_col].to_numpy()
            segments.append(pd.DataFrame(segment, columns=columns_features))

    if segments:
        df_fea_all = pd.concat(segments, ignore_index=True)
    else:
        # No window survived: still write a file with the expected columns
        df_fea_all = pd.DataFrame(columns=columns_features)

    # Save the output dataframe to a parquet format (to save up to 90% of space)
    df_fea_all.to_parquet('../outputs/df_acc/df_acc_la_'+subject_id+'.parquet')
    print('Completed converting')
    print('-------------------------------------------------------')


'''
Run the df_acc conversion, one file after another (sequential loop)
'''
# Get a list of all accelerometer files
all_files = utils.get_all_files()

# Index of the first file to convert.
# NOTE(review): 22 skips the first 22 files - presumably left over from
# resuming an interrupted run. Set to 0 to process every subject.
start_file_id = 22

for file_id in range(start_file_id, len(all_files)):
    _convert_df_acc(all_files, file_id)

# Parallel alternative with joblib (disabled):
# n_jobs=-1 uses all available CPU cores, or specify the number of cores.
# Parallel(n_jobs=-1)(
#     delayed(_convert_df_acc)(all_files, file_id) for file_id in range(len(all_files))
# )

Begin to convert acc dataframe for Subject=id4ea159a8
Completed converting
-------------------------------------------------------
Begin to convert acc dataframe for Subject=id7c20ee7a
Completed converting
-------------------------------------------------------
Begin to convert acc dataframe for Subject=idbae5a811
Completed converting
-------------------------------------------------------
Begin to convert acc dataframe for Subject=idf5e3678b
Completed converting
-------------------------------------------------------
Begin to convert acc dataframe for Subject=id8af5374b
Completed converting
-------------------------------------------------------


In [13]:
'''
(1b) Converting all df_fea files 
'''

def _convert_df_fea(acc_file):
    """Compute per-event summary features for one subject's df_acc file.

    The df_acc parquet file is read and its rows are grouped by ``unique_id``
    (one group per event). For each event the signal magnitude vector (smv) is
    added as a column, and the mean / std / min / max of ``acc_x``, ``acc_y``,
    ``acc_z`` and ``smv`` are stored as constant-valued ``fea_<signal>_<stat>``
    columns. The result is written to
    ``../outputs/local_df_fea/df_fea_la_<subject_id>.parquet``.

    Parameters
    ----------
    acc_file : str
        Path of a df_acc parquet file. The subject id is the last
        ``_``-separated token of the file name without its extension.

    Returns
    -------
    None
        The output is the parquet file written to disk.
    """
    subject_id = acc_file.split('/')[-1].split('.')[0].split('_')[-1]
    print('Begin to compute features dataframe file for Subject '+subject_id)

    # Load existing df_acc file
    df = pd.read_parquet(acc_file)

    # Column layout of the output dataframe
    columns_features = ['subject_id', 'device_loc', 'act_id', 'act_name', 'event_num', 'walk_or_not', 'unique_id', 'time', 'acc_x', 'acc_y', 'acc_z', 'smv']
    input_data = ['acc_x', 'acc_y', 'acc_z', 'smv']   # Input time series for computing features
    # Feature name -> reducer; np.std keeps numpy's default ddof=0
    feature_funcs = {'mean': np.mean, 'std': np.std, 'min': np.min, 'max': np.max}
    features_names_for_df = ['fea_' + input_item + '_' + feature for input_item in input_data for feature in feature_funcs]
    columns_added_features = columns_features + features_names_for_df

    def _smv(x, y, z):
        '''
        Compute Signal Magnitude Vector from 3-axis acc data
        '''
        return np.sqrt((x*x) + (y*y) + (z*z))

    def compute_features(df_event):
        '''
        Return a copy of one event's rows with the smv and feature columns added
        '''
        # Samples quality check: interpolate any null-valued samples.
        # The stored acc_x/acc_y/acc_z columns are left as loaded; only the
        # derived smv and feature values use the interpolated signals.
        signals = {axis: utils._interpolate(df_event[axis]) for axis in ('acc_x', 'acc_y', 'acc_z')}
        signals['smv'] = _smv(signals['acc_x'], signals['acc_y'], signals['acc_z'])

        new_columns = {'smv': signals['smv']}
        for input_cmp in input_data:
            data = signals[input_cmp]
            for fea_type, reducer in feature_funcs.items():
                # A scalar here is broadcast over every row of the event
                new_columns['fea_'+input_cmp+'_'+fea_type] = reducer(data)

        # assign() returns a new frame, so the slice of df is never mutated
        return df_event.assign(**new_columns)

    # Compute features event by event; groupby visits unique_id values in sorted order
    event_frames = [compute_features(df_event) for _, df_event in df.groupby('unique_id', sort=True)]

    if event_frames:
        df_fea_all = pd.concat(event_frames, ignore_index=True)
        # Expected columns first, followed by any extra columns present in the input
        extra_columns = [col for col in df_fea_all.columns if col not in columns_added_features]
        df_fea_all = df_fea_all.reindex(columns=columns_added_features + extra_columns)
    else:
        # No events in the input: still write a file with the expected columns
        df_fea_all = pd.DataFrame(columns=columns_added_features)

    # Save the df_fea file to a parquet format (to save up to 90% of space)
    df_fea_all.to_parquet('../outputs/local_df_fea/df_fea_la_'+subject_id+'.parquet')
    print('Completed converting')
    print('-------------------------------------------------------')


# Collect every df_acc parquet file that needs a matching df_fea file
df_acc_path = "../outputs/local_df_acc/"  # Directory holding the df_acc parquet files
all_acc_files = glob.glob(f"{df_acc_path}*.parquet")

# Sequential alternative (disabled):
# for acc_file in all_acc_files:
#     print(acc_file)
#     _convert_df_fea(acc_file)


def convert_single_file(acc_file):
    """Per-file task handed to joblib: build the df_fea file for one df_acc file."""
    _convert_df_fea(acc_file)


# -1 lets joblib use all available CPU cores; a positive value sets the number of cores
num_jobs = -1

# Run one conversion task per df_acc file across the joblib workers
parallel_tasks = (delayed(convert_single_file)(acc_file) for acc_file in all_acc_files)
Parallel(n_jobs=num_jobs)(parallel_tasks)

Begin to compute features dataframe file for Subject id8af5374b
Completed converting
-------------------------------------------------------
Begin to compute features dataframe file for Subject id079c763c
Completed converting
-------------------------------------------------------




Begin to compute features dataframe file for Subject id86237981
Completed converting
-------------------------------------------------------
Begin to compute features dataframe file for Subject id00b70b13
Completed converting
-------------------------------------------------------
Begin to compute features dataframe file for Subject id5308a7d6
Completed converting
-------------------------------------------------------
Begin to compute features dataframe file for Subject idecc9265e
Completed converting
-------------------------------------------------------
Begin to compute features dataframe file for Subject idb221f542
Completed converting
-------------------------------------------------------
Begin to compute features dataframe file for Subject idf540d82b
Completed converting
-------------------------------------------------------
Begin to compute features dataframe file for Subject id3e3e50c7
Completed converting
-------------------------------------------------------
Begin to comp

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

Begin to compute features dataframe file for Subject id1165e00c
Completed converting
-------------------------------------------------------
Begin to compute features dataframe file for Subject id4ea159a8
Completed converting
-------------------------------------------------------
Begin to compute features dataframe file for Subject id7c20ee7a
Completed converting
-------------------------------------------------------
Begin to compute features dataframe file for Subject idc91a49d0
Completed converting
-------------------------------------------------------
Begin to compute features dataframe file for Subject idf1ce9a0f
Completed converting
-------------------------------------------------------
Begin to compute features dataframe file for Subject id37a54bbf
Completed converting
-------------------------------------------------------
Begin to compute features dataframe file for Subject id8e66893c
Completed converting
-------------------------------------------------------
Begin to comp