# Movement annotation I: Preparing training data

Since we have around 9000 trials in the final dataset, it is not feasible to manually annotate the movement onset and offset for each trial. Instead, we will use a simple logistic regression model to predict the movement onset and offset from all the movement features we have collected in the merged dataset (see XX).

We have annotated the movement onset and offset in ELAN (XX) for the pilot data (dyad 0). Two annotators have independently annotated the movement onset and offset on four tiers:
- upper body
- lower body
- arms
- head

The parent tier 'movement' summarizes overall movement across all tiers.

Now, we will use these ground truth annotations to create a training set for the logistic regression model. 

In [35]:
import os
import glob
import pandas as pd
import numpy as np

# every folder below is resolved relative to the notebook's working directory
curfolder = os.getcwd()

# merged time series written by the preceding processing step
processedfolder = os.path.join(curfolder + '\\..\\03_TS_processing\\TS_merged\\')
print(processedfolder)
processedfiles = glob.glob(processedfolder + '*.csv')

# manual ELAN annotations (R1 folder) whose names end in ELAN_tiers.eaf
annofolder_manu = os.path.join(curfolder + '\\ManualAnno/R1\\')
annofiles_manu = glob.glob(annofolder_manu + '*ELAN_tiers.eaf')

# merged files only, leaving out every path that contains 'anno'
mergedfiles = [
    path
    for path in glob.glob(processedfolder + '/merged*.csv')
    if 'anno' not in path
]

# the training datasets are written here
datasetfolder = os.path.join(curfolder + '\\TrainingData\\')

e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\..\03_TS_processing\TS_merged\


Our annotators annotate only movement, so we first need to fill the unannotated stretches of each tier with `nomovement` values. (Run this cell before the following steps.)

In [36]:
import xml.etree.ElementTree as ET
import glob

def _append_nomovement(tier, start_slot_id, end_slot_id):
    """Append a 'nomovement' annotation spanning two time slots to ``tier``."""
    new_annotation = ET.Element('ANNOTATION')
    alignable_annotation = ET.SubElement(new_annotation, 'ALIGNABLE_ANNOTATION')
    alignable_annotation.set('TIME_SLOT_REF1', start_slot_id)
    alignable_annotation.set('TIME_SLOT_REF2', end_slot_id)
    annotation_value = ET.SubElement(alignable_annotation, 'ANNOTATION_VALUE')
    annotation_value.text = 'nomovement'
    tier.append(new_annotation)


def add_nomovement_annotations(xml_file_path, newfilepath):
    """Fill every unannotated stretch of every tier with 'nomovement'.

    The ELAN file at ``xml_file_path`` is parsed, each TIER receives extra
    annotations with the value 'nomovement' wherever its existing alignable
    annotations leave a gap, and the result is written to ``newfilepath``.

    For a tier without alignable annotations a single annotation from the
    earliest to the latest time slot of the file is added. Otherwise gaps are
    filled before the first annotation, between consecutive annotations
    (ordered by start time) and after the last annotation.

    Parameters:
        xml_file_path: path of the .eaf file to read.
        newfilepath: path the modified document is written to (may be the
            same as ``xml_file_path``).

    Returns:
        None. The only effect is the file written to ``newfilepath``.

    Raises:
        KeyError: if a TIME_SLOT has no TIME_VALUE, or an annotation refers to
            an unknown time slot.
    """
    # NOTE(review): the generated annotations carry no ANNOTATION_ID attribute;
    # confirm whether the tools that read the output need one.
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # time slot id -> time value (int)
    time_slots = {
        slot.attrib['TIME_SLOT_ID']: int(slot.attrib['TIME_VALUE'])
        for slot in root.findall('TIME_ORDER/TIME_SLOT')
    }

    if not time_slots:
        # Without time slots there is nothing a new annotation could refer to,
        # so the document is written back unchanged instead of failing on an
        # empty list below.
        tree.write(newfilepath, encoding='UTF-8', xml_declaration=True)
        return

    # Earliest and latest slot of the whole file (stable sort keeps the
    # document order among slots that share a time value)
    sorted_time_slots = sorted(time_slots.items(), key=lambda item: item[1])
    first_slot_id, first_time = sorted_time_slots[0]
    last_slot_id, last_time = sorted_time_slots[-1]

    for tier in root.findall('TIER'):
        annotations = tier.findall('ANNOTATION/ALIGNABLE_ANNOTATION')

        if not annotations:
            # Nothing annotated on this tier: one annotation covers everything
            _append_nomovement(tier, first_slot_id, last_slot_id)
            continue

        sorted_annotations = sorted(
            annotations, key=lambda anno: time_slots[anno.attrib['TIME_SLOT_REF1']]
        )

        # Gap before the first annotation
        first_start_ref = sorted_annotations[0].attrib['TIME_SLOT_REF1']
        if time_slots[first_start_ref] > first_time:
            _append_nomovement(tier, first_slot_id, first_start_ref)

        # Gaps between consecutive annotations
        for current, following in zip(sorted_annotations, sorted_annotations[1:]):
            current_end_ref = current.attrib['TIME_SLOT_REF2']
            next_start_ref = following.attrib['TIME_SLOT_REF1']
            if time_slots[current_end_ref] < time_slots[next_start_ref]:
                _append_nomovement(tier, current_end_ref, next_start_ref)

        # Gap after the last annotation
        last_end_ref = sorted_annotations[-1].attrib['TIME_SLOT_REF2']
        if time_slots[last_end_ref] < last_time:
            _append_nomovement(tier, last_end_ref, last_slot_id)

    # Save the modified XML file as a new file
    tree.write(newfilepath, encoding='UTF-8', xml_declaration=True)


In [37]:
manualanno_folder_r1 = curfolder + '/ManualAnno/R1/'            # ola
manualanno_folder_r3 = curfolder + '/ManualAnno/R3/'            # gillian

manualannofiles1 = glob.glob(manualanno_folder_r1 + '/*.eaf')
manualannofiles3 = glob.glob(manualanno_folder_r3 + '/*.eaf')

# Fill the gaps in every R1 annotation file and save the result under a
# normalised name (<trial id>_ELAN_tiers.eaf) in the same folder.
for file in manualannofiles1:
    print('working on ' + file)

    # Only the filename decides how many trailing '_' separated parts are cut;
    # testing the full path would also react to 'corrected' or 'c0' appearing
    # in a folder name. os.path.basename handles either path separator.
    basename = os.path.basename(file)
    chunks = basename.split('_')

    # Two trailing parts are always removed, one more for a 'corrected' file
    # and one more for a file carrying a c0/c1/c2 tag.
    n_trailing = 2
    if 'corrected' in basename:
        n_trailing += 1
    if any(tag in basename for tag in ('c0', 'c1', 'c2')):
        n_trailing += 1
    newfile = '_'.join(chunks[:-n_trailing])

    # replace trial_ with _
    newfile = newfile.replace('trial_', '')
    # add filepath
    newfile = manualanno_folder_r1 + newfile + '_ELAN_tiers.eaf'

    add_nomovement_annotations(file, newfile)
    


working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/ManualAnno/R1\0_1_11_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/ManualAnno/R1\0_1_12_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/ManualAnno/R1\0_1_13_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/ManualAnno/R1\0_1_14_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/ManualAnno/R1\0_1_15_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/ManualAnno/R1\0_1_16_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/ManualAnno/R1\0_1_17_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/ManualAnno/R1\0_1_20_p0_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/ManualAnno/R1\0_1_21_p0_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyE

Now we need to export the manual annotations from ELAN to simple text files.

In [38]:
## function to parse elan file

import xml.etree.ElementTree as ET

def parse_eaf_file(eaf_file, rel_tiers):
    """Collect the alignable annotations of selected tiers from an ELAN file.

    Parameters:
        eaf_file: path of the .eaf file to parse.
        rel_tiers: TIER_ID to extract, either a single string or an iterable
            of strings.

    Returns:
        A list of dicts, in document order, with the keys 'tier_id',
        'annotation_id' (None when the annotation has no ANNOTATION_ID),
        'start_time', 'end_time' (the TIME_VALUE strings of the referenced
        time slots) and 'annotation_value' (stripped text, '' when empty).

    Raises:
        KeyError: if a TIME_SLOT has no TIME_VALUE or an annotation refers to
            an unknown time slot.
    """
    tree = ET.parse(eaf_file)
    root = tree.getroot()

    time_order = root.find('TIME_ORDER')
    time_slots = {slot.attrib['TIME_SLOT_ID']: slot.attrib['TIME_VALUE'] for slot in time_order}

    # A bare string is one tier name; wrapping it in a set keeps it from being
    # split into characters, while lists/tuples/sets of names are accepted too.
    if isinstance(rel_tiers, str):
        relevant_tiers = {rel_tiers}
    else:
        relevant_tiers = set(rel_tiers)

    annotations = []
    for tier in root.findall('TIER'):
        tier_id = tier.attrib['TIER_ID']
        if tier_id not in relevant_tiers:
            continue
        for annotation in tier.findall('ANNOTATION/ALIGNABLE_ANNOTATION'):
            attrs = annotation.attrib
            # Ensure required attributes are present
            if 'TIME_SLOT_REF1' not in attrs or 'TIME_SLOT_REF2' not in attrs:
                continue
            # An empty <ANNOTATION_VALUE/> has text None; treat it as ''
            value_element = annotation.find('ANNOTATION_VALUE')
            value_text = value_element.text if value_element is not None else None
            annotations.append({
                'tier_id': tier_id,
                'annotation_id': attrs.get('ANNOTATION_ID', None),
                'start_time': time_slots[attrs['TIME_SLOT_REF1']],
                'end_time': time_slots[attrs['TIME_SLOT_REF2']],
                'annotation_value': (value_text or '').strip()
            })

    return annotations

## function to load annotations into csv
def fillAnno(TSfile, ANNOfile, colname):
    """Write annotation labels into a new column of a time series, in place.

    Parameters:
        TSfile: DataFrame with a 'time' column; it is modified in place.
        ANNOfile: DataFrame whose columns 0, 1 and 2 hold the start, the end
            and the label of each annotation.
            (presumably start/end use the same unit as TSfile['time'];
            verify against the caller)
        colname: name of the column that is created (or reset) in TSfile.

    Returns:
        None.
    """
    # start from an empty column; rows outside every annotation keep None
    TSfile[colname] = None

    for _, anno in ANNOfile.iterrows():
        onset, offset, label = anno[0], anno[1], anno[2]
        # both boundaries are inclusive, so a later annotation overwrites a
        # timestamp it shares with an earlier one
        inside = TSfile['time'].between(onset, offset)
        TSfile.loc[inside, colname] = label



In [39]:
#####################################################
#### export every annotated tier to a text file #####
#####################################################

arms_anno = curfolder + '/annotations_groundTruth/arms_annotations.txt'
upperbody_anno = curfolder + '/annotations_groundTruth/upperbody_annotations.txt'
lowerbody_anno = curfolder + '/annotations_groundTruth/lowerbody_annotations.txt'
head_anno = curfolder + '/annotations_groundTruth/head_annotations.txt'

# (ELAN tier id, output text file), processed in this order
tier_outputs = [
    ('arms', arms_anno),
    ('upper_body', upperbody_anno),
    ('lower_body', lowerbody_anno),
    ('head_mov', head_anno),
]

# Each output file gets one tab-separated line per annotation:
# start time, end time, annotation value, trial id
for tier_name, anno_path in tier_outputs:
    with open(anno_path, 'w', encoding='utf-8') as f:
        for file in annofiles_manu:
            print('working on ' + file)
            # trial id = filename without the _ELAN_tiers.eaf suffix
            # (os.path.basename copes with both path separators)
            filename = os.path.basename(file).replace('_ELAN_tiers.eaf', '')
            # parse the file
            annotations = parse_eaf_file(file, tier_name)
            # write the annotations
            for annotation in annotations:
                f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")


working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\ManualAnno/R1\0_1_11_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\ManualAnno/R1\0_1_12_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\ManualAnno/R1\0_1_13_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\ManualAnno/R1\0_1_14_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\ManualAnno/R1\0_1_15_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\ManualAnno/R1\0_1_16_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\ManualAnno/R1\0_1_17_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\ManualAnno/R1\0_1_20_p0_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\ManualAnno/R1\0_1_21_p0_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyE

## Preparing data for classifier

In the following code, we merge the annotations with our merged files (created in XXX) so that we can later sample from the data based on the annotations.

We will now also filter some superfluous information, as well as add some more, such as:

    - distance of LIndex to RIndex
    - distance of Wrist to Hip
    - distance of Head to Hip
    - distance of Head to Ankle

In [40]:
arms_anno = curfolder + '/annotations_groundTruth/arms_annotations.txt'
upperbody_anno = curfolder + '/annotations_groundTruth/upperbody_annotations.txt'
lowerbody_anno = curfolder + '/annotations_groundTruth/lowerbody_annotations.txt'
head_anno = curfolder + '/annotations_groundTruth/head_annotations.txt'

### txt annotations: the same for every trial, so they are loaded only once
arms = pd.read_csv(arms_anno, sep='\t', header=None)
ub = pd.read_csv(upperbody_anno, sep='\t', header=None)
lb = pd.read_csv(lowerbody_anno, sep='\t', header=None)
head = pd.read_csv(head_anno, sep='\t', header=None)

# Target column -> annotation table. Keying by column name avoids identifying
# a table by comparing its content, which picks the wrong column as soon as
# two tiers happen to hold identical annotations.
tier_annotations = {
    'arms': arms,
    'upper_body': ub,
    'lower_body': lb,
    'head_mov': head,
}

# (new column prefix, keypoint, keypoint subtracted from it); each entry
# yields signed per-axis differences <prefix>_x, <prefix>_y and <prefix>_z
distance_features = [
    ('wristDistance', 'RWrist', 'LWrist'),
    ('RwristRhipDistance', 'RWrist', 'RHip'),
    ('RwristLhipDistance', 'RWrist', 'LHip'),
    ('LwristLhipDistance', 'LWrist', 'LHip'),
    ('LwristRhipDistance', 'LWrist', 'RHip'),
    ('HeadRhipDistance', 'Head', 'RHip'),
    ('HeadRankleDistance', 'Head', 'RAnkle'),
]

# a column is dropped when its name contains any of these substrings
# (all vocal features)
colstodrop = ['envelope', 'loudness', 'roughness', 'flux', 'novelty', 'harmEnergy', 'audio', 'envelope_change', 'audio', 'f0', 'f1', 'f2', 'f3', 'env_']

# make sure the output folder exists before the first file is written
os.makedirs(curfolder + '/TS_annotated/', exist_ok=True)

for file in mergedfiles:
    print('working on ' + file)

    # get trialid: filename without extension and without the merged_ prefix
    trialid = os.path.basename(file).split('.')[0]
    trialid = trialid.replace('merged_', '')

    # load the merged file
    merged = pd.read_csv(file)

    for tier_column, anno_df in tier_annotations.items():
        # get the annotations for the trialid (column 3 holds the trial id)
        anno_trial = anno_df[anno_df[3] == trialid]

        if anno_trial.empty:
            # this will be the case of practice trials that were not annotated;
            # the file is still written, just without this annotation column
            print('no annotations for ' + trialid)
            continue

        fillAnno(merged, anno_trial, tier_column)

    df = merged.copy()

    for feature, keypoint_a, keypoint_b in distance_features:
        for axis in ('x', 'y', 'z'):
            df[f'{feature}_{axis}'] = df[f'{keypoint_a}_{axis}'] - df[f'{keypoint_b}_{axis}']

    # now let's get rid of columns left_back, right_forward, right_back, left_forward, COPXc, COPYc, FileInfo
    df = df.drop(columns=['left_back', 'right_forward', 'right_back', 'left_forward', 'COPXc', 'COPYc', 'FileInfo'])

    # and also all vocal features
    cols = df.columns
    newcols = [col for col in cols if not any(x in col for x in colstodrop)]
    df = df[newcols]

    # write to csv
    df.to_csv(curfolder + '/TS_annotated/merged_anno_' + trialid + '.csv', index=False)




working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\..\03_TS_processing\TS_merged\merged_0_1_44_p0.csv
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\..\03_TS_processing\TS_merged\merged_0_1_0_p0.csv
no annotations for 0_1_0_p0
no annotations for 0_1_0_p0
no annotations for 0_1_0_p0
no annotations for 0_1_0_p0
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\..\03_TS_processing\TS_merged\merged_0_1_10_p1.csv
no annotations for 0_1_10_p1
no annotations for 0_1_10_p1
no annotations for 0_1_10_p1
no annotations for 0_1_10_p1
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\..\03_TS_processing\TS_merged\merged_0_1_11_p1.csv
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\..\03_TS_processing\TS_merged\merged_0_1_12_p1.csv
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation\..\03_TS_processing\TS_merged\merged_0_1_13_p1.csv
working on e:\FLESH_ContinuousBodilyEffort\04_TS_movementAn

Now we are ready to create the training set for the logistic regression model. 

Note that we will not create training data for each tier separately. Sometimes it may be useful to predict the movement of a specific body part using information about other body parts. We will test the robustness of the classifier with different combinations of features in the next step.


To make sure that our samples for training make sense in terms of length, let's first check what the shortest movement and nomovement windows are in each tier.

In [41]:
# Number of columns in df, i.e. in the last annotated file processed above
df.shape[1]

586

In [42]:
def find_shortest_chunks(df, column):
    """Return the length of the shortest movement and nomovement chunk.

    A chunk is a run of consecutive rows that share the same value in
    ``column``. The input DataFrame is left untouched.

    Parameters:
        df: DataFrame holding the annotation column.
        column: name of the column with 'movement' / 'nomovement' labels.

    Returns:
        Tuple ``(shortest_movement, shortest_no_movement)`` with the number of
        rows of the shortest chunk of each label; an entry is NaN when the
        column contains no chunk with that label.
    """
    labels = df[column]

    # The running count of value changes gives every chunk its own id
    chunk_id = labels.ne(labels.shift()).cumsum()

    # Length and label of every chunk, both indexed by the chunk id
    chunks = labels.groupby(chunk_id)
    chunk_lengths = chunks.size()
    chunk_labels = chunks.first()

    # Shortest chunk per label
    shortest_movement = chunk_lengths[chunk_labels == 'movement'].min()
    shortest_no_movement = chunk_lengths[chunk_labels == 'nomovement'].min()

    return shortest_movement, shortest_no_movement

In [43]:
samplingfolder = os.path.join(curfolder + '/TS_annotated/')
samplingfiles = glob.glob(samplingfolder + '*.csv')

# shortest chunk per file, collected separately for each tier and label
tier_columns = ['arms', 'upper_body', 'lower_body', 'head_mov']
shortest_movement = {tier: [] for tier in tier_columns}
shortest_nomovement = {tier: [] for tier in tier_columns}

for file in samplingfiles:
    df = pd.read_csv(file)

    # files lacking any of the four annotation columns are left out
    if not all(tier in df.columns for tier in tier_columns):
        print('skipping ' + file)
        continue

    for tier in tier_columns:
        chunk_m, chunk_nm = find_shortest_chunks(df, tier)
        shortest_movement[tier].append(chunk_m)
        shortest_nomovement[tier].append(chunk_nm)

# per-tier result lists without the nan entries (label absent from a file)
arms_shortest_m = [x for x in shortest_movement['arms'] if str(x) != 'nan']
arms_shortest_nm = [x for x in shortest_nomovement['arms'] if str(x) != 'nan']
ub_shortest_m = [x for x in shortest_movement['upper_body'] if str(x) != 'nan']
ub_shortest_nm = [x for x in shortest_nomovement['upper_body'] if str(x) != 'nan']
lb_shortest_m = [x for x in shortest_movement['lower_body'] if str(x) != 'nan']
lb_shortest_nm = [x for x in shortest_nomovement['lower_body'] if str(x) != 'nan']
head_shortest_m = [x for x in shortest_movement['head_mov'] if str(x) != 'nan']
head_shortest_nm = [x for x in shortest_nomovement['head_mov'] if str(x) != 'nan']

# shortest chunks found per tier (values noted from an earlier run):
# ub_m_threshold = min(ub_shortest_m)     # 359
# ub_nm_threshold = min(ub_shortest_nm)   # 84
# lb_m_threshold = min(lb_shortest_m)     # 209
# lb_nm_threshold = min(lb_shortest_nm)   # 8
# head_m_threshold = min(head_shortest_m) # 54
# head_nm_threshold = min(head_shortest_nm) # 20
# arms_m_threshold = min(arms_shortest_m) # 578
# arms_nm_threshold = min(arms_shortest_nm) # 7

# for now every threshold is 25 rows, which gives a 50 ms window
# (500 Hz sampling rate, so the duration in ms is twice the row count)
arms_m_threshold = arms_nm_threshold = 25
ub_m_threshold = ub_nm_threshold = 25
lb_m_threshold = lb_nm_threshold = 25
head_m_threshold = head_nm_threshold = 25

print(f'arms movement threshold: {arms_m_threshold}')
print(f'arms no movement threshold: {arms_nm_threshold}')
print(f'upper body movement threshold: {ub_m_threshold}')
print(f'upper body no movement threshold: {ub_nm_threshold}')
print(f'lower body movement threshold: {lb_m_threshold}')
print(f'lower body no movement threshold: {lb_nm_threshold}')
print(f'head movement threshold: {head_m_threshold}')
print(f'head no movement threshold: {head_nm_threshold}')
    

    

skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_10_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_18_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_9_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_0_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_19_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_27_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_28_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_36_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_37_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAn

## Summarizing features for training dataset

Now we will sample windows from movement and nomovement for each tier and summarize the available features in terms of mean, sd, min and max.

We will sample randomly, but also make sure there are enough border cases. Our participants are 'locking' their hands at the beginning and at the end of each performance. Our classifier should know that these are not 'communicative' movements per se.

In [44]:
def select_random_consecutive_rows(df, change_col, threshold):
    """Draw one random window of consecutive rows from every large enough group.

    The rows of ``df`` are grouped by ``change_col``. From each group with at
    least ``threshold`` rows, ``threshold`` consecutive rows are taken starting
    at a random position (drawn with ``np.random.randint``); smaller groups
    are ignored.

    Parameters:
        df: DataFrame to sample from.
        change_col: name of the column identifying the groups (chunks).
        threshold: number of consecutive rows taken per group.

    Returns:
        DataFrame with the selected windows concatenated in group order.

    Raises:
        ValueError: if no group has at least ``threshold`` rows.
    """
    selected_rows = []

    for _, group_df in df.groupby(change_col):
        if len(group_df) < threshold:
            continue
        # Last admissible start keeps the whole window inside the group
        start_idx = np.random.randint(0, len(group_df) - threshold + 1)
        selected_rows.append(group_df.iloc[start_idx:start_idx + threshold])

    if not selected_rows:
        # pd.concat([]) would fail with a message that does not name the cause
        raise ValueError(
            f"no group of '{change_col}' has at least {threshold} rows to sample from"
        )

    return pd.concat(selected_rows)

#Transforming the dictionary into a DataFrame
def dict_to_df(data):
    """Turn a nested {feature: {stat: value}} dict into a one-row DataFrame.

    Every (feature, stat) pair becomes a column named '<feature>_<stat>',
    e.g. 'COPc_mean'; the single row has index 0.
    """
    flattened = {
        f'{feature}_{stat}': value
        for feature, stats in data.items()
        for stat, value in stats.items()
    }
    return pd.DataFrame(flattened, index=[0])


In [45]:
tiers = ['arms', 'upper_body', 'lower_body', 'head_mov']
threshold_m = 50 # for now, uniform for all
threshold_nm = 50 # for now, uniform for all
n_random_samples = 10 # random samples drawn per trial and label
border_window = 25 # rows taken on each side of a label change


def _summarize_window(window, num_cols, trialid, eventid, anno_value):
    """Return a one-row DataFrame with mean, std, min and max of each column.

    ``window`` is the sampled slice, ``num_cols`` the columns to summarise;
    ``trialid``, ``eventid`` and ``anno_value`` are stored as extra columns.
    """
    summaries = {col: window[col].describe().to_dict() for col in num_cols}
    row = dict_to_df(summaries)
    # describe() also reports count and percentiles; those columns are dropped
    row = row.loc[:, ~row.columns.str.contains('count|%', regex=True)].copy()
    row['trialid'] = trialid
    row['eventid'] = eventid
    row['anno_value'] = anno_value
    return row


# One feature dataset per tier. The label comes from the tier of interest,
# the features are summaries of all numeric columns of the annotated files.
for tierofinterest in tiers:
    # rows are collected in a list and concatenated once at the end
    feature_rows = []

    for file in samplingfiles:
        df = pd.read_csv(file)

        # if the df doesn't have columns arms, upper_body, lower_body, head_mov, skip it
        if any(tier not in df.columns for tier in tiers):
            print('skipping ' + file)
            continue

        # get the trialid (filename without extension)
        trialid = os.path.basename(file).split('.')[0]

        # annotate unique movement/no movement chunks
        df['row_change'] = df[tierofinterest].ne(df[tierofinterest].shift()).cumsum()

        # all numerical columns except time and the chunk id are summarised
        num_cols = df.select_dtypes(include=np.number).columns
        num_cols = [x for x in num_cols if x not in ['time', 'row_change']]

        ###################### Random windows inside movement / no movement
        sampling_plan = (
            ('movement', '_mov_', threshold_m),
            ('nomovement', '_nonmov_', threshold_nm),
        )
        for label, tag, threshold in sampling_plan:
            tier_rows = df[df[tierofinterest] == label]
            if tier_rows.empty:
                continue

            # a sample needs at least one chunk of threshold length
            if tier_rows.groupby('row_change').size().max() < threshold:
                print('no ' + label + ' chunk of ' + str(threshold) + ' rows in ' + file)
                continue

            # each sample holds one random window from every chunk that is
            # long enough, summarised together into a single row
            for counter in range(1, n_random_samples + 1):
                sample = select_random_consecutive_rows(tier_rows, 'row_change', threshold)
                feature_rows.append(
                    _summarize_window(sample, num_cols, trialid, trialid + tag + str(counter), label)
                )

        ###################### Process border windows
        counter = 1

        # Identify the rows where the tierofinterest changes
        change_points = df[df['row_change'].diff().abs() > 0].index

        for idx in change_points:
            # window before the change (clipped at the start of the file) and
            # window starting at the change (clipped at the end of the file)
            before_window = df.iloc[max(0, idx - border_window):idx]
            after_window = df.iloc[idx:min(len(df), idx + border_window)]

            # Each window is labelled by its own row next to the boundary: the
            # last row of the 'before' window and the first row of the 'after'
            # window. Labelling both with the value at the change point would
            # give the rows preceding an onset the label that only starts
            # after them.
            for window, boundary_row in ((before_window, -1), (after_window, 0)):
                if window.empty:
                    continue
                anno_value = window[tierofinterest].iloc[boundary_row]
                if pd.isna(anno_value):
                    # an unlabelled window would put a NaN into anno_value and
                    # the dropna below would then remove that whole column
                    continue
                border_tag = 'border_mov' if anno_value == 'movement' else 'border_nonmov'
                feature_rows.append(
                    _summarize_window(
                        window, num_cols, trialid, f"{trialid}_{border_tag}_{counter}", anno_value
                    )
                )
                counter += 1

    dataset_features = pd.concat(feature_rows) if feature_rows else pd.DataFrame()
    # drop all columns that contain NAs
    dataset_features = dataset_features.dropna(axis=1)
    # save the dataset_features to csv
    filename = '\\dataset_' + tierofinterest + '_features.csv'
    dataset_features.to_csv(datasetfolder + filename, index=False)
        
        

skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_10_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_18_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_9_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_0_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_19_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_27_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_28_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_36_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_37_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAn