In [1]:
import os
import glob
import pandas as pd
import numpy as np

curfolder = os.getcwd()
# where do we store our processed timeseries data
processedfolder = os.path.join(curfolder, '../01_TS_processing/TS_merged/')
print(processedfolder)
processedfiles = glob.glob(processedfolder + '*.csv')
# get rid of all files that have 'anno' in their name
processedfiles = [x for x in processedfiles if 'anno' not in x]
print(processedfiles)


annofolder_auto = os.path.join(curfolder, '../02_0_TS_movementAnnotation/MT_annotated/')
annofolder_manu = os.path.join(curfolder, '../02_0_TS_movementAnnotation/ManualAnno/R1/')
annofiles_auto = glob.glob(annofolder_auto + '*ELAN_tiers.csv')
print(annofiles_auto)
annofiles_manu = glob.glob(annofolder_manu + '*ELAN_tiers.eaf')
print(annofiles_manu)

mergedfiles = glob.glob(processedfolder + '/merged*.csv')
mergedfiles = [x for x in mergedfiles if 'anno' not in x]


e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification\../01_TS_processing/TS_merged/
['e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_1_10_p1.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_1_18_p0.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_1_2_p0.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_1_8_p0.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_1_9_p1.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_2_111_p1.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_2_112_p1.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassifi

# Getting ground truth from ELAN

## Functions

In [2]:
## function to parse elan file

import xml.etree.ElementTree as ET

def parse_eaf_file(eaf_file, rel_tiers):
    tree = ET.parse(eaf_file)
    root = tree.getroot()

    time_order = root.find('TIME_ORDER')
    time_slots = {time_slot.attrib['TIME_SLOT_ID']: time_slot.attrib['TIME_VALUE'] for time_slot in time_order}

    annotations = []
    relevant_tiers = {rel_tiers}
    for tier in root.findall('TIER'):
        tier_id = tier.attrib['TIER_ID']
        if tier_id in relevant_tiers:
            for annotation in tier.findall('ANNOTATION/ALIGNABLE_ANNOTATION'):
                # Ensure required attributes are present
                if 'TIME_SLOT_REF1' in annotation.attrib and 'TIME_SLOT_REF2' in annotation.attrib:
                    ts_ref1 = annotation.attrib['TIME_SLOT_REF1']
                    ts_ref2 = annotation.attrib['TIME_SLOT_REF2']
                    # Get annotation ID if it exists, otherwise set to None
                    ann_id = annotation.attrib.get('ANNOTATION_ID', None)
                    annotation_value = annotation.find('ANNOTATION_VALUE').text.strip()
                    annotations.append({
                        'tier_id': tier_id,
                        'annotation_id': ann_id,
                        'start_time': time_slots[ts_ref1],
                        'end_time': time_slots[ts_ref2],
                        'annotation_value': annotation_value
                    })

    return annotations

## function to load annotations into csv

def fillAnno(TSfile, ANNOfile, colname):
    TSfile[colname] = 0
    for index, row in ANNOfile.iterrows():
        start = row[0]
        end = row[1]
        TSfile.loc[(TSfile['time'] >= start) & (TSfile['time'] <= end), colname] = row[2]



## Get all annotations from ELAN, separately for tiers

In [3]:
################
#### arms ######
################

arms_anno = curfolder + '/annotations_groundTruth/arms_annotations.txt'

with open(arms_anno, 'w') as f:
    for file in annofiles_manu:
        print('working on ' + file)
        # get the filename as the last element
        filename = file.split('\\')[-1]
        # replace _ELAN_tiers.eaf with ''
        filename = filename.replace('_ELAN_tiers.eaf', '')
        # parse the file
        annotations = parse_eaf_file(file, 'arms')
        # write the annotations
        for annotation in annotations:
            f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")

###################
#### upper body####
###################

upperbody_anno = curfolder + '/annotations_groundTruth/upperbody_annotations.txt'

with open(upperbody_anno, 'w') as f:
    for file in annofiles_manu:
        print('working on ' + file)
        # get the filename as the last element
        filename = file.split('\\')[-1]
        # replace _ELAN_tiers.eaf with ''
        filename = filename.replace('_ELAN_tiers.eaf', '')
        # parse the file
        annotations = parse_eaf_file(file, 'upper_body')
        # write the annotations
        for annotation in annotations:
            f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")

###################
#### lower body####
###################

lowerbody_anno = curfolder + '/annotations_groundTruth/lowerbody_annotations.txt'

with open(lowerbody_anno, 'w') as f:
    for file in annofiles_manu:
        print('working on ' + file)
        # get the filename as the last element
        filename = file.split('\\')[-1]
        # replace _ELAN_tiers.eaf with ''
        filename = filename.replace('_ELAN_tiers.eaf', '')
        # parse the file
        annotations = parse_eaf_file(file, 'lower_body')
        # write the annotations
        for annotation in annotations:
            f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")

###################
##### head ########
###################

head_anno = curfolder + '/annotations_groundTruth/head_annotations.txt'

with open(head_anno, 'w') as f:
    for file in annofiles_manu:
        print('working on ' + file)
        # get the filename as the last element
        filename = file.split('\\')[-1]
        # replace _ELAN_tiers.eaf with ''
        filename = filename.replace('_ELAN_tiers.eaf', '')
        # parse the file
        annotations = parse_eaf_file(file, 'head_mov')
        # write the annotations
        for annotation in annotations:
            f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")


working on e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification\../02_0_TS_movementAnnotation/ManualAnno/R1\0_1_11_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification\../02_0_TS_movementAnnotation/ManualAnno/R1\0_1_12_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification\../02_0_TS_movementAnnotation/ManualAnno/R1\0_1_13_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification\../02_0_TS_movementAnnotation/ManualAnno/R1\0_1_14_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification\../02_0_TS_movementAnnotation/ManualAnno/R1\0_1_15_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification\../02_0_TS_movementAnnotation/ManualAnno/R1\0_1_16_p1_ELAN_tiers.eaf
working on e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification\../02_0_TS_movementAnnotation/ManualAnno/R1\0_1_17_p1_ELAN_tiers.eaf

# Merge manual annotations with merged TS

In [None]:
arms_anno = curfolder + '/annotations_groundTruth/arms_annotations.txt'
upperbody_anno = curfolder + '/annotations_groundTruth/upperbody_annotations.txt'
lowerbody_anno = curfolder + '/annotations_groundTruth/lowerbody_annotations.txt'
head_anno = curfolder + '/annotations_groundTruth/head_annotations.txt'


for file in mergedfiles:
    print('working on ' + file)

    # get trialid
    trialid = file.split('\\')[-1].split('.')[0]
    # replace merged_ with ''
    trialid = trialid.replace('merged_', '')

    # load the merged file
    merged = pd.read_csv(file)
    
    ### txt annotations
    arms = pd.read_csv(arms_anno, sep='\t', header=None)
    ub = pd.read_csv(upperbody_anno, sep='\t', header=None)
    lb = pd.read_csv(lowerbody_anno, sep='\t', header=None)
    head = pd.read_csv(head_anno, sep='\t', header=None)

    annos = [arms, ub, lb, head]

    for anno_df in annos:
        # get the annotations for the trialid
        anno_trial = anno_df[anno_df[3] == trialid] 
        
        if anno_trial.empty:
            print('no annotations for ' + trialid)
            # skip this file and go to the next one
            continue
        
        else:
            if anno_df.equals(arms):
                fillAnno(merged, anno_trial, 'arms')
            elif anno_df.equals(ub):
                fillAnno(merged, anno_trial, 'upper_body')
            elif anno_df.equals(lb):
                fillAnno(merged, anno_trial, 'lower_body')
            elif anno_df.equals(head):
                fillAnno(merged, anno_trial, 'head_mov')
            else:
                print('something went wrong')
                
    # write to csv 
    merged.to_csv(curfolder + '/TS_annotated/merged_anno_' + trialid + '.csv', index=False)


Now we are ready with merged files that have manual annotations included

What we need to do now is

- differentiate what columns belong to upper_body, lower_body, head, arms

Note: let's not do this, we will use the fact that we have so many (related) information, so maybe sometimes information about leg position can help the clasifier to pick up something about movement of the arms

But we will filter some superfluous information, as well as add some more, such as:

    - distance of LIndex to RIndex
    - distance of Wrist to Hip
    - distance of Head to Hip
    - distance of Head to Ankle

- check what is the smallest window of movement for each tiertype
- from each ID sample number of windows of movement and nomovement and collect summary features for it
- write it into df

# Preparing training dataset for classifier

In [13]:
df

Unnamed: 0,time,COPc,TrialID,pelvis_tilt_moment,pelvis_list_moment,pelvis_rotation_moment,pelvis_tx_force,pelvis_ty_force,pelvis_tz_force,hip_flexion_r_moment,...,LwristLhipDistance_z,LwristRhipDistance_x,LwristRhipDistance_y,LwristRhipDistance_z,HeadRhipDistance_x,HeadRhipDistance_y,HeadRhipDistance_z,HeadRankleDistance_x,HeadRankleDistance_y,HeadRankleDistance_z
0,0.0,0.000635,0_1_12_p1,1.232381,14.043430,0.169657,-11.649036,589.233743,-16.732436,-32.555904,...,5.605551,15.795462,-10.977222,5.699641,4.133536,-5.837644,78.499740,-3.264186,-18.862371,156.666344
1,2.0,0.000632,0_1_12_p1,1.331012,14.826076,0.212331,-10.313432,592.372203,-15.365361,-32.856213,...,5.607527,15.792480,-10.975537,5.705084,4.140546,-5.836454,78.502369,-3.256787,-18.861728,156.667036
2,4.0,0.000626,0_1_12_p1,1.429644,15.608721,0.255005,-8.977827,595.510663,-13.998286,-33.156522,...,5.609503,15.789498,-10.973851,5.710527,4.147556,-5.835265,78.504997,-3.249388,-18.861086,156.667729
3,6.0,0.000614,0_1_12_p1,1.528275,16.391367,0.297679,-7.642222,598.649123,-12.631211,-33.456831,...,5.611480,15.786516,-10.972165,5.715971,4.154566,-5.834075,78.507626,-3.241989,-18.860443,156.668421
4,8.0,0.000596,0_1_12_p1,1.626907,17.174013,0.340353,-6.306617,601.787583,-11.264136,-33.757140,...,5.613456,15.783534,-10.970479,5.721414,4.161577,-5.832885,78.510255,-3.234591,-18.859800,156.669113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1809,3618.0,0.000268,0_1_12_p1,-17.575320,-87.222788,-7.166366,80.705533,505.123435,-66.811931,-5.284973,...,-7.604024,31.206756,-9.101513,-6.734459,4.372422,-11.798296,71.079548,-2.820380,-24.382434,155.364175
1810,3620.0,0.000241,0_1_12_p1,-18.583172,-92.678737,-7.185806,79.024175,507.701598,-65.344818,-3.394597,...,-7.746186,31.398413,-9.036872,-6.874027,4.378207,-11.816154,71.005300,-2.813188,-24.398653,155.356116
1811,3622.0,0.000231,0_1_12_p1,-19.591025,-98.134687,-7.205246,77.342818,510.279761,-63.877705,-1.504221,...,-7.888348,31.590071,-8.972232,-7.013594,4.383993,-11.834013,70.931052,-2.805997,-24.414872,155.348057
1812,3624.0,0.000258,0_1_12_p1,-20.598878,-103.590636,-7.224687,75.661461,512.857925,-62.410591,0.386155,...,-8.030510,31.781728,-8.907591,-7.153162,4.389778,-11.851872,70.856804,-2.798805,-24.431090,155.339997


In [15]:
# list all the annotated files
files = glob.glob(curfolder + '/TS_annotated/*.csv')
cleanedfolder = os.path.join(curfolder + '/TS_forSampling/')

# let's do some cleaning and add some distances between the body parts and resave it

for file in files:

    df = pd.read_csv(file)

    # if the df doesn't have columns arms, upper_body, lower_body, head_mov, skip it
    if 'arms' not in df.columns or 'upper_body' not in df.columns or 'lower_body' not in df.columns or 'head_mov' not in df.columns:
        print('skipping ' + file)
        continue

    filename = file.split('\\')[-1]
    # substitute merged_anno with sampling_dataset
    filename = filename.replace('merged_anno_', 'sampling_dataset_')


    ## RWrist to LWrist in all dimensions
    df['wristDistance_x'] = df['RWrist_x'] - df['LWrist_x']
    df['wristDistance_y'] = df['RWrist_y'] - df['LWrist_y']
    df['wristDistance_z'] = df['RWrist_z'] - df['LWrist_z']

    ## RWrist to RHip
    df['RwristRhipDistance_x'] = df['RWrist_x'] - df['RHip_x']
    df['RwristRhipDistance_y'] = df['RWrist_y'] - df['RHip_y']
    df['RwristRhipDistance_z'] = df['RWrist_z'] - df['RHip_z']

    ## RWrist to LHip
    df['RwristLhipDistance_x'] = df['RWrist_x'] - df['LHip_x']
    df['RwristLhipDistance_y'] = df['RWrist_y'] - df['LHip_y']
    df['RwristLhipDistance_z'] = df['RWrist_z'] - df['LHip_z']

    ## LWrist to LHip
    df['LwristLhipDistance_x'] = df['LWrist_x'] - df['LHip_x']
    df['LwristLhipDistance_y'] = df['LWrist_y'] - df['LHip_y']
    df['LwristLhipDistance_z'] = df['LWrist_z'] - df['LHip_z']

    ## LWrist to RHip
    df['LwristRhipDistance_x'] = df['LWrist_x'] - df['RHip_x']
    df['LwristRhipDistance_y'] = df['LWrist_y'] - df['RHip_y']
    df['LwristRhipDistance_z'] = df['LWrist_z'] - df['RHip_z']

    ## Head to RHip
    df['HeadRhipDistance_x'] = df['Head_x'] - df['RHip_x']
    df['HeadRhipDistance_y'] = df['Head_y'] - df['RHip_y']
    df['HeadRhipDistance_z'] = df['Head_z'] - df['RHip_z']

    ## Head to RAnkle
    df['HeadRankleDistance_x'] = df['Head_x'] - df['RAnkle_x']
    df['HeadRankleDistance_y'] = df['Head_y'] - df['RAnkle_y']
    df['HeadRankleDistance_z'] = df['Head_z'] - df['RAnkle_z']


    # now let's get rid of columns left_back, right_forward, left_forward, COPXc, COPYc, FileInfo
    df = df.drop(columns=['left_back', 'right_forward', 'right_back', 'left_forward', 'COPXc', 'COPYc', 'FileInfo'])

    # and also all vocal features
    try:
        df = df.drop(columns=['envelope', 'loudness', 'roughness', 'flux', 'novelty', 'harmEnergy', 'audio', 'envelope_change'])
    except:
        # if there is KeyError, just pass because probably these columns are not there
        pass

    ## and all columns that contains f0_, f1_, f2_, f3_
    df = df[df.columns.drop(list(df.filter(regex='f0')))]
    df = df[df.columns.drop(list(df.filter(regex='f1')))]
    df = df[df.columns.drop(list(df.filter(regex='f2')))]
    df = df[df.columns.drop(list(df.filter(regex='f3')))]
    df = df[df.columns.drop(list(df.filter(regex='env_')))]

    # save the file
    df.to_csv(cleanedfolder + filename, index=False)



skipping e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification/TS_annotated\merged_anno_0_1_10_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification/TS_annotated\merged_anno_0_1_18_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification/TS_annotated\merged_anno_0_1_9_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification/TS_annotated\merged_anno_0_1_0_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification/TS_annotated\merged_anno_0_1_19_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification/TS_annotated\merged_anno_0_1_27_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification/TS_annotated\merged_anno_0_1_28_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification/TS_annotated\merged_anno_0_1_36_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification/TS_annotated\merged_anno_0_1_37_p0.csv
ski

In [16]:
df

Unnamed: 0,time,COPc,TrialID,pelvis_tilt_moment,pelvis_list_moment,pelvis_rotation_moment,pelvis_tx_force,pelvis_ty_force,pelvis_tz_force,hip_flexion_r_moment,...,LwristLhipDistance_z,LwristRhipDistance_x,LwristRhipDistance_y,LwristRhipDistance_z,HeadRhipDistance_x,HeadRhipDistance_y,HeadRhipDistance_z,HeadRankleDistance_x,HeadRankleDistance_y,HeadRankleDistance_z
0,0.0,0.000307,0_2_42_p0,8.417126,36.476243,6.348210,6.237800,621.412493,-1.585614,-45.755223,...,5.926073,19.120102,-14.147603,6.484106,3.406576,-11.751758,78.612128,-1.602418,-22.699367,162.627185
1,2.0,0.000309,0_2_42_p0,8.186337,37.250026,6.154978,4.660115,622.389909,-1.677097,-45.863157,...,5.927541,19.119946,-14.149237,6.486387,3.401745,-11.757641,78.613286,-1.604992,-22.708014,162.625413
2,4.0,0.000290,0_2_42_p0,7.955548,38.023808,5.961747,3.082431,623.367326,-1.768580,-45.971090,...,5.929010,19.119791,-14.150871,6.488668,3.396913,-11.763524,78.614444,-1.607567,-22.716662,162.623641
3,6.0,0.000255,0_2_42_p0,7.724760,38.797591,5.768515,1.504746,624.344742,-1.860062,-46.079024,...,5.930479,19.119635,-14.152505,6.490949,3.392082,-11.769408,78.615602,-1.610141,-22.725310,162.621869
4,8.0,0.000208,0_2_42_p0,7.493971,39.571374,5.575284,-0.072939,625.322158,-1.951545,-46.186957,...,5.931948,19.119480,-14.154139,6.493229,3.387250,-11.775291,78.616760,-1.612715,-22.733958,162.620098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3270,6540.0,0.004072,0_2_42_p0,7.405464,22.466978,7.508989,1.932900,614.043281,52.233366,-38.728811,...,6.842968,17.821453,-15.870489,7.256474,4.706797,-10.214627,79.110050,-4.149295,-19.173668,161.138548
3271,6542.0,0.003980,0_2_42_p0,7.413452,22.493583,7.439016,2.223025,614.746821,50.955431,-38.812838,...,6.844106,17.780876,-15.847088,7.255743,4.700601,-10.181945,79.135707,-4.155030,-19.152266,161.136566
3272,6544.0,0.003848,0_2_42_p0,7.421441,22.520189,7.369044,2.513150,615.450361,49.677495,-38.896864,...,6.845243,17.740300,-15.823688,7.255012,4.694404,-10.149264,79.161364,-4.160764,-19.130865,161.134584
3273,6546.0,0.003675,0_2_42_p0,7.429430,22.546794,7.299071,2.803276,616.153901,48.399560,-38.980891,...,6.846380,17.699724,-15.800287,7.254281,4.688207,-10.116582,79.187021,-4.166499,-19.109464,161.132602


## Checking for shortest window of movement/nomovement

In [3]:
cleanedfolder = os.path.join(curfolder + '/TS_forSampling/')
samplingfiles = glob.glob(cleanedfolder + '*.csv')

arms_shortest_m = []
arms_shortest_nm = []
ub_shortest_m = []
ub_shortest_nm = []
lb_shortest_m = []
lb_shortest_nm = []
head_shortest_m = []
head_shortest_nm = []

def find_shortest_chunks(df, column):
    # Identify changes in the 'arms' column
    df['change'] = df[column].ne(df[column].shift()).cumsum()

    # Group by the changes and calculate the length of each chunk
    chunk_lengths = df.groupby('change').size()

    # Separate movement and no movement chunks
    movement_chunks = chunk_lengths[df[column].groupby(df['change']).first() == 'movement']
    no_movement_chunks = chunk_lengths[df[column].groupby(df['change']).first() == 'nomovement']

    # Find the shortest chunk for each
    shortest_movement = movement_chunks.min()
    shortest_no_movement = no_movement_chunks.min()

    return shortest_movement, shortest_no_movement

for file in samplingfiles:
    df = pd.read_csv(file)
    short_m_arms, short_nm_arms = find_shortest_chunks(df, 'arms')
    short_m_ub, short_nm_ub = find_shortest_chunks(df, 'upper_body')
    short_m_lb, short_nm_lb = find_shortest_chunks(df, 'lower_body')
    short_m_head, short_nm_head = find_shortest_chunks(df, 'head_mov')

    # append to the lists
    arms_shortest_m.append(short_m_arms)
    arms_shortest_nm.append(short_nm_arms)
    ub_shortest_m.append(short_m_ub)
    ub_shortest_nm.append(short_nm_ub)
    lb_shortest_m.append(short_m_lb)
    lb_shortest_nm.append(short_nm_lb)
    head_shortest_m.append(short_m_head)
    head_shortest_nm.append(short_nm_head)

# get rid of nans in the lists
arms_shortest_m = [x for x in arms_shortest_m if str(x) != 'nan']
arms_shortest_nm = [x for x in arms_shortest_nm if str(x) != 'nan']
ub_shortest_m = [x for x in ub_shortest_m if str(x) != 'nan']
ub_shortest_nm = [x for x in ub_shortest_nm if str(x) != 'nan']
lb_shortest_m = [x for x in lb_shortest_m if str(x) != 'nan']
lb_shortest_nm = [x for x in lb_shortest_nm if str(x) != 'nan']
head_shortest_m = [x for x in head_shortest_m if str(x) != 'nan']
head_shortest_nm = [x for x in head_shortest_nm if str(x) != 'nan']
    

    

## Setting thresholds

In [13]:
# what is the shortest chunk for movement and no movement in each list
# ub_m_threshold = min(ub_shortest_m)     # 359
# ub_nm_threshold = min(ub_shortest_nm)   # 84
# lb_m_threshold = min(lb_shortest_m)     # 209
# lb_nm_threshold = min(lb_shortest_nm)   # 8
# head_m_threshold = min(head_shortest_m) # 54
# head_nm_threshold = min(head_shortest_nm) # 20

## putting all to 50 for now so that the window is 100ms
arms_m_threshold = 50
arms_nm_threshold = 50
ub_m_threshold = 50
ub_nm_threshold = 50
lb_m_threshold = 50
lb_nm_threshold = 50
head_m_threshold = 50
head_nm_threshold = 50

print(f'arms movement threshold: {arms_m_threshold}')
print(f'arms no movement threshold: {arms_nm_threshold}')
print(f'upper body movement threshold: {ub_m_threshold}')
print(f'upper body no movement threshold: {ub_nm_threshold}')
print(f'lower body movement threshold: {lb_m_threshold}')
print(f'lower body no movement threshold: {lb_nm_threshold}')
print(f'head movement threshold: {head_m_threshold}')
print(f'head no movement threshold: {head_nm_threshold}')
# and we have 500 Hz sampling rate so each times two

arms movement threshold: 50
arms no movement threshold: 50
upper body movement threshold: 50
upper body no movement threshold: 50
lower body movement threshold: 50
lower body no movement threshold: 50
head movement threshold: 50
head no movement threshold: 50


## Collecting summary features

In [22]:
def select_random_consecutive_rows(df, change_col, threshold):
    # Group the DataFrame by the 'change' column
    grouped = df.groupby(change_col)
    
    # List to hold the selected rows
    selected_rows = []

    # Loop over each group
    for group_key, group_df in grouped:
        # If the group is large enough to select 'threshold' rows
        if len(group_df) >= threshold:
            # Randomly choose a starting index for consecutive selection
            start_idx = np.random.randint(0, len(group_df) - threshold + 1)
            # Select consecutive rows from that start index
            selected = group_df.iloc[start_idx:start_idx + threshold]
            selected_rows.append(selected)
    
    # Concatenate all selected rows into a single DataFrame
    result_df = pd.concat(selected_rows)
    
    return result_df

#Transforming the dictionary into a DataFrame
def dict_to_df(data):
    # Flatten the dictionary into a format with keys like 'COPc_mean', 'pelvis_tilt_moment_std', etc.
    flat_data = {}
    for feature, stats in data.items():
        for stat, value in stats.items():
            flat_data[f'{feature}_{stat}'] = value

    # Convert the flat dictionary to a DataFrame with a single row
    df = pd.DataFrame(flat_data, index=[0])
    
    return df

# now take a sample, and sample 5 samples of the threshold length in both movement and nomovement and save it to new df

dataset_features = pd.DataFrame()

##########################################
## adjust based on what you want to use
tierofinterest = 'head_mov' 
threshold_m = head_m_threshold
threshold_nm = head_nm_threshold

###########################################

summaries_m = {}
summaries_nm = {}

counter = 1

for file in samplingfiles[2:]:
    df = pd.read_csv(file)
    # get the trialid
    trialid = file.split('\\')[-1].split('.')[0]

    # annotate unique movement/no movement chunks
    df['change'] = df[tierofinterest].ne(df[tierofinterest].shift()).cumsum()

    # sample random 5 samples of the threshold length in both movement and no movement in tier
    tier_m = df[df[tierofinterest] == 'movement']
    tier_nm = df[df[tierofinterest] == 'nomovement']

    # within movement chunks sample 5 samples of the threshold length (if not empty)
    if not tier_m.empty:
        # do it 3times
        for i in range(10):
            tier_m_sample = select_random_consecutive_rows(tier_m, 'change', threshold_m)
            # for each numerical column except time, get 
            ## mean, meadian, std, min, max
            # get all cols that are of type numerical
            num_cols = df.select_dtypes(include=np.number).columns
            # get rid of time and change
            num_cols = [x for x in num_cols if x not in ['time', 'change']]
            for col in num_cols:
                # get the statistics and save them to dictionary
                stats = tier_m_sample[col].describe().to_dict()
                # save to dict
                summaries_m[col] = stats

            # meake a row from the dictionary such that the first value for a key is mean, second median, third std, fourth min, fifth max
            summary_row_m = dict_to_df(summaries_m)
            # get rid of all columns that contain count in name
            summary_row_m = summary_row_m.loc[:, ~summary_row_m.columns.str.contains('count|%', regex=True)]

            # add trialid to the row
            summary_row_m['trialid'] = trialid
            # add event id which is trial id + counter
            summary_row_m['eventid'] = trialid + '_mov_' + str(counter)
            # add the annotation from the tier
            summary_row_m['anno_value'] = 'movement'

            # add the row to the dataset_features
            dataset_features = pd.concat([dataset_features, summary_row_m])
            counter += 1
        
    counter = 1

    if not tier_nm.empty:
        # do it 3times
        for i in range(10):
            tier_nm_sample = select_random_consecutive_rows(tier_nm, 'change', threshold_nm)
            # for each numerical column except time, get 
            ## mean, meadian, std, min, max
            # get all cols that are of type numerical
            num_cols = df.select_dtypes(include=np.number).columns
            # get rid of time and change
            num_cols = [x for x in num_cols if x not in ['time', 'change']]
            for col in num_cols:
                # get the statistics and save them to dictionary
                stats = tier_nm_sample[col].describe().to_dict()
                # save to dict
                summaries_nm[col] = stats

            # meake a row from the dictionary such that the first value for a key is mean, second median, third std, fourth min, fifth max
            # for each column, create new column that will have the old name + _Gmean, _Gstd, _peak_mean, _peak_std, _peak_n
            summary_row_nm = dict_to_df(summaries_nm)
            # get rid of all columns that contain count in name
            summary_row_nm = summary_row_nm.loc[:, ~summary_row_nm.columns.str.contains('count|%', regex=True)]

            # add trialid to the row
            summary_row_nm['trialid'] = trialid
            # add event id which is trial id + counter
            summary_row_nm['eventid'] = trialid + '_nonmov_' + str(counter)
            # add the annotation from tier
            summary_row_nm['anno_value'] = 'nomovement'


            # add the row to the dataset_features
            dataset_features = pd.concat([dataset_features, summary_row_nm])
            counter += 1

    

In [23]:
dataset_features

Unnamed: 0,COPc_mean,COPc_std,COPc_min,COPc_max,pelvis_tilt_moment_mean,pelvis_tilt_moment_std,pelvis_tilt_moment_min,pelvis_tilt_moment_max,pelvis_list_moment_mean,pelvis_list_moment_std,...,eventid,anno_value,audio_mean,audio_std,audio_min,audio_max,envelope_mean,envelope_std,envelope_min,envelope_max
0,0.001052,0.000257,0.000321,0.001465,12.045745,0.449693,11.711965,13.313466,18.897417,2.350771,...,sampling_dataset_0_2_111_p1_mov_1,movement,,,,,,,,
0,0.001051,0.000298,0.000282,0.001465,11.706528,0.151689,11.296582,11.906105,15.640763,3.576733,...,sampling_dataset_0_2_111_p1_mov_2,movement,,,,,,,,
0,0.002942,0.000507,0.002014,0.003473,4.100830,0.925605,2.607417,5.357835,-34.685448,1.878721,...,sampling_dataset_0_2_111_p1_mov_3,movement,,,,,,,,
0,0.001329,0.000259,0.000928,0.001668,9.613833,1.036393,6.927315,10.689407,38.467419,10.877880,...,sampling_dataset_0_2_111_p1_mov_4,movement,,,,,,,,
0,0.000917,0.000250,0.000321,0.001472,13.400268,1.387308,11.711965,15.815854,23.536020,3.683240,...,sampling_dataset_0_2_111_p1_mov_5,movement,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.003048,0.002148,0.000321,0.008717,4.539343,3.528172,-1.484371,10.546761,37.598383,10.136962,...,sampling_dataset_0_2_42_p0_nonmov_6,nomovement,-0.002854,0.018685,-0.078308,0.085754,0.347145,0.230619,0.093991,0.866973
0,0.004910,0.003098,0.001084,0.009583,10.229566,3.654883,5.946262,16.402915,42.493664,9.347206,...,sampling_dataset_0_2_42_p0_nonmov_7,nomovement,-0.002854,0.018685,-0.078308,0.085754,0.347145,0.230619,0.093991,0.866973
0,0.002591,0.002085,0.000321,0.008569,6.280573,3.976910,-1.484371,14.702082,54.094482,10.196054,...,sampling_dataset_0_2_42_p0_nonmov_8,nomovement,-0.002854,0.018685,-0.078308,0.085754,0.347145,0.230619,0.093991,0.866973
0,0.004643,0.003483,0.000828,0.009583,10.682936,3.478008,6.520597,16.402915,45.329684,8.996132,...,sampling_dataset_0_2_42_p0_nonmov_9,nomovement,-0.002854,0.018685,-0.078308,0.085754,0.347145,0.230619,0.093991,0.866973


In [24]:
# drop all columns that contain NAs
dataset_features = dataset_features.dropna(axis=1)

In [25]:
# save dataset
dataset_features.to_csv(curfolder + '/dataset_features_head.csv', index=False)