## Settings

In [1]:
import os
import glob
import pandas as pd
import numpy as np

curfolder = os.getcwd()

# Folder with the processed (merged) time series written by the previous step
processedfolder = os.path.join(curfolder, '../01_TS_processing/TS_merged/')
print(processedfolder)
# glob returns files in arbitrary order, so sort for a reproducible run.
# The 'anno' filter looks at the file name only: testing the full path would
# silently drop every file if any parent folder happened to contain 'anno'.
processedfiles = sorted(
    path for path in glob.glob(processedfolder + '*.csv')
    if 'anno' not in os.path.basename(path)
)
print(processedfiles)

# Automatic annotations (MT_annotated) and manual annotations (ManualAnno/R1)
annofolder_auto = os.path.join(curfolder, '../02_0_TS_movementAnnotation/MT_annotated/')
annofolder_manu = os.path.join(curfolder, '../02_0_TS_movementAnnotation/ManualAnno/R1/')
annofiles_auto = sorted(glob.glob(annofolder_auto + '*ELAN_tiers.csv'))
print(annofiles_auto)
annofiles_manu = sorted(glob.glob(annofolder_manu + '*ELAN_tiers.eaf'))
print(annofiles_manu)

# processedfolder already ends with a separator, so no extra '/' is needed
mergedfiles = sorted(
    path for path in glob.glob(processedfolder + 'merged*.csv')
    if 'anno' not in os.path.basename(path)
)

# Input CSVs for the chunking cells below. The trailing '' keeps a trailing
# separator so that `cleanedfolder + <file name>` still yields a valid path.
cleanedfolder = os.path.join(curfolder, 'TS_forSampling', '')
samplingfiles = sorted(glob.glob(cleanedfolder + '*.csv'))
print(samplingfiles)

e:\FLESH_ContinuousBodilyEffort\02_1_TS_movementClassification\../01_TS_processing/TS_merged/
['e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_1_10_p1.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_1_18_p0.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_1_2_p0.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_1_8_p0.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_1_9_p1.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_2_111_p1.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassification\\../01_TS_processing/TS_merged\\merged_0_2_112_p1.csv', 'e:\\FLESH_ContinuousBodilyEffort\\02_1_TS_movementClassifi

## Create non-overlapping chunks from TS

In [22]:
import pandas as pd
import numpy as np

# Output folder for the non-overlapping chunk summaries. The trailing ''
# keeps a trailing separator, so `chunked_folder + <file name>` stays valid.
chunked_folder = os.path.join(curfolder, 'TS_forClassifying', '')

def dict_to_df(data):
    """Flatten a nested ``{feature: {stat: value}}`` dict into a one-row DataFrame.

    Parameters
    ----------
    data : dict
        Maps each feature name to a dict of ``{statistic name: value}``.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``0``) with one column per feature/statistic pair,
        named ``'<feature>_<stat>'`` (e.g. ``'COPc_mean'``).
    """
    flattened = {
        f'{feature_name}_{stat_name}': stat_value
        for feature_name, feature_stats in data.items()
        for stat_name, stat_value in feature_stats.items()
    }
    return pd.DataFrame(flattened, index=[0])

def summarize_consecutive_rows(df, num_cols, summary_interval=50, trial_id=None):
    """Summarize a time series in consecutive, non-overlapping chunks of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Time series to chunk. Must contain a ``'time'`` column and every
        column listed in ``num_cols``.
    num_cols : list of str
        Numeric columns to summarize.
    summary_interval : int, default 50
        Number of rows per chunk. The last chunk holds fewer rows when
        ``len(df)`` is not a multiple of ``summary_interval``.
    trial_id : str, optional
        Prefix used for the ``eventid`` labels. When omitted, the
        notebook-level variable ``trialid`` is used, which keeps existing
        calls working.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with ``<col>_mean``, ``<col>_std``, ``<col>_min``
        and ``<col>_max`` for every column in ``num_cols``, followed by
        ``start_time``, ``end_time`` and ``eventid``
        (``'<trial id>_chunk_<n>'``, numbered from 1). Empty when ``df`` has
        no rows.

    Raises
    ------
    ValueError
        If ``summary_interval`` is smaller than 1.
    """
    if summary_interval < 1:
        raise ValueError("summary_interval must be a positive integer")

    # Fall back to the notebook global only when no explicit ID was given
    label = trialid if trial_id is None else trial_id

    records = []
    chunk_starts = range(0, len(df), summary_interval)
    for counter, start_idx in enumerate(chunk_starts, start=1):
        selected = df.iloc[start_idx:start_idx + summary_interval]

        # Compute exactly the four wanted statistics. This replaces running
        # describe() and then dropping columns whose *name* matches
        # 'count|%', which also discarded any feature whose own name
        # contained 'count' or '%'.
        record = {}
        for col in num_cols:
            series = selected[col]
            record[f'{col}_mean'] = series.mean()
            record[f'{col}_std'] = series.std()
            record[f'{col}_min'] = series.min()
            record[f'{col}_max'] = series.max()

        # Time span covered by the chunk and its label
        record['start_time'] = selected['time'].iloc[0]
        record['end_time'] = selected['time'].iloc[-1]
        record['eventid'] = f"{label}_chunk_{counter}"
        records.append(record)

    # Build the frame once instead of concatenating inside the loop
    return pd.DataFrame(records) if records else pd.DataFrame()

# Main DataFrame to store all summaries
# NOTE(review): not filled anywhere in this cell - confirm it is still needed
dataset_features = pd.DataFrame()

# Make sure the output folder exists before the first file is written
os.makedirs(chunked_folder, exist_ok=True)

for file in samplingfiles:
    df = pd.read_csv(file)
    # Trial ID = file name up to its first dot. os.path.basename handles the
    # platform's path separator; splitting on '\\' only worked on Windows.
    trialid = os.path.basename(file).split('.')[0]

    # Define numerical columns (excluding 'time' and 'change' if present)
    num_cols = [
        col for col in df.select_dtypes(include=np.number).columns
        if col not in ('change', 'time')
    ]

    # Summarize data in intervals of 50 rows
    # (the function labels each chunk using the `trialid` set above)
    summary_df = summarize_consecutive_rows(df, num_cols, summary_interval=50)

    # Add trial ID
    summary_df['trialid'] = trialid

    # Drop columns that are NA in every chunk
    summary_df = summary_df.dropna(axis=1, how='all')

    # Save it as a csv
    summary_df.to_csv(os.path.join(chunked_folder, trialid + '_chunked.csv'), index=False)
    


In [21]:
# Inspect the chunk summaries of the last file handled by the loop above
# (assumes that cell was the most recent one to assign `summary_df`)
summary_df

Unnamed: 0,COPc_mean,COPc_std,COPc_min,COPc_max,pelvis_tilt_moment_mean,pelvis_tilt_moment_std,pelvis_tilt_moment_min,pelvis_tilt_moment_max,pelvis_list_moment_mean,pelvis_list_moment_std,...,HeadRankleDistance_y_min,HeadRankleDistance_y_max,HeadRankleDistance_z_mean,HeadRankleDistance_z_std,HeadRankleDistance_z_min,HeadRankleDistance_z_max,start_time,end_time,eventid,trialid
0,0.000319,4.5e-05,0.000185,0.000374,2.823243,2.493456,-2.755942,5.258982,28.172777,0.926994,...,-18.52953,-18.180545,158.007727,0.061283,157.905098,158.110047,0.0,98.0,sampling_dataset_0_1_2_p0_chunk_1,sampling_dataset_0_1_2_p0
1,0.000118,6.6e-05,1.9e-05,0.000223,2.895204,1.907259,-0.835363,5.22293,33.469046,1.28691,...,-18.212594,-18.013578,157.812545,0.042836,157.770362,157.89775,100.0,198.0,sampling_dataset_0_1_2_p0_chunk_2,sampling_dataset_0_1_2_p0
2,0.000141,7.8e-05,2.8e-05,0.000334,-4.858278,1.926899,-7.020259,-1.161564,30.850581,1.331618,...,-18.570472,-18.231503,157.834298,0.036901,157.787253,157.909403,200.0,298.0,sampling_dataset_0_1_2_p0_chunk_3,sampling_dataset_0_1_2_p0
3,0.000219,7.4e-05,9.8e-05,0.000338,-3.138335,2.971652,-6.967406,2.251214,35.981722,3.784587,...,-18.885873,-18.57487,157.963666,0.029165,157.9129,158.022213,300.0,398.0,sampling_dataset_0_1_2_p0_chunk_4,sampling_dataset_0_1_2_p0
4,0.000197,0.000114,1.5e-05,0.000395,3.323793,1.06003,0.726824,4.456312,43.583405,0.624344,...,-18.900031,-18.815315,158.084456,0.026155,158.02839,158.11231,400.0,498.0,sampling_dataset_0_1_2_p0_chunk_5,sampling_dataset_0_1_2_p0
5,0.000336,0.000152,1.8e-05,0.000489,-2.819211,1.44456,-4.123448,0.376315,37.840889,2.647704,...,-19.051894,-18.814193,158.084474,0.022027,158.055122,158.133469,500.0,598.0,sampling_dataset_0_1_2_p0_chunk_6,sampling_dataset_0_1_2_p0
6,0.000229,0.000113,2.5e-05,0.00038,-2.946643,0.391114,-3.810794,-2.594273,32.774572,0.648342,...,-19.181893,-19.055471,158.208765,0.029211,158.137395,158.238322,600.0,698.0,sampling_dataset_0_1_2_p0_chunk_7,sampling_dataset_0_1_2_p0
7,0.000191,6.9e-05,4.5e-05,0.000303,-2.206657,0.562379,-2.772062,-1.074329,36.42647,0.856838,...,-19.667147,-19.18912,158.152917,0.023752,158.125374,158.210485,700.0,798.0,sampling_dataset_0_1_2_p0_chunk_8,sampling_dataset_0_1_2_p0
8,0.000208,0.000108,4.3e-05,0.000388,-1.305907,0.579197,-2.577998,-0.754541,36.714083,0.319982,...,-19.981477,-19.671114,158.033902,0.059628,157.93198,158.124281,800.0,898.0,sampling_dataset_0_1_2_p0_chunk_9,sampling_dataset_0_1_2_p0
9,0.00045,4.1e-05,0.000357,0.000505,-2.757097,0.528329,-3.281886,-1.435788,37.619769,0.715812,...,-20.023929,-19.854825,157.887326,0.015209,157.869993,157.928321,900.0,998.0,sampling_dataset_0_1_2_p0_chunk_10,sampling_dataset_0_1_2_p0


## Create chunks with sliding window

In [10]:
import pandas as pd
import numpy as np
import os

# Output folder for the sliding-window chunk summaries. The trailing ''
# keeps a trailing separator, so `chunked_folder + <file name>` stays valid.
chunked_folder = os.path.join(curfolder, 'TS_forClassifying_SW', '')

# Turn {feature: {stat: value}} into a one-row DataFrame whose columns are
# named '<feature>_<stat>'. Same behavior as the definition in the
# non-overlapping section; repeated here so this cell can run on its own.
def dict_to_df(data):
    columns = {}
    for feature in data:
        per_feature = data[feature]
        for stat in per_feature:
            columns[f'{feature}_{stat}'] = per_feature[stat]
    # index=[0] is required because every value is a scalar
    return pd.DataFrame(columns, index=[0])

def summarize_consecutive_rows(df, num_cols, summary_interval=50, slide_step=12, trial_id=None):
    """Summarize a time series in overlapping (sliding-window) chunks of rows.

    Note: this shares its name with the non-overlapping version defined
    earlier in the notebook; whichever cell ran last is the one in effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Time series to chunk. Must contain a ``'time'`` column and every
        column listed in ``num_cols``.
    num_cols : list of str
        Numeric columns to summarize.
    summary_interval : int, default 50
        Maximum number of rows per window.
    slide_step : int, default 12
        Number of rows between the starts of consecutive windows.
    trial_id : str, optional
        Prefix used for the ``eventid`` labels. When omitted, the
        notebook-level variable ``trialid`` is used, which keeps existing
        calls working.

    Returns
    -------
    pandas.DataFrame
        One row per window with ``<col>_mean``, ``<col>_std``, ``<col>_min``
        and ``<col>_max`` for every column in ``num_cols``, followed by
        ``start_time``, ``end_time`` and ``eventid``
        (``'<trial id>_chunk_<n>'``, numbered from 1). Empty when ``df`` has
        no rows.

    Raises
    ------
    ValueError
        If ``summary_interval`` or ``slide_step`` is smaller than 1.
    """
    if summary_interval < 1 or slide_step < 1:
        raise ValueError("summary_interval and slide_step must be positive integers")

    # Fall back to the notebook global only when no explicit ID was given
    label = trialid if trial_id is None else trial_id

    records = []
    # Every start index is < len(df), so each window holds at least one row.
    # Windows that start near the end are shorter than summary_interval.
    window_starts = range(0, len(df), slide_step)
    for counter, start_idx in enumerate(window_starts, start=1):
        selected = df.iloc[start_idx:start_idx + summary_interval]

        # Compute exactly the four wanted statistics. This replaces running
        # describe() and then dropping columns whose *name* matches
        # 'count|%', which also discarded any feature whose own name
        # contained 'count' or '%'.
        record = {}
        for col in num_cols:
            series = selected[col]
            record[f'{col}_mean'] = series.mean()
            record[f'{col}_std'] = series.std()
            record[f'{col}_min'] = series.min()
            record[f'{col}_max'] = series.max()

        # Time span covered by the window and its label
        record['start_time'] = selected['time'].iloc[0]
        record['end_time'] = selected['time'].iloc[-1]
        record['eventid'] = f"{label}_chunk_{counter}"
        records.append(record)

    # Build the frame once instead of concatenating inside the loop
    return pd.DataFrame(records) if records else pd.DataFrame()

# Main DataFrame to store all summaries
# NOTE(review): not filled anywhere in this cell - confirm it is still needed
dataset_features = pd.DataFrame()

# Make sure the output folder exists before the first file is written
os.makedirs(chunked_folder, exist_ok=True)

for file in samplingfiles:
    df = pd.read_csv(file)
    # Trial ID = file name up to its first dot. os.path.basename handles the
    # platform's path separator; splitting on '\\' only worked on Windows.
    trialid = os.path.basename(file).split('.')[0]

    # Define numerical columns (excluding 'time' and 'change' if present)
    num_cols = [
        col for col in df.select_dtypes(include=np.number).columns
        if col not in ('change', 'time')
    ]

    # Summarize data in windows of 50 rows, sliding by 12 rows
    # (the function labels each chunk using the `trialid` set above)
    summary_df = summarize_consecutive_rows(df, num_cols, summary_interval=50, slide_step=12)

    # Add trial ID to the DataFrame
    summary_df['trialid'] = trialid

    # Drop columns that are NA in every chunk
    summary_df = summary_df.dropna(axis=1, how='all')

    # Save as a CSV file
    summary_df.to_csv(os.path.join(chunked_folder, trialid + '_chunked_SW.csv'), index=False)



In [8]:
# Inspect the time series most recently read by the loop above
# (`df` is reassigned for every file, so this is the last one loaded)
df

Unnamed: 0,time,COPc,TrialID,pelvis_tilt_moment,pelvis_list_moment,pelvis_rotation_moment,pelvis_tx_force,pelvis_ty_force,pelvis_tz_force,hip_flexion_r_moment,...,LwristLhipDistance_z,LwristRhipDistance_x,LwristRhipDistance_y,LwristRhipDistance_z,HeadRhipDistance_x,HeadRhipDistance_y,HeadRhipDistance_z,HeadRankleDistance_x,HeadRankleDistance_y,HeadRankleDistance_z
0,0.0,0.000185,0_1_2_p0,-2.755942,28.756258,-3.855341,-9.592773,614.805034,-12.038132,-37.863982,...,8.691014,19.582542,-10.039997,8.621602,2.947248,-9.319774,77.742651,-3.442070,-18.529530,158.110047
1,2.0,0.000245,0_1_2_p0,-2.438810,28.622313,-3.443759,-10.480074,615.372662,-11.188778,-38.029734,...,8.685207,19.591826,-10.034142,8.608820,2.948084,-9.313385,77.730512,-3.442533,-18.522946,158.106180
2,4.0,0.000294,0_1_2_p0,-2.121678,28.488367,-3.032176,-11.367376,615.940291,-10.339424,-38.195487,...,8.679401,19.601109,-10.028287,8.596039,2.948921,-9.306996,77.718374,-3.442996,-18.516361,158.102313
3,6.0,0.000329,0_1_2_p0,-1.804546,28.354421,-2.620594,-12.254678,616.507919,-9.490071,-38.361239,...,8.673594,19.610393,-10.022432,8.583257,2.949758,-9.300607,77.706235,-3.443458,-18.509776,158.098446
4,8.0,0.000353,0_1_2_p0,-1.487414,28.220475,-2.209012,-13.141980,617.075548,-8.640717,-38.526991,...,8.667788,19.619676,-10.016577,8.570476,2.950595,-9.294218,77.694096,-3.443921,-18.503192,158.094579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1383,2766.0,0.000442,0_1_2_p0,2.397586,1.364340,5.163053,-1.249049,620.347915,-10.488675,-39.016497,...,-2.525428,31.622065,8.325428,-0.620805,3.045829,-10.888335,74.455483,-3.153954,-21.385561,156.861994
1384,2768.0,0.000558,0_1_2_p0,2.397586,1.364340,5.163053,-1.249049,620.347915,-10.488675,-39.016497,...,-2.525428,31.622065,8.325428,-0.620805,3.045829,-10.888335,74.455483,-3.153954,-21.385561,156.861994
1385,2770.0,0.000696,0_1_2_p0,2.397586,1.364340,5.163053,-1.249049,620.347915,-10.488675,-39.016497,...,-2.525428,31.622065,8.325428,-0.620805,3.045829,-10.888335,74.455483,-3.153954,-21.385561,156.861994
1386,2772.0,0.000857,0_1_2_p0,2.397586,1.364340,5.163053,-1.249049,620.347915,-10.488675,-39.016497,...,-2.525428,31.622065,8.325428,-0.620805,3.045829,-10.888335,74.455483,-3.153954,-21.385561,156.861994


In [None]:
# Debugging aid: path of the file most recently visited by the loop above
file

In [9]:
# Inspect the sliding-window summaries of the last file handled by the loop above
# (assumes the sliding-window cell was the most recent one to assign `summary_df`)
summary_df

Unnamed: 0,COPc_mean,COPc_std,COPc_min,COPc_max,pelvis_tilt_moment_mean,pelvis_tilt_moment_std,pelvis_tilt_moment_min,pelvis_tilt_moment_max,pelvis_list_moment_mean,pelvis_list_moment_std,...,HeadRankleDistance_y_min,HeadRankleDistance_y_max,HeadRankleDistance_z_mean,HeadRankleDistance_z_std,HeadRankleDistance_z_min,HeadRankleDistance_z_max,start_time,end_time,eventid,trialid
0,0.000319,0.000045,0.000185,0.000374,2.823243,2.493456,-2.755942,5.258982,28.172777,0.926994,...,-18.529530,-18.180545,158.007727,0.061283,157.905098,158.110047,0.0,98.0,sampling_dataset_0_1_2_p0_chunk_1,sampling_dataset_0_1_2_p0
1,0.000264,0.000103,0.000037,0.000373,4.255155,1.200254,1.061957,5.258982,29.001953,1.691130,...,-18.443930,-18.093430,157.957210,0.061152,157.853487,158.059777,24.0,122.0,sampling_dataset_0_1_2_p0_chunk_2,sampling_dataset_0_1_2_p0
2,0.000215,0.000131,0.000019,0.000373,4.682679,0.519964,3.510468,5.258982,30.458142,2.139742,...,-18.358330,-18.021870,157.906075,0.062859,157.798187,158.009506,48.0,146.0,sampling_dataset_0_1_2_p0_chunk_3,sampling_dataset_0_1_2_p0
3,0.000184,0.000111,0.000019,0.000373,4.251187,1.099776,1.650300,5.258982,32.083622,2.067181,...,-18.272729,-18.013578,157.856345,0.060272,157.770362,157.959235,72.0,170.0,sampling_dataset_0_1_2_p0_chunk_4,sampling_dataset_0_1_2_p0
4,0.000124,0.000070,0.000019,0.000257,3.135539,1.806628,-0.494651,5.258982,33.327615,1.426849,...,-18.192156,-18.013578,157.817484,0.046263,157.770362,157.908965,96.0,194.0,sampling_dataset_0_1_2_p0_chunk_5,sampling_dataset_0_1_2_p0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,0.000124,0.000039,0.000070,0.000263,1.558032,0.477531,1.114530,2.397586,12.902481,7.231593,...,-21.740338,-21.385561,157.071441,0.155520,156.861994,157.344763,2664.0,2762.0,sampling_dataset_0_1_2_p0_chunk_112,sampling_dataset_0_1_2_p0
112,0.000206,0.000205,0.000070,0.001044,1.776158,0.518153,1.120982,2.397586,9.157240,6.784042,...,-21.635724,-21.385561,156.986132,0.117044,156.861994,157.202408,2688.0,2774.0,sampling_dataset_0_1_2_p0_chunk_113,sampling_dataset_0_1_2_p0
113,0.000234,0.000235,0.000070,0.001044,2.003217,0.419953,1.284238,2.397586,5.986522,5.014515,...,-21.542482,-21.385561,156.927562,0.073851,156.861994,157.075526,2712.0,2774.0,sampling_dataset_0_1_2_p0_chunk_114,sampling_dataset_0_1_2_p0
114,0.000279,0.000291,0.000070,0.001044,2.297744,0.164262,1.898377,2.397586,2.481237,1.837547,...,-21.438626,-21.385561,156.876435,0.023760,156.861994,156.934203,2736.0,2774.0,sampling_dataset_0_1_2_p0_chunk_115,sampling_dataset_0_1_2_p0
