# Movement annotation III: Preparing data for classification

Now that we have trained a well-performing classifier, we will use it to predict movement / no movement on our time-series data. We will chunk the time series into 50 ms windows with a step of 25 ms. We use a step that is shorter than the window (i.e., overlapping windows) because we want the classifier to be fairly precise about when a movement starts and stops.

In [1]:
import os
import glob
import pandas as pd
import numpy as np

curfolder = os.getcwd()

# Input folder with the annotated time series. The trailing '' makes
# os.path.join end the path with a separator, so file names and glob
# patterns can be appended directly.
cleanedfolder = os.path.join(curfolder, 'TS_annotated', '')
samplingfiles = glob.glob(cleanedfolder + '*.csv')
print(samplingfiles)

# Output folder where the chunked data will be stored. It is created up
# front so that writing the CSV files later cannot fail on a missing directory.
chunked_folder = os.path.join(curfolder, 'TS_forClassifying', '')
os.makedirs(chunked_folder, exist_ok=True)


['e:\\FLESH_ContinuousBodilyEffort\\04_TS_movementAnnotation/TS_annotated\\merged_anno_0_1_10_p1.csv', 'e:\\FLESH_ContinuousBodilyEffort\\04_TS_movementAnnotation/TS_annotated\\merged_anno_0_1_18_p0.csv', 'e:\\FLESH_ContinuousBodilyEffort\\04_TS_movementAnnotation/TS_annotated\\merged_anno_0_1_2_p0.csv', 'e:\\FLESH_ContinuousBodilyEffort\\04_TS_movementAnnotation/TS_annotated\\merged_anno_0_1_8_p0.csv', 'e:\\FLESH_ContinuousBodilyEffort\\04_TS_movementAnnotation/TS_annotated\\merged_anno_0_1_9_p1.csv', 'e:\\FLESH_ContinuousBodilyEffort\\04_TS_movementAnnotation/TS_annotated\\merged_anno_0_2_111_p1.csv', 'e:\\FLESH_ContinuousBodilyEffort\\04_TS_movementAnnotation/TS_annotated\\merged_anno_0_2_112_p1.csv', 'e:\\FLESH_ContinuousBodilyEffort\\04_TS_movementAnnotation/TS_annotated\\merged_anno_0_1_0_p0.csv', 'e:\\FLESH_ContinuousBodilyEffort\\04_TS_movementAnnotation/TS_annotated\\merged_anno_0_1_3_p0.csv', 'e:\\FLESH_ContinuousBodilyEffort\\04_TS_movementAnnotation/TS_annotated\\merged_ann

In [7]:
# Flatten nested per-feature statistics into a one-row DataFrame
def dict_to_df(data):
    """Turn ``{feature: {stat: value}}`` into a single-row DataFrame.

    Every column is named ``<feature>_<stat>`` (for example ``x_mean``),
    columns follow the iteration order of the nested dictionaries, and the
    single row carries the index label 0.
    """
    flattened = {
        f'{feature}_{stat}': value
        for feature, stats in data.items()
        for stat, value in stats.items()
    }
    return pd.DataFrame(flattened, index=[0])

# Function to summarize overlapping windows of rows (by default 50 rows, sliding by 12 rows)
def summarize_consecutive_rows(df, trialid, num_cols, summary_interval=50, slide_step=12):
    """Summarize ``df`` in overlapping windows of consecutive rows.

    A window of ``summary_interval`` rows is moved over ``df`` in steps of
    ``slide_step`` rows. Windows near the end of ``df`` contain fewer rows.
    For every window, ``Series.describe()`` is computed for each column in
    ``num_cols`` and stored as ``<column>_<statistic>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarize; must contain a ``'time'`` column and all
        columns listed in ``num_cols``.
    trialid : str
        Identifier used as the prefix of each window's ``eventid``.
    num_cols : iterable of str
        Columns for which the statistics are computed.
    summary_interval : int, default 50
        Number of rows per window.
    slide_step : int, default 12
        Number of rows between the starts of two consecutive windows.

    Returns
    -------
    pandas.DataFrame
        One row per window with the statistic columns followed by
        ``start_time``, ``end_time`` (first and last ``'time'`` value of the
        window) and ``eventid`` (``"<trialid>_chunk_<n>"``, n starting at 1).
        An empty DataFrame is returned when no window was produced.
    """
    # Collect plain dicts and build the DataFrame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all previous rows on
    # every iteration.
    rows = []

    for counter, start_idx in enumerate(range(0, len(df), slide_step), start=1):
        # Select a slice of summary_interval rows (or fewer for the last chunks)
        selected = df.iloc[start_idx:start_idx + summary_interval]

        # Stop on an empty slice (e.g. summary_interval=0)
        if selected.empty:
            break

        row = {}

        # Calculate statistics for each requested column
        for col in num_cols:
            for stat, value in selected[col].describe().to_dict().items():
                name = f'{col}_{stat}'
                # Skip every column whose name contains 'count' or '%'.
                # The check is on the full '<column>_<statistic>' name, so a
                # source column with such a name is dropped entirely.
                if 'count' in name or '%' in name:
                    continue
                row[name] = value

        # Add start and end time for the chunk
        row['start_time'] = selected['time'].iloc[0]
        row['end_time'] = selected['time'].iloc[-1]

        # Add chunk number
        row['eventid'] = f"{trialid}_chunk_{counter}"

        rows.append(row)

    if not rows:
        return pd.DataFrame()

    return pd.DataFrame(rows)

In [8]:
# Main DataFrame to store all summaries
dataset_features = pd.DataFrame()

# Columns a file must contain to be processed
required_cols = {'arms', 'upper_body', 'lower_body', 'head_mov'}

# Summary of the most recently processed file, kept for the preview below;
# stays None when every file is skipped
summary_df = None

for file in samplingfiles:
    df = pd.read_csv(file)

    # if the df doesn't have columns arms, upper_body, lower_body, head_mov, skip it
    if not required_cols.issubset(df.columns):
        print('skipping ' + file)
        continue

    # File name without directory and extension. os.path.basename uses the
    # separators of the running platform instead of a hard-coded backslash.
    trialid = os.path.basename(file).split('.')[0]

    # Define numerical columns (excluding 'time' and 'change' if present)
    num_cols = [
        col for col in df.select_dtypes(include=np.number).columns
        if col not in ('change', 'time')
    ]

    # Summarize data in intervals of 50 rows, sliding by 12 rows
    summary_df = summarize_consecutive_rows(df, trialid, num_cols, summary_interval=50, slide_step=12)

    # Add trial ID to the DataFrame
    summary_df['trialid'] = trialid

    # Drop columns with all NA values
    summary_df = summary_df.dropna(axis=1, how='all')

    # Save as a CSV file
    summary_df.to_csv(os.path.join(chunked_folder, trialid + '_chunked.csv'), index=False)

# Preview the last summary. A bare `summary_df.head(15)` would display nothing
# here because it is not the last expression of the cell, and it would raise
# NameError when no file was processed.
if summary_df is not None:
    print(summary_df.head(15))
print('All done, now we can proceed with annotation with our classifier')


skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_10_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_18_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_9_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_0_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_19_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_27_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_28_p1.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_36_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAnnotation/TS_annotated\merged_anno_0_1_37_p0.csv
skipping e:\FLESH_ContinuousBodilyEffort\04_TS_movementAn