# Data preprocessing for sit-to-stand data

## Import libraries

In [17]:
import os
import pandas as pd

## Import labels and specify patient data directory

In [18]:
labs = pd.read_csv('/Users/suhrudp/Library/CloudStorage/OneDrive-Personal/Stats/REMAP Open Dataset PD/21h9f9e30v9cl2fapjggz4q1x7/SitToStand/Data/STS_human_labels/SitToStand_human_labels.csv')

dir = '/Users/suhrudp/Library/CloudStorage/OneDrive-Personal/Stats/REMAP Open Dataset PD/21h9f9e30v9cl2fapjggz4q1x7/SitToStand/Data/STS_2D_skeletons_coarsened'

# Merge all the data

In [19]:
merged_data = pd.DataFrame()

for filename in os.listdir(dir):
    if filename.endswith(".csv") and filename.startswith("Pt"):
        try:
            part_id_str = filename[filename.index('Pt') + 2:filename.index('_')]
            trans_id_str = filename[filename.index('n_') + 2:filename.index('.csv')]
            
            part_id = int(part_id_str)
            trans_id = int(trans_id_str)
        except ValueError:
            print(f"Skipping {filename}: Unable to extract IDs")
            continue

        if any((labs['Participant ID number'] == part_id) & (labs['Transition ID'] == trans_id)):
            column_names = [f'{"x" if i % 2 == 0 else "y"}{i // 2}' for i in range(50)]
            #column_names = ['# frame number', 'time (s)'] + [f'x{i}' for i in range(25)] + [f'y{i}' for i in range(25)]
            data = pd.read_csv(os.path.join(dir, filename), sep=',', header=None, names=column_names, skiprows=1)
            data.columns = column_names
            
            if data.columns[0] != 'frame_number':
                data.rename(columns={data.columns[0]: 'frame_number'}, inplace=True)
            if data.columns[1] != 'time_s':
                data.rename(columns={data.columns[1]: 'time_s'}, inplace=True)

            coordinate_columns = data.columns[2:]
            x_columns = [col for col in coordinate_columns if 'x' in col[:2]]
            y_columns = [col for col in coordinate_columns if 'y' in col[:2]]
            
            new_x_names = [f'x{i+1}' for i in range(len(x_columns))]
            new_y_names = [f'y{i+1}' for i in range(len(y_columns))]
            rename_mapping = dict(zip(x_columns + y_columns, new_x_names + new_y_names))
            
            data.rename(columns=rename_mapping, inplace=True)

            required_columns = ['frame_number', 'time_s'] + new_x_names + new_y_names
            data = data[required_columns]

            data['Participant ID number'] = part_id
            data['Transition ID'] = trans_id

            data = pd.merge(labs, data, on=['Participant ID number', 'Transition ID'])
            
            merged_data = pd.concat([merged_data, data], ignore_index=True)

In [15]:
merged_data

Unnamed: 0,Transition ID,Participant ID number,PD_or_C,sts_whole_episode_duration,sts_final_attempt_duration,On_or_Off_medication,DBS_state,Clinical_assessment,STS_additional_features,MDS-UPDRS_score_3.9 _arising_from_chair,...,y16,y17,y18,y19,y20,y21,y22,y23,y24,y25
0,815,971,C,1.398000,0.856999,Control,Control,Yes,,0.0,...,324,222,316,214,302,222,298,222,302,216
1,815,971,C,1.398000,0.856999,Control,Control,Yes,,0.0,...,324,222,316,214,302,222,298,222,302,216
2,815,971,C,1.398000,0.856999,Control,Control,Yes,,0.0,...,324,222,316,214,302,222,298,222,302,216
3,815,971,C,1.398000,0.856999,Control,Control,Yes,,0.0,...,324,222,316,212,302,222,298,222,302,216
4,815,971,C,1.398000,0.856999,Control,Control,Yes,,0.0,...,324,222,318,212,302,222,298,222,302,214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177557,80,275,PD,2.700999,1.044999,-,On DBS,Yes,,0.0,...,326,306,328,298,300,286,300,284,318,282
177558,80,275,PD,2.700999,1.044999,-,On DBS,Yes,,0.0,...,326,306,328,298,300,286,300,284,318,282
177559,80,275,PD,2.700999,1.044999,-,On DBS,Yes,,0.0,...,326,306,328,298,300,286,300,284,318,282
177560,80,275,PD,2.700999,1.044999,-,On DBS,Yes,,0.0,...,328,306,330,298,312,274,314,274,334,272


## Save the csv

In [20]:
merged_data.to_csv('merged_sts_data.csv')