# Data Science Technical Assessment
### Sydney Kuhl

In [25]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# make sure to pip install xgboost

In [2]:
GRF_COP_AP_RAW_right = pd.read_csv(r'GRF_COP_AP_RAW_right.csv')
GRF_COP_AP_RAW_left = pd.read_csv(r'GRF_COP_AP_RAW_left.csv')

In [3]:
GRF_COP_AP_RAW_right.head()

Unnamed: 0,SUBJECT_ID,SESSION_ID,TRIAL_ID,COP_AP_RAW_1,COP_AP_RAW_2,COP_AP_RAW_3,COP_AP_RAW_4,COP_AP_RAW_5,COP_AP_RAW_6,COP_AP_RAW_7,...,COP_AP_RAW_396,COP_AP_RAW_397,COP_AP_RAW_398,COP_AP_RAW_399,COP_AP_RAW_400,COP_AP_RAW_401,COP_AP_RAW_402,COP_AP_RAW_403,COP_AP_RAW_404,COP_AP_RAW_405
0,510,413,1,-0.162955,-0.154802,-0.157569,-0.156323,-0.153143,-0.149204,-0.147165,...,,,,,,,,,,
1,510,413,2,-0.132743,-0.143059,-0.146384,-0.14309,-0.140517,-0.138094,-0.135666,...,,,,,,,,,,
2,510,413,3,-0.204854,-0.207629,-0.21593,-0.211435,-0.207549,-0.20631,-0.204622,...,,,,,,,,,,
3,510,413,4,-0.225304,-0.203526,-0.203677,-0.200858,-0.201696,-0.202704,-0.200028,...,,,,,,,,,,
4,510,413,6,-0.149431,-0.134684,-0.134163,-0.133411,-0.131665,-0.127982,-0.125324,...,,,,,,,,,,


In [4]:
GRF_COP_AP_RAW_left.head()

Unnamed: 0,SUBJECT_ID,SESSION_ID,TRIAL_ID,COP_AP_RAW_1,COP_AP_RAW_2,COP_AP_RAW_3,COP_AP_RAW_4,COP_AP_RAW_5,COP_AP_RAW_6,COP_AP_RAW_7,...,COP_AP_RAW_410,COP_AP_RAW_411,COP_AP_RAW_412,COP_AP_RAW_413,COP_AP_RAW_414,COP_AP_RAW_415,COP_AP_RAW_416,COP_AP_RAW_417,COP_AP_RAW_418,COP_AP_RAW_419
0,510,413,1,-0.17903,-0.18043,-0.181978,-0.176719,-0.17613,-0.17682,-0.174659,...,,,,,,,,,,
1,510,413,2,-0.178717,-0.184542,-0.188789,-0.184976,-0.181117,-0.178235,-0.175591,...,,,,,,,,,,
2,510,413,3,-0.217464,-0.230977,-0.240081,-0.235173,-0.231944,-0.231034,-0.229675,...,,,,,,,,,,
3,510,413,4,-0.236844,-0.216593,-0.214965,-0.212311,-0.209377,-0.206651,-0.203977,...,,,,,,,,,,
4,510,413,6,-0.150634,-0.159325,-0.163416,-0.159186,-0.155433,-0.15265,-0.150756,...,,,,,,,,,,


In [6]:
# combine left and right side so easier to work with
GRF_COP_AP_RAW_left['SIDE'] = 'LEFT'
GRF_COP_AP_RAW_right['SIDE'] = 'RIGHT'

# Concatenate DataFrames
GRF_COP_AP_RAW = pd.concat([GRF_COP_AP_RAW_left, GRF_COP_AP_RAW_right])
GRF_COP_AP_RAW.head()

Unnamed: 0,SUBJECT_ID,SESSION_ID,TRIAL_ID,COP_AP_RAW_1,COP_AP_RAW_2,COP_AP_RAW_3,COP_AP_RAW_4,COP_AP_RAW_5,COP_AP_RAW_6,COP_AP_RAW_7,...,COP_AP_RAW_411,COP_AP_RAW_412,COP_AP_RAW_413,COP_AP_RAW_414,COP_AP_RAW_415,COP_AP_RAW_416,COP_AP_RAW_417,COP_AP_RAW_418,COP_AP_RAW_419,SIDE
0,510,413,1,-0.17903,-0.18043,-0.181978,-0.176719,-0.17613,-0.17682,-0.174659,...,,,,,,,,,,LEFT
1,510,413,2,-0.178717,-0.184542,-0.188789,-0.184976,-0.181117,-0.178235,-0.175591,...,,,,,,,,,,LEFT
2,510,413,3,-0.217464,-0.230977,-0.240081,-0.235173,-0.231944,-0.231034,-0.229675,...,,,,,,,,,,LEFT
3,510,413,4,-0.236844,-0.216593,-0.214965,-0.212311,-0.209377,-0.206651,-0.203977,...,,,,,,,,,,LEFT
4,510,413,6,-0.150634,-0.159325,-0.163416,-0.159186,-0.155433,-0.15265,-0.150756,...,,,,,,,,,,LEFT


### Assignment 1

In [21]:
def get_avg_stride_length(subject_id):
    # filter left and right df by subject id
    condition = GRF_COP_AP_RAW['SUBJECT_ID'] == subject_id
    subject_df = GRF_COP_AP_RAW[condition]
    subject_df.head()

    avg_list = []
    for index, row in subject_df.iterrows():
        start_column = 'COP_AP_RAW_3'
        start_column_index = subject_df.columns.get_loc(start_column)

        avg = abs(row['COP_AP_RAW_1'] - row['COP_AP_RAW_2'])
        prev_val = row['COP_AP_RAW_2']

        for column, value in row.iloc[start_column_index:].iteritems():
            if pd.isna(value):
                    break

            avg += abs(row[column] - prev_val)
            avg /= 2
            pre_val = row[column]

        avg_list.append(avg)

    avg_val = sum(avg_list) / len(avg_list)
    # print('average stride length: ' + str(avg_val))
    return avg_val

# let's find the average stride length for subject 510
print('average stride length: ' + str(get_avg_stride_length(510)))

average stride length: 0.29215193710063286


### Assignment 2

In [22]:
# generalize to subject 408
print('average stride length: ' + str(get_avg_stride_length(408)))

average stride length: 0.2798390712018261


### Assignment 3

In [23]:
GRF_Annotation = pd.read_csv(r'GRF_metadata.csv')

In [None]:
# binary classification between healthy subjects and subjects with foot related injuries

# filter out dataset to only be those subjects
condition = GRF_Annotation['CLASS_LABEL'] == 'HC' or GRF_Annotation['CLASS_LABEL'] == 'C'
filtered_GRF_Annotation = GRF_Annotation[condition]

# merge all data frames

# each row represents one patient



In [None]:
y = master_df['CLASS_LABEL']
X = df.drop('CLASS_LABEL', axis=1)

# split the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# convert labels to binary
y_train = [1 if pred == 'C' else 0 for pred in y_train]
y_val = [1 if pred == 'C' else 0 for pred in y_val]
y_test = [1 if pred == 'C' else 0 for pred in y_test]

In [None]:
# needs to be dmatrix in order to pass into xgboost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:
params = {
    'objective': 'binary:logistic',  # for binary classification
    'eval_metric': 'logloss', 
    'max_depth': 3,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}
num_rounds = 100
model = xgb.train(params, dtrain, num_rounds, evals=[(dtrain, 'train'), (dval, 'validation')],
                  evals_result=eval_results, early_stopping_rounds=10)

In [None]:
predictions = model.predict(dtest)
# convert probabilities to binary predictions (0 or 1)
binary_predictions = [1 if pred > 0.5 else 0 for pred in predictions]
accuracy = accuracy_score(y_test, binary_predictions)
print(f"Accuracy: {accuracy}")