In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

filepath = "path.xlsx"
df = pd.read_excel(filepath, index_col=0)
df.tail()

Unnamed: 0_level_0,Grupo,Sexo,Idade,Massa,Estatura,IMC,Gait_Speed_TS_T0,Gait_Speed_TS_T3,Gait_Speed_TS_T6,Gait_Speed_TS_T9,...,StroopColor_ortost_T0,StroopColor_ortost_T3,StroopColor_ortost_T6,StroopColor_ortost_T9,StroopColor_ortost_T12,StroopColorEffectOrt_T0,StroopColorEffectOrt_T3,StroopColorEffectOrt_T6,StroopColorEffectOrt_T9,StroopColorEffectOrt_T12
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
s56,GE,F,70,79.0,1.54,33.310845,1.13,,,,...,87.8,,,,,-54.3058,,,,
s57,GC,M,78,69.4,1.69,24.298869,1.59,,,,...,202.6,,,,,-342.358079,,,,
s58,GC,F,71,75.0,1.59,29.666548,1.27,1.15,,1.15,...,53.6,53.3,42.3,49.4,,-109.375,-125.847458,-81.545064,-94.488189,
s59,GC,F,73,55.5,1.58,22.232014,1.25,,1.47,1.32,...,103.5,,149.8,142.1,95.0,-175.265957,,-255.819477,-299.157303,-157.452575
s60,GC,F,65,59.4,1.46,27.866391,1.26,,1.53,1.36,...,64.5,,59.4,52.1,50.0,-113.576159,,-102.730375,-84.09894,-76.056338


In [2]:
list(df['Grupo'].unique())

['GC', 'GE']

In [3]:
df.loc[df['Grupo'] == 'GC', 'Label'] = int(1)
df.loc[df['Grupo'] == 'GE', 'Label'] = int(0)
df['Label'] = df['Label'].astype('int64')

In [4]:
df['Label'].head()

ID
s01    1
s02    1
s03    0
s04    1
s05    0
Name: Label, dtype: int64

In [65]:
from sklearn.model_selection import cross_val_score
from skrebate import ReliefF
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


def generate_experiment(df, classifier):
    features, labels = df.drop(['Label'], axis=1).astype('float64').values, df['Label'].astype('int64').values
    clf = make_pipeline(
        *[ReliefF(n_features_to_select=10, n_neighbors=100, verbose=False, n_jobs=8), classifier],
        verbose=False,
    )
    clf.fit(features, labels)

    scores = cross_val_score(clf, features, labels)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    scores = cross_val_score(clf, features, labels, scoring='f1_micro')
    print("F1 Micro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    scores = cross_val_score(clf, features, labels, scoring='precision')
    print("Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    scores = cross_val_score(clf, features, labels, scoring='recall')
    print("Recall: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    scores = cross_val_score(clf, features, labels, scoring='roc_auc')
    print("ROC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    relief = clf.get_params()['steps'][0][1]
    dfcolumns = pd.DataFrame(df.drop(['Label'], axis=1).columns)
    dfscores = pd.DataFrame(relief.feature_importances_)

    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Features','Score']
    df_features_scores = featureScores.sort_values("Score", ascending=False)
    return df_features_scores[:10]

# Experiments
- The goal is to find the features that most represent the final evaluation state
- Use T0, T3, T6, T9 to predict the Group (GC or GE)
    
# Preprocessing
#### Option 1
- [OK] Remove patients with NaNs on T12 evaluations 
    - **17 subjects** by considering only subjects that have all the measures.

#### Option 2
- [TODO] Replace NaNs by an average of the previous and posterior evaluation moment
    - **31 subjects** by considering subjects that may have missed evaluations but have the T12.
    - **42 subjects** by considering subjects that may have missed evaluations and predicting the T9 state instead of T12

# Experiment #1
- Using only measurements T0 and T2 to predict the group (GC or GE)

In [9]:
drop_cols = [col for col in df.columns if "T3" in col or "T6" in col or "T9" in col]
T0_T12_df = df.drop(labels=drop_cols, axis=1)
T0_T12_df = T0_T12_df.drop(labels=['Grupo', 'Sexo'], axis=1)
T0_T12_df = T0_T12_df.dropna()

In [66]:
generate_experiment(T0_T12_df, classifier=RandomForestClassifier(n_estimators=1000))

Accuracy: 0.40 (+/- 0.53)
F1 Micro: 0.49 (+/- 0.41)
Precision: 0.53 (+/- 0.49)
Recall: 0.47 (+/- 0.13)
ROC: 0.57 (+/- 0.69)


Unnamed: 0,Features,Score
108,BIOF_T0,0.297521
112,CTSIB_Total_T0,0.264463
135,SF36_Pain_T12,0.206612
9,Gait_Assimetry_TS_T12,0.186931
123,GDS_T12,0.173554
61,Gait_Assimetry_DTPF_T12,0.151534
114,STS_5X_T0,0.143054
149,StroopColor_T12,0.132894
35,Gait_Assimetry_DTPV_T12,0.126865
92,EERE_T0,0.123967


In [67]:
generate_experiment(T0_T12_df, classifier=RandomForestClassifier(n_estimators=100))

Accuracy: 0.49 (+/- 0.41)
F1 Micro: 0.49 (+/- 0.41)
Precision: 0.63 (+/- 0.61)
Recall: 0.57 (+/- 0.45)
ROC: 0.55 (+/- 0.66)


Unnamed: 0,Features,Score
108,BIOF_T0,0.297521
112,CTSIB_Total_T0,0.264463
135,SF36_Pain_T12,0.206612
9,Gait_Assimetry_TS_T12,0.186931
123,GDS_T12,0.173554
61,Gait_Assimetry_DTPF_T12,0.151534
114,STS_5X_T0,0.143054
149,StroopColor_T12,0.132894
35,Gait_Assimetry_DTPV_T12,0.126865
92,EERE_T0,0.123967


In [68]:
generate_experiment(T0_T12_df, classifier=SVC(kernel='linear'))

Accuracy: 0.64 (+/- 0.55)
F1 Micro: 0.64 (+/- 0.55)
Precision: 0.70 (+/- 0.53)
Recall: 0.67 (+/- 0.56)
ROC: 0.65 (+/- 0.60)


Unnamed: 0,Features,Score
108,BIOF_T0,0.297521
112,CTSIB_Total_T0,0.264463
135,SF36_Pain_T12,0.206612
9,Gait_Assimetry_TS_T12,0.186931
123,GDS_T12,0.173554
61,Gait_Assimetry_DTPF_T12,0.151534
114,STS_5X_T0,0.143054
149,StroopColor_T12,0.132894
35,Gait_Assimetry_DTPV_T12,0.126865
92,EERE_T0,0.123967


In [70]:
generate_experiment(T0_T12_df, classifier=SVC(kernel='poly'))

Accuracy: 0.46 (+/- 0.61)
F1 Micro: 0.46 (+/- 0.61)
Precision: 0.52 (+/- 0.82)
Recall: 0.47 (+/- 0.65)
ROC: 0.52 (+/- 0.73)


Unnamed: 0,Features,Score
108,BIOF_T0,0.297521
112,CTSIB_Total_T0,0.264463
135,SF36_Pain_T12,0.206612
9,Gait_Assimetry_TS_T12,0.186931
123,GDS_T12,0.173554
61,Gait_Assimetry_DTPF_T12,0.151534
114,STS_5X_T0,0.143054
149,StroopColor_T12,0.132894
35,Gait_Assimetry_DTPV_T12,0.126865
92,EERE_T0,0.123967


# Experiment 2
- Using only measurements from T0 to predict the group (GC or GE)

In [71]:
drop_cols = [col for col in df.columns if "T3" in col or "T6" in col or "T9" in col or "T12" in col]
T0_df = df.drop(labels=drop_cols, axis=1)
T0_df = T0_df.drop(labels=['Grupo', 'Sexo'], axis=1)
T0_df = T0_df.dropna()

In [72]:
print(len(T0_df))

52


In [73]:
generate_experiment(T0_df, classifier=SVC(kernel='poly'))

Accuracy: 0.39 (+/- 0.15)
F1 Micro: 0.39 (+/- 0.15)
Precision: 0.30 (+/- 0.36)
Recall: 0.39 (+/- 0.68)
ROC: 0.23 (+/- 0.33)


Unnamed: 0,Features,Score
56,BIOF_T0,0.099112
58,CTSIB_Total_T0,0.081361
15,SwingSpeed_D_TS_T0,0.054231
64,SF36_PF_T0,0.042792
19,Gait_Assimetry_DTPV_T0,0.039463
52,BEOA_T0,0.036982
28,SwingSpeed_D_DTPV_T0,0.035598
71,SF36_GHP_T0,0.032914
14,PeakAngVelocity_D_TS_T0,0.031757
29,MinToe_D_DTPV_T0,0.031065


In [75]:
generate_experiment(T0_df, classifier=SVC(kernel='linear'))

Accuracy: 0.52 (+/- 0.27)
F1 Micro: 0.52 (+/- 0.27)
Precision: 0.45 (+/- 0.46)
Recall: 0.51 (+/- 0.61)
ROC: 0.58 (+/- 0.38)


Unnamed: 0,Features,Score
56,BIOF_T0,0.099112
58,CTSIB_Total_T0,0.081361
15,SwingSpeed_D_TS_T0,0.054231
64,SF36_PF_T0,0.042792
19,Gait_Assimetry_DTPV_T0,0.039463
52,BEOA_T0,0.036982
28,SwingSpeed_D_DTPV_T0,0.035598
71,SF36_GHP_T0,0.032914
14,PeakAngVelocity_D_TS_T0,0.031757
29,MinToe_D_DTPV_T0,0.031065


In [76]:
generate_experiment(T0_df, classifier=RandomForestClassifier(n_estimators=1000))

Accuracy: 0.49 (+/- 0.24)
F1 Micro: 0.50 (+/- 0.16)
Precision: 0.47 (+/- 0.13)
Recall: 0.50 (+/- 0.40)
ROC: 0.48 (+/- 0.28)


Unnamed: 0,Features,Score
56,BIOF_T0,0.099112
58,CTSIB_Total_T0,0.081361
15,SwingSpeed_D_TS_T0,0.054231
64,SF36_PF_T0,0.042792
19,Gait_Assimetry_DTPV_T0,0.039463
52,BEOA_T0,0.036982
28,SwingSpeed_D_DTPV_T0,0.035598
71,SF36_GHP_T0,0.032914
14,PeakAngVelocity_D_TS_T0,0.031757
29,MinToe_D_DTPV_T0,0.031065


In [77]:
generate_experiment(T0_df, classifier=RandomForestClassifier(n_estimators=100))

Accuracy: 0.50 (+/- 0.33)
F1 Micro: 0.44 (+/- 0.11)
Precision: 0.50 (+/- 0.35)
Recall: 0.50 (+/- 0.54)
ROC: 0.48 (+/- 0.21)


Unnamed: 0,Features,Score
56,BIOF_T0,0.099112
58,CTSIB_Total_T0,0.081361
15,SwingSpeed_D_TS_T0,0.054231
64,SF36_PF_T0,0.042792
19,Gait_Assimetry_DTPV_T0,0.039463
52,BEOA_T0,0.036982
28,SwingSpeed_D_DTPV_T0,0.035598
71,SF36_GHP_T0,0.032914
14,PeakAngVelocity_D_TS_T0,0.031757
29,MinToe_D_DTPV_T0,0.031065


In [78]:
generate_experiment(T0_df, classifier=RandomForestClassifier(n_estimators=50))

Accuracy: 0.41 (+/- 0.30)
F1 Micro: 0.48 (+/- 0.22)
Precision: 0.45 (+/- 0.13)
Recall: 0.57 (+/- 0.47)
ROC: 0.39 (+/- 0.22)


Unnamed: 0,Features,Score
56,BIOF_T0,0.099112
58,CTSIB_Total_T0,0.081361
15,SwingSpeed_D_TS_T0,0.054231
64,SF36_PF_T0,0.042792
19,Gait_Assimetry_DTPV_T0,0.039463
52,BEOA_T0,0.036982
28,SwingSpeed_D_DTPV_T0,0.035598
71,SF36_GHP_T0,0.032914
14,PeakAngVelocity_D_TS_T0,0.031757
29,MinToe_D_DTPV_T0,0.031065


# Experiment #3 
- Using all subjects that went to all measurements to predict Group

In [79]:
all_Ts = df.drop(labels=['Grupo', 'Sexo'], axis=1)
all_Ts = all_Ts.dropna()
print(len(all_Ts))

17


In [81]:
generate_experiment(all_Ts, classifier=SVC(kernel='poly'))

Accuracy: 0.63 (+/- 0.67)
F1 Micro: 0.63 (+/- 0.67)
Precision: 0.70 (+/- 0.80)
Recall: 0.60 (+/- 0.75)
ROC: 0.75 (+/- 0.77)


Unnamed: 0,Features,Score
342,SF36_Pain_T12,0.353758
97,StrideLength_D_DTPV_T9,0.30832
274,CTSIB_Total_T0,0.283497
303,GDS_T12,0.26634
337,SF36_VEF_T9,0.264706
255,BEDO_T3,0.264706
163,StrideLength_D_DTPF_T12,0.242599
335,SF36_GMH_T9,0.235294
225,EERE_T3,0.228758
315,SF36_Pain_T3,0.223856
