In [53]:
from sklearn.metrics import f1_score
import lazypredict
import pandas as pd
from sklearn.preprocessing import RobustScaler
import numpy as np
from sklearn.impute import SimpleImputer
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
import lightgbm
import xgboost
import sklearn
import catboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import IsolationForest
#from pyod.models.ecod import ECOD
from sklearn.model_selection import StratifiedKFold

In [63]:
df_X_last = pd.read_csv("data/X_train_features.csv", index_col=0)
df_X_richard = pd.read_csv("data/template_features_v4.csv", index_col=0)
df_X_tim = pd.read_csv("data/full_waveform_features.csv", index_col=0)
df_X_pyHRV = pd.read_csv("data/pyHRV_features.csv", index_col=0)
df_X_hrv_analysis = pd.read_csv("data/hrv-analysis_features.csv", index_col=0)
df_X = pd.concat((df_X_tim,df_X_richard,df_X_pyHRV,df_X_hrv_analysis),axis=1)
df_Y = pd.read_csv("data/y_train.csv", index_col="id")

Remove highly correlated features

In [64]:
def rm_corr(X):
    corr_matrix = X.corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Find features with correlation greater than 0.9
    to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
    print("Removed columns: ", len(to_drop))
    # Drop features 
    X.drop(to_drop, axis=1, inplace=True)
    # X_test.drop(to_drop, axis=1, inplace=True)
    
    return X

In [65]:
df_X = rm_corr(df_X)

Removed columns:  52


Normalize

In [66]:
df_X.replace([np.inf, -np.inf], np.nan, inplace=True)
transformer = RobustScaler()
X = transformer.fit_transform(df_X)
# nans are no problem

Impute missing values

In [67]:
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X = imp_median.fit_transform(X)

In [68]:
X.shape

(5117, 93)

Simple Outlier Removal

In [311]:
Y = df_Y.to_numpy()
for i in range(3):
    class_i = np.squeeze(Y==i)
    ids = np.where(class_i)[0]
    X_c = X[class_i]
    # preds = IsolationForest(random_state=0, contamination = 0.06).fit_predict(X_c)
    clf = ECOD()
    clf.fit(X_c)
    preds = clf.predict(X_c)
    outlier_ids = ids[preds == 1] # minus 1 for isolation forest
    Y = np.delete(Y, outlier_ids, axis=0)
    X = np.delete(X, outlier_ids, axis=0)
    print(outlier_ids.shape[0])

303
45
148


In [11]:
X.shape

(5117, 93)

In [13]:
X = SelectKBest(f_classif, k=70).fit_transform(X, df_Y)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, df_Y,test_size=.5)
classifiers = [("LGBMClassifier", lightgbm.LGBMClassifier), ("XGBClassifier", xgboost.XGBClassifier), \
               ("RandomForestClassifier", sklearn.ensemble.RandomForestClassifier), \
                ("ExtraTreesClassifier", sklearn.ensemble.ExtraTreesClassifier)]
# catboost classifier takes very long but is similar in performance to these other classifiers
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None, classifiers=classifiers)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|█████████████████████████████████████████████| 4/4 [00:16<00:00,  4.13s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.83,0.74,,0.83,7.58
LGBMClassifier,0.84,0.74,,0.84,5.13
RandomForestClassifier,0.82,0.67,,0.82,2.88
ExtraTreesClassifier,0.81,0.66,,0.81,0.92


In [69]:
Y = df_Y.to_numpy()
last = df_X_last.to_numpy()

In [71]:
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
f1_last = []
f1_ours = []
for train_index, test_index in skf.split(X, Y):
    clf = xgboost.XGBClassifier()
    
    clf.fit(X[train_index], Y[train_index])
    y_pred = clf.predict(X[test_index])
    f1_ours.append(f1_score(Y[test_index], y_pred, average='micro'))
    
    clf.fit(last[train_index], Y[train_index])
    y_pred = clf.predict(last[test_index])
    f1_last.append(f1_score(Y[test_index], y_pred, average='micro'))

In [72]:
# old feature version
print("Ours | Average: ", np.mean(f1_ours), "Std: ", np.std(f1_ours))
print("Theirs | Average: ", np.mean(f1_last), "Std: ", np.std(f1_last))

Ours | Average:  0.829394626710655 Std:  0.008750962139280984
Theirs | Average:  0.8458077498778105 Std:  0.01066281107735506
