In [76]:
import lazypredict
import pandas as pd
from sklearn.preprocessing import RobustScaler
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from xgboost import XGBClassifier

In [89]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
# Training features: one CSV per feature-extraction pipeline, each using its
# first column as the row index.
df_X_last = pd.read_csv("data/X_train_features.csv", index_col=0)  # loaded, but not merged into df_X below
df_X_richard = pd.read_csv("data/template_features.csv", index_col=0)
df_X_tim = pd.read_csv("data/full_waveform_features.csv", index_col=0)
df_X_pyHRV = pd.read_csv("data/pyHRV_features.csv", index_col=0)
df_X_hrv_analysis = pd.read_csv("data/hrv-analysis_features.csv", index_col=0)

# Put the four feature sets side by side in one wide training frame.
df_X = pd.concat(
    [df_X_tim, df_X_richard, df_X_pyHRV, df_X_hrv_analysis],
    axis="columns",
)
# Disabled experiment: additionally append the df_X_last features.
# df_X = pd.concat((df_X, df_X_last),axis=1)

# Training labels, indexed by the "id" column of the CSV.
df_Y = pd.read_csv("data/Y_train.csv", index_col="id")

In [90]:
# Test-set counterparts of the training feature files, read the same way
# (first column becomes the row index).
df_X_template_test = pd.read_csv("data/template_features_test.csv", index_col=0)
df_X_tim_test = pd.read_csv("data/full_waveform_features_test.csv", index_col=0)
df_X_pyHRV_test = pd.read_csv("data/pyHRV_features_test.csv", index_col=0)
df_X_hrv_analysis_test = pd.read_csv("data/hrv-analysis_features_test.csv", index_col=0)

# Same left-to-right order as the training concat (waveform, template, pyHRV,
# hrv-analysis) so both frames share one column layout.
df_X_test = pd.concat(
    [df_X_tim_test, df_X_template_test, df_X_pyHRV_test, df_X_hrv_analysis_test],
    axis="columns",
)

Remove highly correlated features

In [91]:
def rm_corr(X, X_test, threshold=0.9):
    """Drop features that are highly correlated with an earlier column.

    The absolute pairwise correlation matrix is computed from ``X`` only.
    A column is dropped when its absolute correlation with any column to its
    left is strictly greater than ``threshold``. The same columns are then
    dropped from ``X_test`` so both frames keep the same feature set.

    Both frames are modified in place *and* returned, so callers may either
    rebind the result or rely on the mutation.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; the correlations are estimated from this frame.
    X_test : pandas.DataFrame
        Second frame from which the same columns are removed.
    threshold : float, default 0.9
        Absolute-correlation cutoff. The default reproduces the previously
        hard-coded value.

    Returns
    -------
    tuple
        ``(X, X_test)`` -- the same two objects, with the columns removed.

    Raises
    ------
    KeyError
        If ``X_test`` lacks one of the columns selected for removal.
    """
    corr_matrix = X.corr().abs()

    # Keep only the strict upper triangle: each pair is inspected once and a
    # column is never compared with itself. All other cells become NaN.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # NaN > threshold evaluates to False, so masked cells never cause a drop.
    exceeds = (upper > threshold).any(axis=0)
    to_drop = upper.columns[exceeds.to_numpy()].tolist()
    print("Removed columns: ", len(to_drop))

    # Remove the selected features from both frames, in place.
    X.drop(columns=to_drop, inplace=True)
    X_test.drop(columns=to_drop, inplace=True)

    return X, X_test

In [92]:
# Apply the correlation filter: the columns to drop are chosen from the
# training frame, and the same columns are removed from the test frame.
df_X, df_X_test = rm_corr(df_X, df_X_test)

Removed columns:  45


Normalize: replace infinite values with NaN, then scale features with RobustScaler

In [93]:
# Mark +/- infinity as missing (NaN) in both feature frames, in place.
for _frame in (df_X, df_X_test):
    _frame.replace([np.inf, -np.inf], np.nan, inplace=True)

# Learn the scaling statistics from the training frame only, then apply the
# same fitted scaler to both the training and the test features.
transformer = RobustScaler().fit(df_X)
X = transformer.transform(df_X)
X_test = transformer.transform(df_X_test)

In [94]:
# Sanity check: dimensions of the scaled training matrix.
X.shape

(5117, 78)

Impute missing values

In [95]:
# Fit the per-column medians on the training matrix only, then fill NaNs in
# both matrices with those training medians.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median').fit(X)
X = imp_median.transform(X)
X_test = imp_median.transform(X_test)

In [96]:
# Hold out a stratified 20% of the labelled data for the model comparison.
# The trailing underscores keep this hold-out split distinct from the real
# test matrix X_test.
X_train, X_test_, y_train, y_test_ = train_test_split(
    X,
    df_Y,
    test_size=.2,
    stratify=df_Y,
    random_state=0,
)

# Fit LazyClassifier on the split and display its per-model score table.
clf = LazyClassifier(verbose=0, custom_metric=None)
models, predictions = clf.fit(X_train, X_test_, y_train, y_test_)
models

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:45<00:00,  1.57s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.83,0.73,,0.83,7.07
LGBMClassifier,0.84,0.72,,0.83,2.46
Perceptron,0.75,0.7,,0.75,0.08
SGDClassifier,0.77,0.69,,0.76,0.29
LogisticRegression,0.8,0.69,,0.8,0.22
LinearDiscriminantAnalysis,0.78,0.68,,0.77,0.12
RandomForestClassifier,0.82,0.68,,0.82,3.61
BaggingClassifier,0.8,0.67,,0.79,3.79
NearestCentroid,0.66,0.66,,0.66,0.04
GaussianNB,0.69,0.66,,0.69,0.04


In [97]:
# Sanity check: dimensions of the preprocessed test matrix; the column count
# should match the training matrix.
X_test.shape

(3411, 78)

In [98]:
# Train an XGBoost classifier with default hyper-parameters on the full
# labelled set (no hold-out), then predict labels for the test matrix.
# Note: this rebinds `clf`, which previously held the LazyClassifier.
clf = XGBClassifier().fit(X, df_Y)
y_pred = clf.predict(X_test)

In [99]:
# Submission ids are the row positions 0..n-1 of the predictions.
# NOTE(review): ids are synthesised positionally rather than read from the
# test feature index -- confirm this matches the ids the submission expects.
ids = np.arange(len(y_pred))

In [100]:
# Assemble the two-column submission table and write it without the pandas
# row index.
submission = pd.DataFrame({"id": ids, "y": y_pred})
submission.to_csv("sub_xgb.csv", index=False)