In [1]:
import lazypredict
import pandas as pd
from sklearn.preprocessing import RobustScaler
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

  from pandas import MultiIndex, Int64Index


In [2]:
def FeatureSelectionModel(X,y,n=50):
    clf = RandomForestClassifier(random_state=0)
    clf.fit(X, y)
    model = SelectFromModel(clf, max_features=n,prefit=True,threshold=-np.inf)
    return model

def FeatureSelection(model,X):
    return model.transform(X)

def SelectX(X_train,y_train,X_test,n=50):
    model = FeatureSelectionModel(X_train,y_train,n)
    X_train = FeatureSelection(model,X_train)
    X_test = FeatureSelection(model,X_test)
    return X_train, X_test

In [3]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
df_X_last = pd.read_csv("data/X_train_features.csv", index_col=0)
df_X_richard = pd.read_csv("data/template_features.csv", index_col=0)
df_X_tim = pd.read_csv("data/full_waveform_features.csv", index_col=0)
df_X_pyHRV = pd.read_csv("data/pyHRV_features.csv", index_col=0)
df_X_hrv_analysis = pd.read_csv("data/hrv-analysis_features.csv", index_col=0)
df_X = pd.concat((df_X_tim,df_X_richard,df_X_pyHRV,df_X_hrv_analysis),axis=1)
# df_X = pd.concat((df_X, df_X_last),axis=1)
df_Y = pd.read_csv("data/Y_train.csv", index_col="id")
y=df_Y.to_numpy().ravel()

In [4]:
df_Y["y0"] = df_Y["y"]%2
df_Y["y1"] = df_Y["y"]//2
df_Y

Unnamed: 0_level_0,y,y0,y1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,0
1,0,0,0
2,0,0,0
3,1,1,0
4,2,0,1
...,...,...,...
5112,3,1,1
5113,0,0,0
5114,0,0,0
5115,0,0,0


In [5]:
df_X_template_test = pd.read_csv("data/template_features_test.csv", index_col=0)
df_X_tim_test = pd.read_csv("data/full_waveform_features_test.csv", index_col=0)
df_X_pyHRV_test = pd.read_csv("data/pyHRV_features_test.csv", index_col=0)
df_X_hrv_analysis_test = pd.read_csv("data/hrv-analysis_features_test.csv", index_col=0)
df_X_test = pd.concat((df_X_tim_test,df_X_template_test,df_X_pyHRV_test,df_X_hrv_analysis_test), axis=1)

Remove highly correlated features

In [6]:
def rm_corr(X, X_test):
    corr_matrix = X.corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Find features with correlation greater than 0.9
    to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
    print("Removed columns: ", len(to_drop))
    # Drop features 
    X.drop(to_drop, axis=1, inplace=True)
    X_test.drop(to_drop, axis=1, inplace=True)
    
    return X, X_test

In [7]:
df_X, df_X_test = rm_corr(df_X, df_X_test)

Removed columns:  45


Normalize

In [8]:
df_X.replace([np.inf, -np.inf], np.nan, inplace=True)
df_X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
transformer = RobustScaler()
X = transformer.fit_transform(df_X)
X_test = transformer.transform(df_X_test)

In [9]:
X.shape

(5117, 78)

Impute missing values

In [10]:
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X = imp_median.fit_transform(X)
X_test = imp_median.transform(X_test)

In [11]:
{'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 600}

{'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 600}

In [17]:
X, X_test = SelectX(X,df_Y["y"],X_test,n=50)
X_train, X_test_, y_train, y_test_ = train_test_split(X, df_Y["y"],stratify=df_Y["y"],test_size=.2,random_state=0)
clf = LazyClassifier(verbose=0, custom_metric=None)
models,predictions = clf.fit(X_train, X_test_, y_train, y_test_)
models

 90%|████████▉ | 26/29 [00:11<00:00,  3.01it/s]



100%|██████████| 29/29 [00:17<00:00,  1.66it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.84,0.74,,0.84,2.3
QuadraticDiscriminantAnalysis,0.75,0.73,,0.74,0.02
LGBMClassifier,0.83,0.72,,0.83,3.94
LinearDiscriminantAnalysis,0.77,0.68,,0.76,0.04
RandomForestClassifier,0.82,0.67,,0.81,1.85
ExtraTreesClassifier,0.82,0.66,,0.81,0.44
GaussianNB,0.69,0.66,,0.69,0.01
SGDClassifier,0.76,0.66,,0.76,0.1
LogisticRegression,0.79,0.66,,0.79,0.12
Perceptron,0.73,0.65,,0.73,0.03


In [12]:
clf_0 = XGBClassifier(learning_rate=0.1, max_depth=8, n_estimators=600)
X0, X_test0 = SelectX(X,df_Y["y0"],X_test)
clf_0.fit(X0, df_Y["y0"])
y0_pred = clf_0.predict(X_test0)

maskA = (df_Y["y0"]==0)
maskA_test = (y0_pred==0)
X1A = X[maskA]
y1A = df_Y[maskA]["y1"]
X1A_test = X_test[maskA_test]
clf_1A = XGBClassifier(learning_rate=0.1, max_depth=8, n_estimators=600)
X1A, X1A_test = SelectX(X1A,y1A,X1A_test)
clf_1A.fit(X1A, y1A)
y1A_pred = clf_1A.predict(X1A_test)
y1A_pred

maskB = (df_Y["y0"]==1)
maskB_test = (y0_pred==1)
X1B = X[maskB]
y1B = df_Y[maskB]["y1"]
X1B_test = X_test[maskB_test]
clf_1B = XGBClassifier(learning_rate=0.1, max_depth=8, n_estimators=600)
X1B, X1B_test = SelectX(X1B,y1B,X1B_test)
clf_1B.fit(X1B, y1B)
y1B_pred = clf_1B.predict(X1B_test)



In [13]:
y1A_pred.shape
a = np.empty((len(df_X_test),))
a[:]=np.nan
a[maskA_test] = y1A_pred
b = np.empty((len(df_X_test),))
b[:]=np.nan
b[maskB_test] = y1B_pred
b


array([nan, nan, nan, ..., nan, nan,  0.])

In [14]:
y_pred = pd.DataFrame({"y":np.nan,"y0":y0_pred, "y1":-1},index = df_X_test.index)
y_pred["y1"] = y_pred["y1"].where(~maskA_test, other=a)
y_pred["y1"] = y_pred["y1"].where(~maskB_test, other=b)
y_pred["y"] = 2*y_pred["y1"] + y_pred["y0"]
y_pred

Unnamed: 0,y,y0,y1
0,0,0,0
1,0,0,0
2,2,0,1
3,0,0,0
4,0,0,0
...,...,...,...
3406,0,0,0
3407,0,0,0
3408,0,0,0
3409,0,0,0


In [15]:
y_pred = y_pred.drop(columns=["y0","y1"])
y_pred.index.name = 'id'
y_pred

Unnamed: 0_level_0,y
id,Unnamed: 1_level_1
0,0
1,0
2,2
3,0
4,0
...,...
3406,0
3407,0
3408,0
3409,0


In [16]:
y_pred.to_csv("sub_xgb_cascade.csv", index=True)