In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, f1_score, recall_score, confusion_matrix


In [None]:
# Load the full dataset, then show its size, the value counts of the
# 'Whether the critically ill' column, and the first rows.
pd.set_option('display.max_columns', None)  # never truncate columns in rich output
ds = pd.read_csv('data/alldata.csv')
print(ds.shape)
print(ds['Whether the critically ill'].value_counts())
ds.head()

In [None]:
# Hold out 20% of the rows as the test set (fixed seed) and persist both partitions.
# The last column of `ds` is used as the label, everything before it as features.
X, y = ds.iloc[:, :-1], ds.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Re-attach the label as the last column of each partition, then write it to disk.
for features, labels, path in (
    (X_train, y_train, 'data/train.csv'),
    (X_test, y_test, 'data/test.csv'),
):
    features.insert(len(features.columns), labels.name, labels)
    features.to_csv(path, index=False)

**7 machine learning models**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

In [None]:
def cross_validation(x, y, model_class, model_paras, kfold, random_seed, outfile):
    """Run stratified k-fold cross-validation for one classifier and save per-fold scores.

    For every fold a fresh ``model_class(**model_paras)`` is fitted on the
    training part and scored on the held-out part. Accuracy, per-class recall,
    macro F1 and ROC AUC are printed per fold and written to ``outfile`` as CSV
    (one row per fold; columns ``acc, auc, recall_0, recall_1, f1``).

    Parameters
    ----------
    x : numpy.ndarray
        2-D feature matrix; rows are indexed with the fold index arrays.
    y : numpy.ndarray
        1-D label vector aligned with ``x``. Only the first two per-class
        recall values are stored (``recall_0``, ``recall_1``).
    model_class : type
        Estimator class exposing ``fit`` and ``predict`` (scikit-learn API).
    model_paras : dict
        Keyword arguments forwarded to ``model_class``.
    kfold : int
        Number of folds.
    random_seed : int
        Seed for the shuffled ``StratifiedKFold`` splitter.
    outfile : str, os.PathLike or file-like
        Destination of the per-fold score table. For path-like values the
        parent directory is created if it does not exist yet.
    """
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=random_seed)
    outdict = {'acc':[], 'auc':[], 'recall_0':[], 'recall_1':[], 'f1':[]}
    for i, (train_idx, test_idx) in enumerate(skf.split(x, y)):
        print(f'**** Fold {i + 1} ****')
        train_x, train_y = x[train_idx, :], y[train_idx]
        test_x, test_y = x[test_idx, :], y[test_idx]
        model = model_class(**model_paras)
        model.fit(train_x, train_y)
        y_pred = model.predict(test_x)

        # ROC AUC needs a continuous ranking score. Feeding it hard class
        # predictions collapses the curve to a single operating point, so use
        # class probabilities (or the decision function) whenever available.
        if hasattr(model, 'predict_proba'):
            y_score = model.predict_proba(test_x)
            if y_score.shape[1] == 2:
                # Binary case: roc_auc_score expects the score of the greater label.
                y_score = y_score[:, 1]
        elif hasattr(model, 'decision_function'):
            y_score = model.decision_function(test_x)
        else:
            # Last resort: keep the previous behaviour (AUC on hard predictions).
            y_score = y_pred

        acc = accuracy_score(test_y, y_pred)
        # average=None -> one recall value per class, in sorted label order.
        recall = recall_score(test_y, y_pred, labels=None, pos_label=1, average=None, sample_weight=None, zero_division='warn')
        f1 = f1_score(test_y, y_pred, average='macro')
        auc = roc_auc_score(test_y, y_score, multi_class='ovr')

        outdict['acc'].append(acc)
        outdict['auc'].append(auc)
        outdict['recall_0'].append(recall[0])
        outdict['recall_1'].append(recall[1])
        outdict['f1'].append(f1)

        print('acc   ', acc)
        print('recall', recall)
        print('f1    ', f1)
        print('auc   ', auc)
        print()
    outdf = pd.DataFrame(outdict)
    # Make sure the target directory exists so a fresh checkout does not lose
    # the whole run at the very last step. File-like targets are left alone.
    if isinstance(outfile, (str, os.PathLike)):
        os.makedirs(os.path.dirname(os.fspath(outfile)) or '.', exist_ok=True)
    outdf.to_csv(outfile, index=False)

In [None]:
# Training split as a plain NumPy array for the scikit-learn style models.
df = pd.read_csv('data/train.csv').to_numpy()
# Drop the first (idx) column from the features; the last column is the label.
x, y = df[:, 1:-1], df[:, -1]

In [None]:
# Logistic Regression: 5-fold stratified CV, per-fold scores written to CSV
model_class = LogisticRegression
model_paras = dict(random_state=0, max_iter=200)
outfile = 'output_ML/lr.cross_validation.csv'
cross_validation(x, y, model_class, model_paras,
                 kfold=5, random_seed=1, outfile=outfile)

In [None]:
# K-Nearest Neighbours: 5-fold stratified CV, per-fold scores written to CSV
model_class = KNeighborsClassifier
model_paras = dict(n_neighbors=4, algorithm='auto', metric='minkowski')
outfile = 'output_ML/knn.cross_validation.csv'
cross_validation(x, y, model_class, model_paras,
                 kfold=5, random_seed=1, outfile=outfile)

In [None]:
# Linear SVM: 5-fold stratified CV, per-fold scores written to CSV
model_class = SVC
# `degree` only applies to the 'poly' kernel, so it has no effect with 'linear'.
model_paras = dict(kernel='linear', degree=2, random_state=0, probability=True)
outfile = 'output_ML/svm.cross_validation.csv'
cross_validation(x, y, model_class, model_paras,
                 kfold=5, random_seed=1, outfile=outfile)

In [None]:
# RBF-kernel SVM: 5-fold stratified CV, per-fold scores written to CSV
model_class = SVC
# `degree` only applies to the 'poly' kernel, so it has no effect with 'rbf'.
model_paras = dict(kernel='rbf', degree=2, random_state=0, probability=True)
outfile = 'output_ML/kernel_svm.cross_validation.csv'
cross_validation(x, y, model_class, model_paras,
                 kfold=5, random_seed=1, outfile=outfile)

In [None]:
# Decision Tree: 5-fold stratified CV, per-fold scores written to CSV
model_class = DecisionTreeClassifier
model_paras = dict(criterion='entropy', random_state=0)
outfile = 'output_ML/dt.cross_validation.csv'
cross_validation(x, y, model_class, model_paras,
                 kfold=5, random_seed=1, outfile=outfile)

In [None]:
# Random Forest: 5-fold stratified CV, per-fold scores written to CSV
model_class = RandomForestClassifier
model_paras = dict(n_estimators=1000, criterion='entropy', random_state=0)
outfile = 'output_ML/rf.cross_validation.csv'
cross_validation(x, y, model_class, model_paras,
                 kfold=5, random_seed=1, outfile=outfile)

In [None]:
# XGBoost: 5-fold stratified CV, per-fold scores written to CSV
model_class = XGBClassifier
model_paras = dict(n_estimators=500, learning_rate=0.01, max_depth=3)
outfile = 'output_ML/xgb.cross_validation.csv'
cross_validation(x, y, model_class, model_paras,
                 kfold=5, random_seed=1, outfile=outfile)

**ANN model**

In [None]:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import metrics, initializers
from keras.layers import Dense, Dropout, Activation, BatchNormalization

In [None]:
def build_model(in_dim):
    """Return an uncompiled fully connected network for binary classification.

    Layout: ``in_dim`` inputs -> Dense(64, relu) -> Dense(128, relu)
    -> Dense(64, relu) -> Dense(1, sigmoid).

    Parameters
    ----------
    in_dim : int
        Number of input features.
    """
    stack = [
        Dense(units=64, activation='relu', input_shape=[in_dim]),
        Dense(units=128, activation='relu'),
        Dense(units=64, activation='relu'),
        Dense(units=1, activation='sigmoid'),  # single sigmoid output unit
    ]
    return Sequential(stack)

In [None]:
# 5-fold cross validation of the ANN.
# For each fold the training history is saved up to (and including) the epoch
# with the best monitored metric, so the last row of every fold file is that
# fold's selected epoch.
traindf = pd.read_csv('data/train.csv').to_numpy().astype('float32')
x = traindf[:, 1:-1]  # remove idx column and label column
y = traindf[:, -1].reshape(-1)

in_dim = x.shape[1]
lr = 0.0001
early_stopping = 10            # EarlyStopping patience, in epochs
monitor_metric = 'accuracy'    # training accuracy (not loss, not a val_* metric)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
os.makedirs('output', exist_ok=True)  # fold files are written here


for i, (train_idx, test_idx) in enumerate(skf.split(x, y)):
    print(f'**** Fold {i + 1} ****')
    model = build_model(in_dim)
    optimizers = tf.keras.optimizers.Adam(learning_rate= lr)
    model.compile(loss = 'binary_crossentropy', optimizer = optimizers, metrics = ['accuracy', metrics.AUC(name='AUC'), metrics.Precision(name='presicion'), metrics.Recall(name='recall'), metrics.F1Score(threshold=0.5, name='f1score', average='macro')])
    # Stop once training accuracy has not improved for `early_stopping` consecutive epochs.
    early = EarlyStopping(monitor = monitor_metric, mode = 'max', patience = early_stopping)
    train_x = x[train_idx, :]
    train_y = y[train_idx]
    test_x = x[test_idx, :]
    test_y = y[test_idx]
    results = model.fit(train_x, 
                        train_y, 
                        epochs = 600,
                        batch_size = 16, 
                        validation_data = (test_x, test_y), 
                        shuffle=True, 
                        verbose=0,
                        callbacks = [early])
    history = pd.DataFrame(results.history)
    # Keep epochs up to the best one. EarlyStopping only counts strict
    # improvements, so the best epoch is the first occurrence of the maximum.
    # When early stopping fired this equals dropping the trailing `patience`
    # rows; unlike a blind `iloc[:-patience]` it is also correct when training
    # ran to the epoch limit or lasted fewer than `patience` epochs.
    best_epoch = int(history[monitor_metric].to_numpy().argmax())
    outdf = history.iloc[:best_epoch + 1, :]
    outfile = f'output/5fold_cross_validation.fold_{i+1}.csv'
    outdf.to_csv(outfile, index=False)

In [None]:
# Cross-validation results: one row per fold, merged into a single table.
# The fold files are trimmed by the training cell so that (given Keras'
# EarlyStopping patience semantics) their last row is the best epoch of the
# fold. Take that last row: the previous `iloc[-2:-1]` picked the epoch before
# it and produced an empty frame (and a crash on the index assignment) for a
# single-row file.
df_list = []
for i in range(1, 6):
    df = pd.read_csv(f'output/5fold_cross_validation.fold_{i}.csv')
    df = df.iloc[-1:, :]
    df.index = [i]  # label the row with its fold number for in-notebook inspection
    df_list.append(df)
mergedf = pd.concat(df_list)
mergedf.to_csv('output/ANN.cross_validation.csv', index=False)

In [None]:
# Final run: fit on the whole training split, validate on the test split.
traindata = pd.read_csv('data/train.csv').to_numpy().astype('float32')
testdata = pd.read_csv('data/test.csv').to_numpy().astype('float32')
# First column is dropped from the features; last column is the label.
train_x, train_y = traindata[:, 1:-1], traindata[:, -1]
test_x, test_y = testdata[:, 1:-1], testdata[:, -1]

in_dim = train_x.shape[1]
lr = 0.0001
training_epochs = 150
model_path = 'model_ckpt/best_model.ckpt'

model = build_model(in_dim)
optimizers = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizers,
    metrics=[
        'accuracy',
        metrics.AUC(name='AUC'),
        metrics.Precision(name='presicion'),
        metrics.Recall(name='recall'),
        metrics.F1Score(threshold=0.5, name='f1score', average='macro'),
    ],
)

# Only the weights with the lowest validation loss seen so far are kept on disk.
# NOTE(review): validation_data below is the test split, so the saved "best"
# weights are selected using test data - confirm this is intended.
checkpoint = ModelCheckpoint(
    model_path,
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1,
    save_weights_only=True,
)
results = model.fit(
    train_x,
    train_y,
    epochs=training_epochs,
    batch_size=64,
    validation_data=(test_x, test_y),
    verbose=0,
    callbacks=[checkpoint],
)
# Per-epoch training/validation metrics as a table.
outdf = pd.DataFrame(results.history)