In [1]:
! pip install -q xgboost

In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import SVR
import numpy as np
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix



In [3]:
# Read processed_data.csv (path relative to the working directory) into train_df.
train_df = pd.read_csv(filepath_or_buffer='processed_data.csv')

In [4]:
# Targets: 'Outcome' is the classification label, 'LOS' the regression label.
y_regression = train_df['LOS']
y_classification = train_df['Outcome']

# Features: every column of train_df except PatientID and the two targets.
non_feature_cols = {'PatientID', 'Outcome', 'LOS'}
columns_to_keep = [name for name in train_df.columns if name not in non_feature_cols]
X = train_df[columns_to_keep]

#### Decision Tree 

In [5]:
# Module-level splitter read by run_dt: 10 shuffled folds, shuffle seed fixed at 1.
cross_validation = KFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)

def run_dt(f, outcome, los, task, seed, max_depth):
    """Cross-validate a decision tree and summarise the per-fold scores.

    Folds come from the module-level ``cross_validation`` splitter. A fresh
    tree is fitted on each fold's training rows and scored on its held-out
    rows.

    Parameters
    ----------
    f
        Feature table; rows are selected positionally through ``.iloc``.
    outcome
        Classification target (indexed through ``.iloc``); only used when
        ``task == 'classification'``.
    los
        Regression target (indexed through ``.iloc``); only used when
        ``task == 'regression'``.
    task : str
        ``'classification'`` (DecisionTreeClassifier scored with
        ``accuracy_score``) or ``'regression'`` (DecisionTreeRegressor scored
        with ``mean_squared_error``).
    seed
        Forwarded to the tree as ``random_state``.
    max_depth
        Forwarded to the tree as ``max_depth``.

    Returns
    -------
    str
        ``' <mean> ± <std>'`` of the fold scores, four decimals each, with a
        leading space.

    Raises
    ------
    ValueError
        If ``task`` is neither ``'classification'`` nor ``'regression'``.
    """
    # Resolve the task-specific pieces once so a single loop serves both tasks.
    if task == 'classification':
        estimator_cls, target, metric = DecisionTreeClassifier, outcome, accuracy_score
    elif task == 'regression':
        estimator_cls, target, metric = DecisionTreeRegressor, los, mean_squared_error
    else:
        # Fail loudly: an unrecognised task would otherwise yield None silently.
        raise ValueError(
            f"task must be 'classification' or 'regression', got {task!r}"
        )

    fold_scores = []
    for train_idx, test_idx in cross_validation.split(f, target):
        model = estimator_cls(random_state=seed, max_depth=max_depth)
        model.fit(f.iloc[train_idx], target.iloc[train_idx])
        predictions = model.predict(f.iloc[test_idx])
        fold_scores.append(metric(target.iloc[test_idx], predictions))
    return f' {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}'
            

#### Random Forest

In [6]:
# Module-level splitter read by run_rf: 10 shuffled folds, shuffle seed fixed at 1.
cross_validation = KFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)

def run_rf(f, outcome, los, task, estimators, seed, max_depth):
    """Cross-validate a random forest and summarise the per-fold scores.

    Folds come from the module-level ``cross_validation`` splitter. A fresh
    forest is fitted on each fold's training rows and scored on its held-out
    rows.

    Parameters
    ----------
    f
        Feature table; rows are selected positionally through ``.iloc``.
    outcome
        Classification target (indexed through ``.iloc``); only used when
        ``task == 'classification'``.
    los
        Regression target (indexed through ``.iloc``); only used when
        ``task == 'regression'``.
    task : str
        ``'classification'`` (RandomForestClassifier scored with
        ``accuracy_score``) or ``'regression'`` (RandomForestRegressor scored
        with ``mean_squared_error``).
    estimators
        Forwarded to the forest as ``n_estimators``.
    seed
        Forwarded to the forest as ``random_state``.
    max_depth
        Forwarded to the forest as ``max_depth``.

    Returns
    -------
    str
        ``' <mean> ± <std>'`` of the fold scores, four decimals each, with a
        leading space.

    Raises
    ------
    ValueError
        If ``task`` is neither ``'classification'`` nor ``'regression'``.
    """
    # Resolve the task-specific pieces once so a single loop serves both tasks.
    if task == 'classification':
        estimator_cls, target, metric = RandomForestClassifier, outcome, accuracy_score
    elif task == 'regression':
        estimator_cls, target, metric = RandomForestRegressor, los, mean_squared_error
    else:
        # Fail loudly: an unrecognised task would otherwise yield None silently.
        raise ValueError(
            f"task must be 'classification' or 'regression', got {task!r}"
        )

    fold_scores = []
    for train_idx, test_idx in cross_validation.split(f, target):
        model = estimator_cls(
            n_estimators=estimators, random_state=seed, max_depth=max_depth
        )
        model.fit(f.iloc[train_idx], target.iloc[train_idx])
        predictions = model.predict(f.iloc[test_idx])
        fold_scores.append(metric(target.iloc[test_idx], predictions))
    return f' {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}'
            

#### XGBoost

In [7]:
# Module-level splitter read by run_xgboost: 10 shuffled folds, shuffle seed fixed at 1.
cross_validation = KFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)

def run_xgboost(f, outcome, los, task, booster, estimators, max_depth, seed=None):
    """Cross-validate an XGBoost model and summarise the per-fold scores.

    Folds come from the module-level ``cross_validation`` splitter. A fresh
    model is fitted on each fold's training rows and scored on its held-out
    rows.

    Parameters
    ----------
    f
        Feature table; rows are selected positionally through ``.iloc``.
    outcome
        Classification target (indexed through ``.iloc``); only used when
        ``task == 'classification'``.
    los
        Regression target (indexed through ``.iloc``); only used when
        ``task == 'regression'``.
    task : str
        ``'classification'`` (``xgb.XGBClassifier`` scored with
        ``accuracy_score``) or ``'regression'`` (``xgb.XGBRegressor`` scored
        with ``mean_squared_error``).
    booster
        Forwarded to the model as ``booster``.
    estimators
        Forwarded to the model as ``n_estimators``. Note this is the number
        of boosting rounds, not a seed.
    max_depth
        Forwarded to the model as ``max_depth``.
    seed : optional
        When given, forwarded to the model as ``random_state``. When left at
        ``None`` no ``random_state`` argument is passed at all, so the model
        is constructed exactly as it was before this parameter existed.

    Returns
    -------
    str
        ``' <mean> ± <std>'`` of the fold scores, four decimals each, with a
        leading space.

    Raises
    ------
    ValueError
        If ``task`` is neither ``'classification'`` nor ``'regression'``.
    """
    # Resolve the task-specific pieces once so a single loop serves both tasks.
    # The xgb attributes are only looked up inside the matching branch.
    if task == 'classification':
        estimator_cls, target, metric = xgb.XGBClassifier, outcome, accuracy_score
    elif task == 'regression':
        estimator_cls, target, metric = xgb.XGBRegressor, los, mean_squared_error
    else:
        # Fail loudly: an unrecognised task would otherwise yield None silently.
        raise ValueError(
            f"task must be 'classification' or 'regression', got {task!r}"
        )

    # Only forward a seed the caller actually supplied.
    seed_kwargs = {} if seed is None else {'random_state': seed}

    fold_scores = []
    for train_idx, test_idx in cross_validation.split(f, target):
        model = estimator_cls(
            booster=booster,
            n_estimators=estimators,
            max_depth=max_depth,
            **seed_kwargs,
        )
        model.fit(f.iloc[train_idx], target.iloc[train_idx])
        predictions = model.predict(f.iloc[test_idx])
        fold_scores.append(metric(target.iloc[test_idx], predictions))
    return f' {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}'

#### SVM

In [8]:
# Module-level splitter read by run_svm: 10 shuffled folds, shuffle seed fixed at 1.
cross_validation = KFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)

def run_svm(f, outcome, los, task, k, c, g):
    """Cross-validate a support vector machine and summarise the fold scores.

    Folds come from the module-level ``cross_validation`` splitter. A fresh
    model is fitted on each fold's training rows and scored on its held-out
    rows.

    Parameters
    ----------
    f
        Feature table; rows are selected positionally through ``.iloc``.
    outcome
        Classification target (indexed through ``.iloc``); only used when
        ``task == 'classification'``.
    los
        Regression target (indexed through ``.iloc``); only used when
        ``task == 'regression'``.
    task : str
        ``'classification'`` (SVC scored with ``accuracy_score``) or
        ``'regression'`` (SVR scored with ``mean_squared_error``).
    k
        Forwarded to the model as ``kernel``.
    c
        Forwarded to the model as ``C``.
    g
        Forwarded to the model as ``gamma``.

    Returns
    -------
    str
        ``' <mean> ± <std>'`` of the fold scores, four decimals each, with a
        leading space.

    Raises
    ------
    ValueError
        If ``task`` is neither ``'classification'`` nor ``'regression'``.
    """
    # Resolve the task-specific pieces once so a single loop serves both tasks.
    if task == 'classification':
        estimator_cls, target, metric = SVC, outcome, accuracy_score
    elif task == 'regression':
        estimator_cls, target, metric = SVR, los, mean_squared_error
    else:
        # Fail loudly: an unrecognised task would otherwise yield None silently.
        raise ValueError(
            f"task must be 'classification' or 'regression', got {task!r}"
        )

    # NOTE(review): the features reach the SVM exactly as passed in, with no
    # scaling step here. If the caller has not standardised them, consider
    # doing so, since kernel SVMs are sensitive to feature scale. TODO confirm.
    fold_scores = []
    for train_idx, test_idx in cross_validation.split(f, target):
        model = estimator_cls(kernel=k, C=c, gamma=g)
        model.fit(f.iloc[train_idx], target.iloc[train_idx])
        predictions = model.predict(f.iloc[test_idx])
        fold_scores.append(metric(target.iloc[test_idx], predictions))
    return f' {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}'




In [9]:
def main():
    """Cross-validate every model configuration on both tasks and print a report.

    Reads the module-level ``X``, ``y_classification`` and ``y_regression``.
    The regression section ('LOS Prediction:') is produced first, then two
    lines holding a single space, then the classification section
    ('Outcome  Prediction:'). Each section runs two configurations of the
    decision tree, random forest, XGBoost and SVM helpers and prints the
    string each helper returns.

    The XGBoost lines are labelled ``n_estimators: 42`` because the value 42
    is passed in ``run_xgboost``'s ``estimators`` position; no seed is given
    to XGBoost by these calls.
    """
    # (task handed to the run_* helpers, heading printed above its results)
    sections = (
        ('regression', 'LOS Prediction:'),
        ('classification', 'Outcome  Prediction:'),
    )

    for position, (task, heading) in enumerate(sections):
        if position:
            # Separator between the two sections.
            print(" ")
            print(" ")

        # dt: (seed, max_depth)
        dt_combination1 = run_dt(X, y_classification, y_regression, task, 42, 5)
        dt_combination2 = run_dt(X, y_classification, y_regression, task, 42, 10)

        # rf: (n_estimators, seed, max_depth)
        rf_combination1 = run_rf(X, y_classification, y_regression, task, 50, 42, 5)
        rf_combination2 = run_rf(X, y_classification, y_regression, task, 50, 42, 10)

        # xgboost: (booster, n_estimators, max_depth)
        xgboost_combination1 = run_xgboost(X, y_classification, y_regression, task, 'gbtree', 42, 5)
        xgboost_combination2 = run_xgboost(X, y_classification, y_regression, task, 'dart', 42, 10)

        # svm: (kernel, C, gamma)
        svm_combination1 = run_svm(X, y_classification, y_regression, task, 'rbf', 0.1, 'scale')
        svm_combination2 = run_svm(X, y_classification, y_regression, task, 'rbf', 1.0, 'auto')

        # Results for this task
        print(heading)
        print("1. Decision Tree: (random_state: 42, max_depth: 5), {}\n \t \t (random_state: 42, max_depth: 10), {}".format(dt_combination1, dt_combination2))
        print("2. Random Forest: (n_estimators: 50, random_state: 42, max_depth: 5), {}\n \t \t (n_estimators: 50, random_state: 42, max_depth: 10), {}".format(rf_combination1, rf_combination2))
        print("3. XgBoost: (booster: gbtree, n_estimators: 42, max_depth: 5), {}\n \t \t (booster: dart, n_estimators: 42, max_depth: 10), {}".format(xgboost_combination1, xgboost_combination2))
        print("4. SVM: (kernel: rbf, C: 0.1, gamma: scale), {}\n \t \t (kernel: rbf, C: 1.0, gamma: auto), {}".format(svm_combination1, svm_combination2))


if __name__ == "__main__":
    main()

LOS Prediction:
1. Decision Tree: (random_state: 42, max_depth: 5),  52.2216 ± 20.9596
 	 	 (random_state: 42, max_depth: 10),  62.3406 ± 21.6673
2. Random Forest: (n_estimators: 50, random_state: 42, max_depth: 5),  40.3391 ± 17.6600
 	 	 (n_estimators: 50, random_state: 42, max_depth: 10),  40.1438 ± 17.3493
3. XgBoost: (booster: gbtree, random_state: 42, max_depth: 5),  42.2067 ± 13.7943
 	 	 (booster: dart, random_state: 42, max_depth: 10),  44.0573 ± 18.4344
4. SVM: (kernel: rbf, C: 0.1, gamma: scale),  43.9145 ± 19.2166
 	 	 (kernel: rbf, C: 1.0, gamma: auto),  48.5667 ± 17.6198
 
 
Outcome  Prediction:
1. Decision Tree: (random_state: 42, max_depth: 5),  0.9094 ± 0.0662
 	 	 (random_state: 42, max_depth: 10),  0.9004 ± 0.0801
2. Random Forest: (n_estimators: 50, random_state: 42, max_depth: 5),  0.9640 ± 0.0345
 	 	 (n_estimators: 50, random_state: 42, max_depth: 10),  0.9670 ± 0.0409
3. XgBoost: (booster: gbtree, random_state: 42, max_depth: 5),  0.9730 ± 0.0365
 	 	 (booster: 

In [10]:
# Read test.csv and keep train_df's columns, in train_df's column order.
test_df = pd.read_csv('test.csv')[train_df.columns]
test_df

Unnamed: 0,PatientID,Age,Sex,Outcome,hemoglobin,Serum chloride,Prothrombin time,eosinophils(%),Alkaline phosphatase,albumin,...,γ-glutamyl transpeptidase,International standard ratio,basophil count(#),mean corpuscular hemoglobin,serum sodium,thrombocytocrit,glutamic-pyruvic transaminase,eGFR,creatinine,LOS
0,374.0,33.0,1.0,1.0,119.0,128.2,23.25,0.0,143.0,27.4,...,176.0,2.105,0.035,30.15,164.7,0.13,1508.0,69.4,118.0,4.0
1,21.0,39.0,1.0,0.0,137.0,101.25,13.0,0.75,50.5,38.65,...,24.5,0.98,0.0,30.35,137.35,0.27,26.5,106.2,80.0,15.0
2,281.0,81.0,0.0,1.0,120.0,98.2,16.1,0.0,126.0,24.3,...,76.0,1.3,0.02,32.0,139.8,0.09,23.0,58.0,82.0,3.0
3,11.0,32.0,1.0,0.0,150.0,104.25,13.3,0.1,47.5,41.05,...,17.5,1.01,0.0,31.85,139.75,0.17,12.5,117.2,72.5,13.0
4,276.0,73.0,1.0,1.0,148.0,97.9,16.2,0.6,60.0,31.4,...,32.0,1.3,0.02,31.2,136.1,0.22,33.0,81.4,82.0,2.0
5,223.0,82.0,1.0,1.0,165.0,101.5,14.8,0.05,61.5,27.5,...,34.5,1.14,0.015,30.0,143.7,0.265,21.5,55.45,107.5,7.0
6,304.0,67.0,1.0,1.0,117.0,113.4,16.6,0.0,80.0,25.2,...,12.0,1.33,0.02,30.65,156.0,0.11,15.0,69.3,97.0,25.0
7,162.0,57.0,1.0,0.0,163.0,101.8,13.4,1.2,66.0,41.9,...,36.0,1.02,0.03,30.6,138.5,0.26,56.0,86.0,86.0,21.0
8,75.0,88.0,0.0,0.0,110.0,102.6,14.7,0.4,46.0,32.1,...,12.0,1.13,0.02,31.9,142.9,0.21,10.0,79.3,58.0,7.0
9,137.0,60.0,1.0,0.0,133.0,101.7,14.6,0.5,68.5,33.15,...,23.5,1.13,0.015,32.3,138.05,0.125,36.0,98.45,68.5,8.0


In [11]:
# Pull the two targets out of the test frame; the features are what remains
# once PatientID and both target columns are dropped.
y_test_regression = test_df['LOS']
y_test_classification = test_df['Outcome']
X_test = test_df.drop(columns=['PatientID', 'Outcome', 'LOS'])

In [12]:
# Fit the selected random-forest configuration on the full training data and
# predict LOS for the test rows.
# NOTE(review): in the recorded cross-validation output the max_depth=10 forest
# scored marginally lower MSE than max_depth=5; presumably the shallower model
# was preferred on purpose. TODO confirm.
best_regression_model = RandomForestRegressor(
    n_estimators=50,
    max_depth=5,
    random_state=42,
)
best_regression_model.fit(X, y_regression)
test__regression_predictions = best_regression_model.predict(X_test)

# Performance metrics 1-3: mean squared error, explained variance, mean absolute error.
mse_test, evs, mae = (
    metric(y_test_regression, test__regression_predictions)
    for metric in (mean_squared_error, explained_variance_score, mean_absolute_error)
)

print('LOS Predictions:')
print(
    "Random Forest: Mean-squared error score, {}\n \t \t Explained variance score, {}\n \t \t Mean Absolute Error, {}".format(
        mse_test, evs, mae
    )
)


LOS Predictions:
Random Forest: Mean-squared error score, 28.33211687765665
 	 	 Explained variance score, 0.16509921147500317
 	 	 Mean Absolute Error, 3.8037560532800327


In [13]:
# Fit the selected XGBoost configuration on the full training data and
# predict Outcome for the test rows.
best_classification_model = xgb.XGBClassifier(booster='gbtree', n_estimators=42, max_depth=5)
best_classification_model.fit(X, y_classification)
test_classification_predications = best_classification_model.predict(X_test)

# Performance metric 1
acc_test = accuracy_score(y_test_classification, test_classification_predications)
# Performance metric 2
precision = precision_score(y_test_classification, test_classification_predications)
recall = recall_score(y_test_classification, test_classification_predications)
f1 = f1_score(y_test_classification, test_classification_predications)
# Performance metric 3
# .tolist() yields a plain nested list of ints. list() over the 2-D array
# would keep each row as an ndarray and print as "[array([...]), array([...])]".
cm = confusion_matrix(y_test_classification, test_classification_predications).tolist()
print('Outcome Predictions:')
print("XgBoost: Accuracy Score: {}\n \t (Precision, recall, f1): {}\n \t  Confusion Matrix: {}".format(acc_test, (precision, recall,f1), cm))



Outcome Predictions:
XgBoost: Accuracy Score: 1.0
 	 (Precision, recall, f1): (1.0, 1.0, 1.0)
 	  Confusion Matrix: [array([15,  0]), array([ 0, 15])]
