In [None]:
# Mount Google Drive (Colab only) so the CSV files read later in this notebook,
# which live under /content/drive/MyDrive, are reachable from the runtime.
from google.colab import drive
drive.mount('/content/drive')

# https://github.com/HCYENDLURI/Comparing-6-Classifiers-for-Sepsis-Dataset/blob/master/sepsis_lr.ipynb



Mounted at /content/drive


In [None]:
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
 
from sklearn.utils import resample

def class_balancer(df, lookback=10, random_state=123):
  """Build a roughly class-balanced frame from per-patient records.

  The returned frame is the concatenation of three parts, in this order:

  1. every row of patients whose ``SepsisLabel`` is 1 on all of their rows
     ("admitted with sepsis");
  2. for each patient who has both 0 and 1 labels ("developed sepsis"), the
     rows from ``lookback`` rows before the first septic row through the end
     of that patient's record;
  3. a random sample of rows from patients who never have ``SepsisLabel == 1``,
     sized to match the combined row count of parts 1 and 2.

  Args:
    df: DataFrame with at least ``Patient_ID`` and ``SepsisLabel`` columns.
      Rows of one patient are assumed to be in chronological order
      (presumably one row per hour -- TODO confirm against the data source).
    lookback: number of rows kept before the first septic row of each
      patient who developed sepsis.
    random_state: seed passed to ``DataFrame.sample`` so that the sampled
      non-septic rows are reproducible between runs.

  Returns:
    A new DataFrame with a fresh 0..n-1 index; ``df`` is not modified.
  """
  # A patient counts as "admitted with sepsis" when every label is truthy.
  always_septic = df.groupby('Patient_ID')['SepsisLabel'].all()
  admitted_with_sepsis_list = always_septic[always_septic].index
  admitted_with_sepsis_df = df[df['Patient_ID'].isin(admitted_with_sepsis_list)]

  # Patients with at least one septic row, minus those septic from the start.
  septic_patients_list = df.loc[df['SepsisLabel'] == 1, 'Patient_ID'].unique()
  septic_df = df[df['Patient_ID'].isin(septic_patients_list)]
  developed_sepsis_df = septic_df[~septic_df['Patient_ID'].isin(admitted_with_sepsis_list)]

  # Collect one window per patient and concatenate once at the end; growing a
  # frame with pd.concat inside the loop is quadratic and, when seeded with an
  # empty object-dtype frame, can interfere with the result dtypes.
  windows = []
  for _, group in developed_sepsis_df.groupby('Patient_ID'):
    # Work with positions inside the group rather than index labels, so the
    # window is correct even when df does not carry a default RangeIndex.
    first_septic_pos = int((group['SepsisLabel'] == 1).to_numpy().argmax())
    windows.append(group.iloc[max(first_septic_pos - lookback, 0):])

  # Rows from patients who never get sepsis; cap the request at what is
  # available so small inputs do not make sample() raise.
  never_septic_df = df[~df['Patient_ID'].isin(septic_patients_list)]
  wanted = sum(len(window) for window in windows) + admitted_with_sepsis_df.shape[0]
  num_of_zeros = min(wanted, never_septic_df.shape[0])
  nosepsis = never_septic_df.sample(n=num_of_zeros, random_state=random_state)

  return pd.concat([admitted_with_sepsis_df, *windows, nosepsis]).reset_index(drop=True)

def data_pipe(dataset):
  """Turn a labelled frame into a class-balanced ``(X, y)`` pair.

  Drops ``Patient_ID``, bootstraps the smaller ``SepsisLabel`` class (sampling
  with replacement, fixed seed 123) up to the size of the larger one, and
  splits the result into features and label-encoded targets.

  Args:
    dataset: DataFrame containing ``Patient_ID``, ``SepsisLabel`` (0/1) and
      the feature columns.

  Returns:
    ``(X, y)`` where ``X`` is the feature DataFrame (larger class first,
    resampled smaller class after it) and ``y`` is the array produced by
    ``LabelEncoder.fit_transform`` on the ``SepsisLabel`` column.

  Raises:
    ValueError: if exactly one of the two classes has no rows, because there
      is nothing to resample from.
  """
  dataset = dataset.drop('Patient_ID', axis=1)

  negatives = dataset[dataset.SepsisLabel == 0]
  positives = dataset[dataset.SepsisLabel == 1]

  # Pick the roles from the actual counts instead of assuming that label 0 is
  # the majority; with the roles hard-coded, an input with more positives than
  # negatives would be shrunk (with duplicates) rather than upsampled.
  # Ties keep the original assignment (0 = majority, 1 = minority).
  if positives.shape[0] > negatives.shape[0]:
    df_majority, df_minority = positives, negatives
  else:
    df_majority, df_minority = negatives, positives

  if df_minority.empty and not df_majority.empty:
    raise ValueError("data_pipe needs rows of both SepsisLabel classes to balance them")

  df_minority_upsampled = resample(df_minority,
                                   replace=True,     # sample with replacement
                                   n_samples=df_majority.shape[0],    # to match majority class
                                   random_state=123) # reproducible results

  dataset = pd.concat([df_majority, df_minority_upsampled])

  y = dataset['SepsisLabel']
  X = dataset.drop('SepsisLabel', axis=1)

  labelencoder_Y = preprocessing.LabelEncoder()
  y = labelencoder_Y.fit_transform(y)
  return X, y

In [None]:
import pandas as pd

# Training split: read the CSV, balance it per patient, then build the
# resampled feature matrix and encoded targets.
file = '/content/drive/MyDrive/DataScience/project/no_additional_features.zip (Unzipped Files)/train_set_interpolation_with_multivariate.csv'
df2 = class_balancer(pd.read_csv(file))
X_train, y_train = data_pipe(df2)

# file = '/content/drive/MyDrive/DataScience/project/no_additional_features.zip (Unzipped Files)/val_set_interpolation_with_constant.csv'
# df = pd.read_csv(file)
# X_val, y_val = data_pipe(df)

# Test split: evaluated as-is (no balancing or resampling); only the
# identifier column is removed before separating features from the label.
file = '/content/drive/MyDrive/DataScience/project/no_additional_features.zip (Unzipped Files)/test_set_interpolation_with_multivariate.csv'
df = pd.read_csv(file).drop(columns='Patient_ID')
X_test = df.drop(columns='SepsisLabel')
y_test = df['SepsisLabel']

In [None]:
# Preview the first rows of `df`; after the loading cell above it holds the
# test set with Patient_ID already dropped.
df.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,BaseExcess,HCO3,FiO2,...,Hgb,PTT,WBC,Platelets,Age,Gender,Hour,HospAdmTime,ICULOS,SepsisLabel
0,85.122941,97.122983,36.847779,124.980418,87.675668,69.192047,18.48414,-2.875918,22.550402,0.780955,...,10.782647,38.775468,10.582446,191.272694,49.0,1.0,0,-0.12,1,0
1,68.0,97.0,36.60596,104.0,84.0,77.0,18.0,-3.192458,21.842536,2.465079,...,9.5,39.2,9.8,90.0,49.0,1.0,1,-0.12,2,0
2,61.0,100.0,36.551504,103.0,79.0,72.0,15.0,-3.185734,21.846396,2.375771,...,9.491304,39.2,9.873913,92.347826,49.0,1.0,2,-0.12,3,0
3,67.0,100.0,36.66163,110.0,78.0,69.0,16.0,-3.206266,21.837977,2.171402,...,9.482609,39.2,9.947826,94.695652,49.0,1.0,3,-0.12,4,0
4,61.0,100.0,32.0,94.0,76.0,71.0,17.0,-2.916778,22.162556,4.037484,...,9.473913,39.2,10.021739,97.043478,49.0,1.0,4,-0.12,5,0


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, log_loss, f1_score,roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from xgboost import XGBClassifier

# Candidate models; each one is fitted on the balanced training data and
# scored on the untouched test split.
classifiers = [
    # MLPClassifier(
    # activation='tanh',
    # solver='lbfgs',
    # early_stopping=False,
    # hidden_layer_sizes=(40,10,10,10,10, 2),
    # random_state=1,
    # batch_size='auto',
    # max_iter=13000,
    # learning_rate_init=1e-5,
    # tol=1e-4), F1 Score:  [0.87949282 0.08519042]
    AdaBoostClassifier(),
    RandomForestClassifier(max_depth =5, min_samples_split = 6, min_samples_leaf = 7,n_estimators=1700),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    XGBClassifier(),
    ]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss", "F1 Score 0", "F1 Score 1"]
# Rows are collected in a plain list and turned into a DataFrame once after
# the loop: DataFrame.append no longer exists in pandas 2.x.
log_rows = []

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print("="*30)
    print(name)

    print('****Results****')
    y_pred = clf.predict(X_test)
    y_pred_proba = clf.predict_proba(X_test)

    acc = accuracy_score(y_test, y_pred)
    print("Accuracy: {:.4%}".format(acc))

    # Per-class F1 (index 0 = no sepsis, index 1 = sepsis), computed once.
    f1_per_class = f1_score(y_test, y_pred, average=None)
    print("F1 Score: ", f1_per_class)

    # ROC AUC is a ranking metric, so feed it the positive-class probability
    # rather than the thresholded 0/1 predictions.
    print("ROC AUC Score: ", roc_auc_score(y_test, y_pred_proba[:, 1]))

    ll = log_loss(y_test, y_pred_proba)
    print("Log Loss: {}".format(ll))

    log_rows.append([name, acc*100, ll, f1_per_class[0], f1_per_class[1]])

    # from_predictions expects (y_true, y_pred); passing them the other way
    # round swaps the "True label" / "Predicted label" axes. Drawing on an
    # explicit Axes also avoids leaving behind an empty extra figure.
    fig, ax = plt.subplots(figsize=(3, 3))
    ConfusionMatrixDisplay.from_predictions(
        y_test, y_pred, display_labels=['No Sepsis','Sepsis'], cmap='Blues', ax=ax)
    plt.show()


print("="*30)

log = pd.DataFrame(log_rows, columns=log_cols)

In [None]:
import matplotlib.patches as mpatches

# Accuracy and log-loss charts share one layout; only the plotted column,
# bar colour and captions differ.
for column, colour, x_label, title in (
    ('Accuracy', 'b', 'Accuracy %', 'Classifier Accuracy'),
    ('Log Loss', 'g', 'Log Loss', 'Classifier Log Loss'),
):
    sns.set_color_codes("muted")
    sns.barplot(x=column, y='Classifier', data=log, color=colour)
    plt.xlabel(x_label)
    plt.title(title)
    plt.show()

# F1 chart: both per-class scores are drawn on the same axes, class 0 first
# and class 1 overlaid on top of it.
bar1 = sns.barplot(x="F1 Score 0", y="Classifier", data=log, color='darkblue')
bar2 = sns.barplot(x="F1 Score 1", y="Classifier", data=log, estimator=sum, color='lightblue')

# Legend entries are built by hand because the two overlaid bar sets carry no labels.
top_bar, bottom_bar = (
    mpatches.Patch(color=patch_colour, label=patch_label)
    for patch_colour, patch_label in (
        ('darkblue', 'F1 score for 0'),
        ('lightblue', 'F1 score for 1'),
    )
)
plt.legend(handles=[top_bar, bottom_bar])
plt.xlabel('F1 Score')
plt.title('Classifier F1 Score')
plt.show()

In [None]:
# Create classification matrices
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [None]:
# Derive the number of classes from the training labels instead of hard-coding
# 5: the rest of this notebook treats SepsisLabel as binary, and extra classes
# only make the booster fit outputs for labels that never occur.
params = {
    "objective": "multi:softprob",
    "tree_method": "gpu_hist",
    "num_class": len(np.unique(y_train)),
}
n = 1000  # upper bound on boosting rounds evaluated by the cross-validation

# 5-fold cross-validation on the training matrix; returns per-round mean/std
# of each requested metric for the train and test folds.
results = xgb.cv(
   params, dtrain_clf,
   num_boost_round=n,
   nfold=5,
   metrics=["mlogloss", "auc", "merror"],
)

In [None]:
# List the columns of the cross-validation result: a mean and std column per
# metric for the train and test folds.
results.keys()

Index(['train-mlogloss-mean', 'train-mlogloss-std', 'train-auc-mean',

      'train-auc-std', 'train-merror-mean', 'train-merror-std',

      'test-mlogloss-mean', 'test-mlogloss-std', 'test-auc-mean',

      'test-auc-std', 'test-merror-mean', 'test-merror-std'],

     dtype='object')