# Sepsis Train / Validation / Test Split

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import random

In [0]:
# Load the combined Hospital A / Hospital B sepsis table. Later cells rely on
# its `filename` and `SepsisLabel` columns.
data = pd.read_csv(filepath_or_buffer="sepsisData_A_B.csv")

Split the patients into two separate DataFrames depending on whether they came from Hospital A or Hospital B (the cutoff was determined from the known filenames).

In [0]:
# Hospital membership is derived from the patient file name (string comparison):
# files up to and including the cutoff belong to Hospital A, later ones to B.
is_hospital_a = data["filename"] <= "p020643.psv"
is_hospital_b = data["filename"] > "p020643.psv"
data_HospitalA = data.loc[is_hospital_a]
data_HospitalB = data.loc[is_hospital_b]

In [0]:
# Report per-hospital sepsis prevalence, counted per patient (one patient ==
# one distinct filename). A patient is counted as septic if at least one of
# their rows has SepsisLabel == 1.
# The patient totals are derived from the data instead of being hard-coded, so
# the percentages stay correct if the input file or the hospital cutoff changes.
for hospital_name, hospital_df in (("A", data_HospitalA), ("B", data_HospitalB)):
    n_patients = hospital_df["filename"].nunique()
    n_sepsis = hospital_df.loc[hospital_df["SepsisLabel"] == 1, "filename"].nunique()
    print("Number of sepsis patients in Hospital " + hospital_name + ": " + str(n_sepsis))
    print("Proportion of patients with sepsis in Hospital " + hospital_name + ": " + str(n_sepsis / n_patients * 100) + "%")

Number of sepsis patients in Hospital A: 1790
Proportion of patients with sepsis in Hospital A: 8.802124311565697%
Number of sepsis patients in Hospital B: 1142
Proportion of patients with sepsis in Hospital B: 5.71%


In [0]:
def train_val_test_split(labels, description, test_size=0.2, val_size=0.25, random_state=42):
    """Split ``labels`` into train / validation / test subsets and print their sizes.

    The split is done with two successive ``train_test_split`` calls: the first
    holds out the test subset, the second carves the validation subset out of
    the remaining items.

    Parameters
    ----------
    labels : array-like
        Items to split (in this notebook: unique patient filenames).
    description : str
        Heading printed above the size summary.
    test_size : float or int, default 0.2
        ``test_size`` of the first split, i.e. the share of ``labels`` held
        out for testing.
    val_size : float or int, default 0.25
        ``test_size`` of the second split, i.e. the share of the *non-test*
        remainder used for validation. With the defaults this is
        0.25 * 0.8 = 20% of the entire input.
    random_state : int, default 42
        Seed forwarded to both splits so the result is reproducible.

    Returns
    -------
    tuple
        ``(train_labels, val_labels, test_labels)``.
    """
    # First split: hold out the test subset.
    train_val_labels, test_labels = train_test_split(
        labels, test_size=test_size, random_state=random_state
    )
    # Second split: validation is taken from what is left after the test hold-out.
    train_labels, val_labels = train_test_split(
        train_val_labels, test_size=val_size, random_state=random_state
    )

    print(description,":\n", 
          "Training", "\tValidation", "\tTest", "\n",
          len(train_labels), "\t\t", len(val_labels), "\t\t", len(test_labels), "\n")

    return train_labels, val_labels, test_labels

In [0]:
# Each of the four hospital x outcome groups is split separately. A patient is
# in the "sepsis" group if any of their rows has SepsisLabel == 1; all other
# patients of that hospital form the "no sepsis" group.

# Hospital A
septic_rows_A = data_HospitalA["SepsisLabel"] == 1
sepsis_A = np.unique(data_HospitalA.loc[septic_rows_A, "filename"])
train_A_sepsis, val_A_sepsis, test_A_sepsis = train_val_test_split(sepsis_A, "Hospital A, with sepsis")

septic_patient_rows_A = data_HospitalA["filename"].isin(sepsis_A)
nosepsis_A = np.unique(data_HospitalA.loc[~septic_patient_rows_A, "filename"])
train_A_nosepsis, val_A_nosepsis, test_A_nosepsis = train_val_test_split(nosepsis_A, "Hospital A, no sepsis")

# Hospital B
septic_rows_B = data_HospitalB["SepsisLabel"] == 1
sepsis_B = np.unique(data_HospitalB.loc[septic_rows_B, "filename"])
train_B_sepsis, val_B_sepsis, test_B_sepsis = train_val_test_split(sepsis_B, "Hospital B, with sepsis")

septic_patient_rows_B = data_HospitalB["filename"].isin(sepsis_B)
nosepsis_B = np.unique(data_HospitalB.loc[~septic_patient_rows_B, "filename"])
train_B_nosepsis, val_B_nosepsis, test_B_nosepsis = train_val_test_split(nosepsis_B, "Hospital B, no sepsis")

# Overall partition sizes across all four groups.
total_train = len(train_A_sepsis) + len(train_A_nosepsis) + len(train_B_sepsis) + len(train_B_nosepsis)
total_val = len(val_A_sepsis) + len(val_A_nosepsis) + len(val_B_sepsis) + len(val_B_nosepsis)
total_test = len(test_A_sepsis) + len(test_A_nosepsis) + len(test_B_sepsis) + len(test_B_nosepsis)

print("\nTotal split:\n", 
     "Training", "\tValidation", "\tTest", "\n",
      total_train, "\t\t", 
      total_val, "\t\t", 
      total_test, "\n")

Hospital A, with sepsis :
 Training 	Validation 	Test 
 1074 		 358 		 358 

Hospital A, no sepsis :
 Training 	Validation 	Test 
 11127 		 3709 		 3710 

Hospital B, with sepsis :
 Training 	Validation 	Test 
 684 		 229 		 229 

Hospital B, no sepsis :
 Training 	Validation 	Test 
 11314 		 3772 		 3772 


Total split:
 Training 	Validation 	Test 
 24199 		 8068 		 8069 



In [0]:
# Tag every row with its hospital, using the same filename cutoff as the
# Hospital A / Hospital B split above.
from_hospital_a = data["filename"] <= "p020643.psv"
data["Hospital"] = np.where(from_hospital_a, "A", "B")
data.head()

Unnamed: 0,filename,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Hospital
0,p000001.psv,,,,,,,,,,...,,,83.14,0,,,-0.03,1,0,A
1,p000001.psv,97.0,95.0,,98.0,75.33,,19.0,,,...,,,83.14,0,,,-0.03,2,0,A
2,p000001.psv,89.0,99.0,,122.0,86.0,,22.0,,,...,,,83.14,0,,,-0.03,3,0,A
3,p000001.psv,90.0,95.0,,,,,30.0,,24.0,...,,,83.14,0,,,-0.03,4,0,A
4,p000001.psv,103.0,88.5,,122.0,91.33,,24.5,,,...,,,83.14,0,,,-0.03,5,0,A


In [0]:
# Merge the four per-group filename arrays of each partition into one plain
# Python list of patient filenames.
train = np.concatenate(
    (train_A_sepsis, train_A_nosepsis, train_B_sepsis, train_B_nosepsis)
).tolist()
val = np.concatenate(
    (val_A_sepsis, val_A_nosepsis, val_B_sepsis, val_B_nosepsis)
).tolist()
test = np.concatenate(
    (test_A_sepsis, test_A_nosepsis, test_B_sepsis, test_B_nosepsis)
).tolist()

In [0]:
# Select, for each partition, every row whose patient file is in that
# partition's filename list.
train_data = data.loc[data["filename"].isin(train)]
val_data = data.loc[data["filename"].isin(val)]
test_data = data.loc[data["filename"].isin(test)]

In [0]:
# Preview the first rows of the training partition.
train_data.head(n=5)

Unnamed: 0,filename,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Hospital
154,p000005.psv,84.0,97.5,37.28,140.5,94.5,,17.5,,,...,,,28.09,1,1.0,0.0,-0.05,2,0,A
155,p000005.psv,80.0,99.0,,150.0,99.0,,18.0,,,...,,,28.09,1,1.0,0.0,-0.05,3,0,A
156,p000005.psv,74.0,97.0,37.22,142.0,103.0,,19.0,,,...,,,28.09,1,1.0,0.0,-0.05,4,0,A
157,p000005.psv,73.0,98.0,,144.0,99.0,,17.0,,,...,,,28.09,1,1.0,0.0,-0.05,5,0,A
158,p000005.psv,71.0,97.0,,144.0,,,17.0,,,...,,273.0,28.09,1,1.0,0.0,-0.05,6,0,A


In [0]:
# Write each partition to its own CSV file in the working directory.
# `to_csv` is called with its defaults, so the row index is written as well.
for partition_frame, csv_path in (
    (train_data, 'sepsis_train.csv'),
    (val_data, 'sepsis_val.csv'),
    (test_data, 'sepsis_test.csv'),
):
    partition_frame.to_csv(csv_path)