# Sepsis Competition - BMEG400D
## Training a Model
## By Sergei Issaev

### Import Libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np
RANDOM_STATE = 74

### Load in Data

In [3]:
df_train = pd.read_csv('training.csv')
df_test = pd.read_csv('testing.csv')
df_all = pd.concat([df_train, df_test], sort=True).reset_index(drop=True)

### Simple EDA

In [4]:
df_train.head()

Unnamed: 0,level_0,Age,BUN,Creatinine,DBP,FiO2,Glucose,HR,Hct,Hgb,...,O2Sat,Potassium,Resp,SBP,SepsisLabel,Sex,Temp,WBC,index,pH
0,0,65.71,24.701493,1.561401,62.834031,0.519281,132.268194,19.145484,30.893636,10.273107,...,97.045014,4.129589,19.145484,121.760289,0,0,36.878981,11.407253,0.0,7.383057
1,1,65.71,14.0,0.8,52.0,0.519281,132.268194,18.0,27.6,10.273107,...,97.0,4.129589,18.0,107.5,0,0,36.878981,11.407253,1.0,7.383057
2,2,65.71,24.701493,1.561401,61.5,0.519281,253.0,19.5,30.893636,10.273107,...,98.5,5.0,19.5,124.5,0,0,36.78,11.407253,2.0,7.36
3,3,65.71,24.701493,1.561401,58.5,0.519281,132.268194,17.0,30.893636,10.273107,...,96.5,4.129589,17.0,117.5,0,0,36.878981,11.407253,3.0,7.383057
4,4,65.71,24.701493,1.561401,61.0,0.519281,132.268194,26.0,30.893636,10.273107,...,100.0,4.129589,26.0,125.0,0,0,36.878981,11.407253,4.0,7.383057


In [5]:
print('Number of Training Examples = {}'.format(df_train.shape[0]))
print('Number of Test Examples = {}\n'.format(df_test.shape[0]))
print('Training X Shape = {}'.format(df_train.shape))
print('Training y Shape = {}\n'.format(df_train['SepsisLabel'].shape[0]))
print('Test X Shape = {}'.format(df_test.shape))
print('Test y Shape = {}\n'.format(df_test.shape[0]))
print(df_train.columns)
print(df_test.columns)

Number of Training Examples = 199393
Number of Test Examples = 38713

Training X Shape = (199393, 23)
Training y Shape = 199393

Test X Shape = (38713, 23)
Test y Shape = 38713

Index(['level_0', 'Age', 'BUN', 'Creatinine', 'DBP', 'FiO2', 'Glucose', 'HR',
       'Hct', 'Hgb', 'ICULOS', 'MAP', 'Magnesium', 'O2Sat', 'Potassium',
       'Resp', 'SBP', 'SepsisLabel', 'Sex', 'Temp', 'WBC', 'index', 'pH'],
      dtype='object')
Index(['level_0', 'Age', 'BUN', 'Creatinine', 'DBP', 'FiO2', 'Glucose', 'HR',
       'Hct', 'Hgb', 'ICULOS', 'MAP', 'Magnesium', 'O2Sat', 'Potassium',
       'Resp', 'SBP', 'SepsisLabel', 'Sex', 'Temp', 'WBC', 'index', 'pH'],
      dtype='object')


### Biological Interpretation of the Columns

age = age <br>
bun = blood urea nitrogen <br>
creatinine = lab value <br>
dbp = diastolic bp <br>
fi02 = fraction of inspired oxygen<br>
glucose = serum glucose<br>
hr = heart rate<br>
hct = hematocrit<br>
hgb = hemoglobin<br>
iculos = hours since icu admit <br>
map = mean arterial bp<br>
magnesium = lab value<br>
o2sat = pulse oximetry<br>
potassium = lab value <br>
resp = respiration rate<br>
sbp = systolic bp<br>
sepsislabel = target value<br>
sex = gender<br>
temp = temperature<br>
wbc = leukocyte count<br>
pH = acidity<br>


In [6]:
print(df_train.info())
df_train.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199393 entries, 0 to 199392
Data columns (total 23 columns):
level_0        199393 non-null int64
Age            199393 non-null float64
BUN            199393 non-null float64
Creatinine     199393 non-null float64
DBP            199393 non-null float64
FiO2           199393 non-null float64
Glucose        199393 non-null float64
HR             199393 non-null float64
Hct            199393 non-null float64
Hgb            199393 non-null float64
ICULOS         199393 non-null int64
MAP            199393 non-null float64
Magnesium      199393 non-null float64
O2Sat          199393 non-null float64
Potassium      199393 non-null float64
Resp           199393 non-null float64
SBP            199393 non-null float64
SepsisLabel    199393 non-null int64
Sex            199393 non-null int64
Temp           199393 non-null float64
WBC            199393 non-null float64
index          71 non-null float64
pH             199393 non-null float64
dtyp

Unnamed: 0,level_0,Age,BUN,Creatinine,DBP,FiO2,Glucose,HR,Hct,Hgb,...,O2Sat,Potassium,Resp,SBP,SepsisLabel,Sex,Temp,WBC,index,pH
46471,46471,63.9,24.701493,1.561401,61.0,0.519281,132.268194,17.0,30.893636,10.273107,...,97.0,4.129589,17.0,106.0,0,0,36.878981,11.407253,,7.383057
188638,188638,75.0,24.701493,1.561401,64.0,0.519281,132.268194,22.0,30.893636,10.273107,...,99.0,4.129589,22.0,108.0,0,0,36.878981,11.407253,,7.383057
175,175,39.28,24.701493,1.561401,65.0,0.519281,132.268194,26.0,30.893636,10.273107,...,95.0,4.129589,26.0,93.0,0,1,36.878981,11.407253,,7.383057


In [7]:
print(df_test.info())
df_test.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38713 entries, 0 to 38712
Data columns (total 23 columns):
level_0        38713 non-null int64
Age            38713 non-null float64
BUN            38713 non-null float64
Creatinine     38713 non-null float64
DBP            38713 non-null float64
FiO2           38713 non-null float64
Glucose        38713 non-null float64
HR             38713 non-null float64
Hct            38713 non-null float64
Hgb            38713 non-null float64
ICULOS         38713 non-null int64
MAP            38713 non-null float64
Magnesium      38713 non-null float64
O2Sat          38713 non-null float64
Potassium      38713 non-null float64
Resp           38713 non-null float64
SBP            38713 non-null float64
SepsisLabel    38713 non-null int64
Sex            38713 non-null int64
Temp           38713 non-null float64
WBC            38713 non-null float64
index          61 non-null float64
pH             38713 non-null float64
dtypes: float64(19), int64(4

Unnamed: 0,level_0,Age,BUN,Creatinine,DBP,FiO2,Glucose,HR,Hct,Hgb,...,O2Sat,Potassium,Resp,SBP,SepsisLabel,Sex,Temp,WBC,index,pH
9894,9894,81.25,12.0,0.7,36.0,0.519281,72.5,17.0,23.4,8.2,...,97.0,4.8,17.0,102.0,0,0,36.878981,10.6,,7.44
8814,8814,52.41,24.701493,1.561401,59.0,0.5,137.0,12.0,30.893636,10.273107,...,100.0,4.1,12.0,100.0,0,1,37.45,11.407253,,7.33
2321,2321,70.28,24.701493,1.561401,62.834031,0.5,132.268194,26.0,30.893636,10.273107,...,100.0,4.129589,26.0,102.0,0,0,36.878981,11.407253,,7.383057


In [8]:
for col in df_train.columns.tolist():          
    print('{} column missing values: {}'.format(col, df_train[col].isnull().sum()))

level_0 column missing values: 0
Age column missing values: 0
BUN column missing values: 0
Creatinine column missing values: 0
DBP column missing values: 0
FiO2 column missing values: 0
Glucose column missing values: 0
HR column missing values: 0
Hct column missing values: 0
Hgb column missing values: 0
ICULOS column missing values: 0
MAP column missing values: 0
Magnesium column missing values: 0
O2Sat column missing values: 0
Potassium column missing values: 0
Resp column missing values: 0
SBP column missing values: 0
SepsisLabel column missing values: 0
Sex column missing values: 0
Temp column missing values: 0
WBC column missing values: 0
index column missing values: 199322
pH column missing values: 0


In [9]:
for col in df_test.columns.tolist():          
    print('{} column missing values: {}'.format(col, df_train[col].isnull().sum()))

level_0 column missing values: 0
Age column missing values: 0
BUN column missing values: 0
Creatinine column missing values: 0
DBP column missing values: 0
FiO2 column missing values: 0
Glucose column missing values: 0
HR column missing values: 0
Hct column missing values: 0
Hgb column missing values: 0
ICULOS column missing values: 0
MAP column missing values: 0
Magnesium column missing values: 0
O2Sat column missing values: 0
Potassium column missing values: 0
Resp column missing values: 0
SBP column missing values: 0
SepsisLabel column missing values: 0
Sex column missing values: 0
Temp column missing values: 0
WBC column missing values: 0
index column missing values: 199322
pH column missing values: 0


### Rearrange columns

In [10]:
trainer = df_train[["ICULOS", "HR", "O2Sat", "Temp", "SBP", "MAP", "DBP", "Resp", "FiO2", "pH", "BUN", "Creatinine", "Glucose", "Magnesium", "Potassium", "Hct", "Hgb", "WBC", "Age", "Sex"]]

In [11]:
trainertest = df_test[["ICULOS", "HR", "O2Sat", "Temp", "SBP", "MAP", "DBP", "Resp", "FiO2", "pH", "BUN", "Creatinine", "Glucose", "Magnesium", "Potassium", "Hct", "Hgb", "WBC", "Age", "Sex"]]

In [12]:
trainer.sample(3)

Unnamed: 0,ICULOS,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,FiO2,pH,BUN,Creatinine,Glucose,Magnesium,Potassium,Hct,Hgb,WBC,Age,Sex
57253,6,10.0,100.0,37.5,141.0,101.0,79.0,10.0,0.5,7.42,16.0,0.8,101.0,2.047995,4.1,32.2,10.273107,11.407253,57.15,1
98353,87,17.0,97.0,36.878981,121.760289,81.555129,62.834031,17.0,0.519281,7.383057,24.701493,1.561401,132.268194,2.047995,4.129589,30.893636,10.273107,11.407253,33.0,0
186058,49,19.145484,96.0,37.2,131.0,89.0,59.0,19.145484,0.519281,7.383057,39.0,0.92,123.0,2.2,3.9,30.893636,10.273107,11.407253,76.0,1


### Train-Test Split

In [13]:
#Select columns
X = trainer.iloc[:, :].values
y = df_train.iloc[:, list(df_train.columns).index('SepsisLabel')].values

In [14]:
#Select columns
X_t = trainertest.iloc[:, :].values
y_t = df_test.iloc[:, list(df_train.columns).index('SepsisLabel')].values

In [15]:
# Feature scaling
#sc = StandardScaler()
#x_train = sc.fit_transform(x_train)
#x_test = sc.transform(x_test)

### Train Algorithm

In [18]:
#Set the minimum error arbitrarily large
maxim = 0
count = 0 #Used for keeping track of the iteration number
#How many runs to perform using randomly selected hyperparameters
iterations = 40
for i in range(iterations):
    print('iteration number', count)
    count += 1 #increment count
    try:
        d_train = lgb.Dataset(X, label=y) #Load in data
        params = {} #initialize parameters
        params['learning_rate'] = np.random.uniform(0, 0.5)
        params['boosting_type'] = np.random.choice(['gbdt', 'dart', 'goss'])
        params['objective'] = 'binary'
        params['metric'] = 'binary_logloss'
        params['sub_feature'] = np.random.uniform(0, 1)
        params['num_leaves'] = np.random.randint(20, 300)
        params['min_data'] = np.random.randint(10, 100)
        params['max_depth'] = np.random.randint(5, 200)
        iterations = np.random.randint(10, 750)
        print(params, iterations)#Train using selected parameters
        clf = lgb.train(params, d_train, iterations)
        y_pred=clf.predict(X_t) #Create predictions on test set
        y_predx = y_pred.round(0)
        y_predx = y_predx.astype(int)

        roc=roc_auc_score(y_predx, y_t)
        print('ROC:', roc)
        if roc > maxim:
            maxim = roc
            pp = params 
    except: #in case something goes wrong
        print('failed with')
        print(params)
print("*" * 50)
print('Maximum is: ', maxim)
print('Used params', pp)

iteration number 0
{'learning_rate': 0.004809088746856094, 'boosting_type': 'dart', 'objective': 'binary', 'metric': 'binary_logloss', 'sub_feature': 0.7230893771244701, 'num_leaves': 250, 'min_data': 68, 'max_depth': 62} 113
failed with
{'learning_rate': 0.004809088746856094, 'boosting_type': 'dart', 'objective': 'binary', 'metric': 'binary_logloss', 'sub_feature': 0.7230893771244701, 'num_leaves': 250, 'min_data': 68, 'max_depth': 62}
iteration number 1
{'learning_rate': 0.23954068735309497, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'binary_logloss', 'sub_feature': 0.44509753764214643, 'num_leaves': 130, 'min_data': 65, 'max_depth': 185} 121
ROC: 0.5464040274979388
iteration number 2
{'learning_rate': 0.28292252348228114, 'boosting_type': 'goss', 'objective': 'binary', 'metric': 'binary_logloss', 'sub_feature': 0.9553110589813447, 'num_leaves': 45, 'min_data': 49, 'max_depth': 89} 699
ROC: 0.5236355393178281
iteration number 3
{'learning_rate': 0.18264687268218627, 'b

ROC: 0.5381077764124521
iteration number 32
{'learning_rate': 0.43412896707314286, 'boosting_type': 'goss', 'objective': 'binary', 'metric': 'binary_logloss', 'sub_feature': 0.537624246968727, 'num_leaves': 248, 'min_data': 71, 'max_depth': 107} 519
ROC: 0.5236182863737882
iteration number 33
{'learning_rate': 0.3950095058429497, 'boosting_type': 'dart', 'objective': 'binary', 'metric': 'binary_logloss', 'sub_feature': 0.3357127422742755, 'num_leaves': 42, 'min_data': 14, 'max_depth': 93} 466
ROC: 0.5289488644707088
iteration number 34
{'learning_rate': 0.4375192849115154, 'boosting_type': 'goss', 'objective': 'binary', 'metric': 'binary_logloss', 'sub_feature': 0.7465971060930211, 'num_leaves': 219, 'min_data': 33, 'max_depth': 76} 84
ROC: 0.5209562033403283
iteration number 35
{'learning_rate': 0.35001174169458393, 'boosting_type': 'goss', 'objective': 'binary', 'metric': 'binary_logloss', 'sub_feature': 0.7929110097570514, 'num_leaves': 76, 'min_data': 36, 'max_depth': 5} 185
ROC: 0

### Use Best Parameters to Define Model

In [19]:
params = {} #initialize parameters
params['learning_rate'] = 0.09631061598944796
params['boosting_type'] = 'goss'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.7050168519189473
params['num_leaves'] = 27
params['min_data'] = 29
params['max_depth'] = 150
iterations = 22
print(params, iterations)
clf = lgb.train(params, d_train, iterations)

{'learning_rate': 0.09631061598944796, 'boosting_type': 'goss', 'objective': 'binary', 'metric': 'binary_logloss', 'sub_feature': 0.7050168519189473, 'num_leaves': 27, 'min_data': 29, 'max_depth': 150} 22


### Make Predictions and Evaluate

In [20]:
y_pred=clf.predict(X_t)
y_predx = y_pred.round(0)
y_predx = y_predx.astype(int)
roc_auc_score(y_predx, y_t)

0.7773329664716828

In [21]:
clf.save_model('lgbm_model5.mdl')

<lightgbm.basic.Booster at 0x2c155b95ec8>