In [103]:
import os
import torch
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
from sklearn.metrics import roc_auc_score, roc_curve, f1_score
import matplotlib.pyplot as plt
from tqdm import tqdm

from utils import calc_accuracy, calc_f1_score
from preprocessing import preprocessing, read_dataframes

train_data_path = './data/train'
test_data_path = './data/test'

vitals_features = ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2']
lab_features = ['BaseExcess','HCO3','FiO2','pH','PaCO2','SaO2','AST','BUN','Alkalinephos','Calcium','Chloride','Creatinine','Bilirubin_direct','Glucose','Lactate','Magnesium','Phosphate','Potassium','Bilirubin_total','TroponinI','Hct','Hgb','PTT','WBC','Fibrinogen','Platelets']
demographic_features = ['Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS']
target = 'SepsisLabel'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [114]:
class ICUSepsisDataset(torch.utils.data.Dataset):
    def __init__(self, data_dir, features=None, limit=None):        
        _dfs = read_dataframes(data_dir, limit)
        self.df = preprocessing(_dfs, interactions=False, dropna=False)
        
        if features:
            self.features = features
        else:
            missing_value_rate = (self.df.isna().sum() / len(self.df)).sort_values()
            self.features = list(missing_value_rate[missing_value_rate <= 0.22].index)
            self.features.remove(target)
            self.target = target

        self.df.dropna(subset=self.features, inplace=True)

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        X, y = self.df.iloc[idx][self.features].to_numpy(), self.df.iloc[idx][target]
        return torch.from_numpy(X).float(), torch.Tensor([y]).long()

In [115]:
train_dataset = ICUSepsisDataset(train_data_path)
test_dataset = ICUSepsisDataset(test_data_path, features=train_dataset.features)

100%|██████████| 20000/20000 [01:14<00:00, 269.27it/s]


Found 20000 dataframes in ./data/train


100%|██████████| 20000/20000 [02:25<00:00, 137.10it/s]
100%|██████████| 10000/10000 [00:33<00:00, 301.99it/s]


Found 10000 dataframes in ./data/test


100%|██████████| 10000/10000 [01:11<00:00, 139.31it/s]


In [139]:
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512)

In [160]:
class SepsisPredictionModel4(torch.nn.Module):
    def __init__(self, input_size):
        super(SepsisPredictionModel4, self).__init__()
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(input_size, 56),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.4),
            torch.nn.Linear(56, 56),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.2),
            torch.nn.Linear(56, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 2),
            torch.nn.ReLU(),
        )

    def forward(self, x):
        x = self.mlp(x)
        return x

class SepsisPredictionModel5(torch.nn.Module):
    def __init__(self, input_size):
        super(SepsisPredictionModel5, self).__init__()
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(input_size, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.33),
            torch.nn.BatchNorm1d(128),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.33),
            torch.nn.Linear(64, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 2),
            torch.nn.ReLU(),
        )

    def forward(self, x):
        x = self.mlp(x)
        return x


model = SepsisPredictionModel5(input_size=len(train_dataset.features))
model.to(device)

SepsisPredictionModel5(
  (mlp): Sequential(
    (0): Linear(in_features=52, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.33, inplace=False)
    (3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.33, inplace=False)
    (7): Linear(in_features=64, out_features=32, bias=True)
    (8): ReLU()
    (9): Linear(in_features=32, out_features=2, bias=True)
    (10): ReLU()
  )
)

In [161]:
loss = torch.nn.CrossEntropyLoss(weight=torch.Tensor([1, 5]))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

EPOCHS = 250
loss_mem = []

model.train()
for epoch in range(EPOCHS):
    train_loss = 0.0
    i = 0
    for x,y in train_loader:
        i += 1
        optimizer.zero_grad()

        x = x.to(device)
        y = y.to(device).squeeze()

        # Forward pass
        output = model(x)

        L = loss(output, y)
        train_loss += L.item() * x.size(0)
        
        loss_mem.append(float(train_loss))

        # Backpropagation
        L.backward()
        optimizer.step()

    train_loss /= len(train_loader)

    print(f'Epoch {epoch+1}/{EPOCHS}, Loss {train_loss}')
    
    # print('-> Saving state')
    # torch.save(model.state_dict(), args.output_path)

Epoch 1/250, Loss 317.89735648424727
Epoch 2/250, Loss 281.3067148250082
Epoch 3/250, Loss 272.4550819526548
Epoch 4/250, Loss 267.0612734193387
Epoch 5/250, Loss 268.25270726110625
Epoch 6/250, Loss 265.02155021480894
Epoch 7/250, Loss 261.9246261845464
Epoch 8/250, Loss 261.66709162230075
Epoch 9/250, Loss 264.4745526547017
Epoch 10/250, Loss 257.7874801599461
Epoch 11/250, Loss 263.19827308602953
Epoch 12/250, Loss 263.44754368325937
Epoch 13/250, Loss 258.17974337546724
Epoch 14/250, Loss 261.0695016967214
Epoch 15/250, Loss 257.43992287179697
Epoch 16/250, Loss 257.8487406632175
Epoch 17/250, Loss 258.85401060270226
Epoch 18/250, Loss 255.42729353904724
Epoch 19/250, Loss 256.5625149452168
Epoch 20/250, Loss 257.6919973907263
Epoch 21/250, Loss 254.78246214337972
Epoch 22/250, Loss 252.03402283528575
Epoch 23/250, Loss 244.59785151611203
Epoch 24/250, Loss 245.5572893127151
Epoch 25/250, Loss 243.8438140646271
Epoch 26/250, Loss 242.3246797154779
Epoch 27/250, Loss 244.86957682863

In [162]:
y_true = []
y_predict = []

model.eval()
with tqdm(total=len(test_loader)) as pbar:
    with torch.no_grad():
        for x,y in test_loader:
            # ignore invalid samples

            x = x.to(device)
            y = y.to(device).squeeze()

            # Forward pass
            output = model(x)
            prediction = torch.softmax(output, dim=1).argmax(dim=1)

            y_true += list(y.flatten())
            y_predict += list(prediction.flatten())

            pbar.update(y.shape[0])

y_true = np.array(y_true)
y_predict = np.array(y_predict)

5759it [00:05, 1112.28it/s]           


In [163]:
calc_accuracy(y_predict, y_true)

0.9053655148463274

In [164]:
calc_f1_score(y_predict, y_true)

tp =  228
fp =  292
fn =  253
tn =  4986


0.45554445554445555

In [165]:
torch.save(model.state_dict, 'nn_005.state')