LSTM example: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

In [9]:
import torch
import pandas as pd
import numpy as np
import os
import utils
import tqdm
from scipy.interpolate import interp1d

In [10]:
class ICUSepsisDataset(torch.utils.data.Dataset):
    features = ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP','Resp', 'Age', 'Gender']
    target = 'SepsisLabel'
    physiological = ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP','Resp']
    def __init__(self, path):
        self.files = [os.path.join(path, f) for f in os.listdir(path)]
        print(f'Found {len(self.files)} files in {path}')
    
    def __len__(self):
        return len(self.files)
    
    def __getitem__(self, i):
        data = pd.read_csv(self.files[i], sep='|')
        if data[ICUSepsisDataset.features].isna().all(axis=0).any():
            # some physiological feature is missing
            # currently, just return None,None
            # TODO: implement better imputation mechanism
            return None, None

        for f in ICUSepsisDataset.physiological:
            _x = data[f][~data[f].isna()]
            try:
                interp = interp1d(_x.index, _x.values, fill_value='extrapolate', kind='nearest')
                data[f] = interp(data[f].index)
            except:
                return None, None

        # Remove the data after the first sepsis trigger
        if (data[ICUSepsisDataset.target] == 1).any():
            first_sepsis_idx = data[ICUSepsisDataset.target].idxmax()
            data = data[:first_sepsis_idx+1]

        X = data[ICUSepsisDataset.features].to_numpy()
        y = data[ICUSepsisDataset.target].to_numpy()
        
        assert len(X) == len(y)
        
        if np.isnan(X).any() or np.isnan(y).any():
            raise ValueError('DF has missing values', i, data)
        return torch.from_numpy(X).float(), torch.from_numpy(y)
        


train_data_path = '/home/student/data/train'
test_data_path = '/home/student/data/test'

icu_train = ICUSepsisDataset(train_data_path)
icu_test = ICUSepsisDataset(test_data_path)

train_dataloader = torch.utils.data.DataLoader(icu_train, shuffle=True, batch_size=1)
test_dataloader = torch.utils.data.DataLoader(icu_test, shuffle=True, batch_size=1)


Found 20000 files in /home/student/data/train
Found 10000 files in /home/student/data/test


In [11]:
class SepsisPredictionModel(torch.nn.Module):
    def __init__(self, input_size, hidden_dim=100):
        super(SepsisPredictionModel, self).__init__()
        self.lstm = torch.nn.LSTM(input_size, hidden_size=hidden_dim, batch_first=True, num_layers=2)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_dim, 100), 
            torch.nn.ReLU(), 
            torch.nn.Linear(100, 20), 
            torch.nn.ReLU(), 
            torch.nn.Linear(20, 2)
        )
        self.log_softmax = torch.nn.LogSoftmax(dim=1)
    
    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.mlp(x)
        x = self.log_softmax(x)
        return x

model = SepsisPredictionModel(input_size=len(ICUSepsisDataset.features))
print(model)

SepsisPredictionModel(
  (lstm): LSTM(9, 100, num_layers=2, batch_first=True)
  (mlp): Sequential(
    (0): Linear(in_features=100, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=2, bias=True)
  )
  (log_softmax): LogSoftmax(dim=1)
)


In [12]:
EPOCHS = 2
loss = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters())
for epoch in range(EPOCHS):
    with tqdm.tqdm(total=len(icu_train)) as pbar:
        for x,y in icu_train:
            if x is None or y is None:
                continue
            model.zero_grad()
            y_prob = model(x)
            L = loss(y_prob, y)
            L.backward()
            optimizer.step()
            pbar.update(1)
    print(f'Epoch {epoch+1}/{EPOCHS+1}, Loss {L}')
    

 46%|████▌     | 9235/20000 [04:07<04:49, 37.24it/s]


KeyboardInterrupt: 

In [19]:
y_prob.squeeze()

tensor([-0.0059, -5.1343], grad_fn=<SqueezeBackward0>)

In [None]:
total = 0
correct = 0
for x,y in test_dataloader:
    _y = model(x)
    correct += sum(_y.argmax(dim=2).flatten() == y.flatten())
    print(correct)
    total += len(y)
    break

In [None]:
def calc_accuracy(data_loader, model, sample_size=None):
    if sample_size is None:
        sample_size = len(data_loader)
    total = 0
    correct = 0
    i = 0
    for x,y in data_loader:
        i += 1
        _y = model(x)
        correct += sum(_y.argmax(dim=2).flatten() == y.flatten())
        total += len(y.flatten())
        i += 1
        if i >= sample_size:
            break
    
    return correct / total

calc_accuracy(test_dataloader, model, sample_size=10)