In [15]:
from gensim.models import Word2Vec
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd

DATA_PATH = "./resource"
# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)


In [16]:
def read_csv(filename):
    """ reading csv from filename """
    data = []
    with open(filename, "r") as file:
        csv_reader = csv.DictReader(file, delimiter=',')
        for row in csv_reader:
            data.append(row)
    header = list(data[0].keys())
    return header, data


def to_one_hot(label, num_class):
    """ convert to one hot label """
    one_hot_label = [0] * num_class
    for i in label:
        one_hot_label[i] = 1
    return one_hot_label

In [17]:
import pickle
import numpy as np

def save_pkl(path, obj):
  with open(path, 'wb') as f:
    pickle.dump(obj, f)
    print(" [*] save %s" % path)

def load_pkl(path):
  with open(path,'rb') as f:
    obj = pickle.load(f)
    print(" [*] load %s" % path)
    return obj

def save_npy(path, obj):
  np.save(path, obj)
  print(" [*] save %s" % path)

def load_npy(path):
  obj = np.load(path)
  print(" [*] load %s" % path)
  return obj

In [21]:
vocab = load_pkl(DATA_PATH + '/vocab.pkl')
TOTAL_NUM_CODES = len(vocab)
TOTAL_NUM_CODES

 [*] load ./resource/vocab.pkl


490

In [22]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    
    def __init__(self):
        self._data = load_pkl(DATA_PATH + '/data.pkl')
        self._label = load_pkl(DATA_PATH + '/label.pkl')


    
    def __len__(self):
        """ return the number of samples (i.e. patients). """
        return len(self._data)
    
    def __getitem__(self, index):
        data = self._data[index]
        label = self._label[index]
        x = torch.tensor([to_one_hot(visit, TOTAL_NUM_CODES) for visit in data], dtype=torch.float32)
        y = torch.tensor(label, dtype=torch.float32)
        return x, y

In [23]:
dataset = CustomDataset()
print('Size of dataset:', len(dataset))

 [*] load ./resource/data.pkl
 [*] load ./resource/label.pkl
Size of dataset: 3000


In [24]:
def collate_fn(data):

    sequences, labels = zip(*data)

    
    num_patients = len(sequences)
    num_visits = [patient.shape[0] for patient in sequences]
    total_num_codes = sequences[0].shape[1]

    max_num_visits = max(num_visits)
    #max_num_visits = MAX_LENGTH
    
    x = torch.zeros((num_patients, max_num_visits, total_num_codes), dtype=torch.float)
    y = torch.zeros((num_patients, max_num_visits), dtype=torch.float)
    #masks = torch.zeros((num_patients, max_num_visits, total_num_codes), dtype=torch.bool)
    l = torch.zeros((num_patients), dtype=torch.long)

    for i_patient, patient in enumerate(sequences):
        for j_visit, visit in enumerate(patient):
            x[i_patient,j_visit,:] = visit
            y[i_patient,j_visit] = labels[i_patient][j_visit]
        l[i_patient] = len(patient)
 
    return x, y,l

In [25]:
from torch.utils.data.dataset import random_split


split = int(len(dataset)*0.7)

lengths = [split, len(dataset) - split]
train_dataset, test_dataset = random_split(dataset, lengths)

print("Length of train dataset:", len(train_dataset))
print("Length of test dataset:", len(test_dataset))

Length of train dataset: 2100
Length of test dataset: 900


In [26]:
from torch.utils.data import DataLoader

# how many samples per batch to load
batch_size = 64

# prepare dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size,collate_fn=collate_fn, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size,collate_fn=collate_fn)

print("# of train batches:", len(train_loader))
print("# of test batches:", len(test_loader))

# of train batches: 33
# of test batches: 15


In [27]:
N_HIDDEN = 128
class GRUModel(nn.Module):
    
    def __init__(self):
        super().__init__()
        
        self.rnn = nn.GRU(input_size = TOTAL_NUM_CODES,hidden_size = N_HIDDEN, batch_first=True)
        self.l = nn.Linear(N_HIDDEN,1)
        self.act = nn.Sigmoid()
    
    def forward(self, x):
         
        hs,_ = self.rnn(x)
        o = self.act(self.l(hs))
        o = o.squeeze(dim = -1)
        return o
    

# load the model here
model = GRUModel()
model

GRUModel(
  (rnn): GRU(490, 128, batch_first=True)
  (l): Linear(in_features=128, out_features=1, bias=True)
  (act): Sigmoid()
)

In [28]:
criterion = nn.BCELoss()
LEARNING_RATE = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [29]:
from sklearn.metrics import *

#input: Y_score,Y_pred,Y_true
#output: accuracy, auc, precision, recall, f1-score
def classification_metrics(Y_score, Y_pred, Y_true):
    acc, auc, precision, recall, f1score = accuracy_score(Y_true, Y_pred), \
                                           roc_auc_score(Y_true, Y_score), \
                                           precision_score(Y_true, Y_pred), \
                                           recall_score(Y_true, Y_pred), \
                                           f1_score(Y_true, Y_pred)
    return acc, auc, precision, recall, f1score


#input: model, loader
def evaluate(model, loader):
    model.eval()
    all_y_true = torch.LongTensor()
    all_y_pred = torch.LongTensor()
    all_y_score = torch.FloatTensor()
    for x, y,l in loader:
        # pass the input through the model
        y_hat = model(x)
        y_pred = (y_hat > 0.5).type(torch.float)
        for i in range(y.shape[0]):
            all_y_true = torch.cat((all_y_true, y[i,:l[i]].to('cpu').flatten()), dim=0)
            all_y_pred = torch.cat((all_y_pred,  y_pred[i,:l[i]].to('cpu').flatten()), dim=0)
            all_y_score = torch.cat((all_y_score,  y_hat[i,:l[i]].to('cpu').flatten()), dim=0)
        
    acc, auc, precision, recall, f1 = classification_metrics(all_y_score.detach().numpy(), 
                                                             all_y_pred.detach().numpy(), 
                                                             all_y_true.detach().numpy())
    print(f"acc: {acc:.3f}, auc: {auc:.3f}, precision: {precision:.3f}, recall: {recall:.3f}, f1: {f1:.3f}")
    return

In [30]:
n_epochs = 6

model.train()

for epoch in range(n_epochs):
    
    train_loss = 0
    for x, y,l in train_loader:
        optimizer.zero_grad()
        y_hat = model(x)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        
    train_loss = train_loss / len(train_loader)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))
    evaluate(model, train_loader)
    evaluate(model, test_loader)

Epoch: 1 	Training Loss: 0.451584
acc: 0.773, auc: 0.640, precision: 0.850, recall: 0.004, f1: 0.008
acc: 0.775, auc: 0.644, precision: 0.861, recall: 0.004, f1: 0.008
Epoch: 2 	Training Loss: 0.266927
acc: 0.781, auc: 0.660, precision: 0.550, recall: 0.196, f1: 0.289
acc: 0.780, auc: 0.669, precision: 0.531, recall: 0.201, f1: 0.292
Epoch: 3 	Training Loss: 0.221783
acc: 0.796, auc: 0.723, precision: 0.657, recall: 0.214, f1: 0.323
acc: 0.794, auc: 0.725, precision: 0.626, recall: 0.220, f1: 0.326
Epoch: 4 	Training Loss: 0.201023
acc: 0.805, auc: 0.756, precision: 0.738, recall: 0.224, f1: 0.344
acc: 0.804, auc: 0.751, precision: 0.710, recall: 0.224, f1: 0.341
Epoch: 5 	Training Loss: 0.186731
acc: 0.820, auc: 0.780, precision: 0.757, recall: 0.309, f1: 0.439
acc: 0.819, auc: 0.774, precision: 0.736, recall: 0.306, f1: 0.432
Epoch: 6 	Training Loss: 0.181649
acc: 0.824, auc: 0.785, precision: 0.774, recall: 0.321, f1: 0.454
acc: 0.821, auc: 0.778, precision: 0.741, recall: 0.314, f1