In [219]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd

DATA_PATH = "./resource"
# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

In [220]:
import pickle
import numpy as np

def save_pkl(path, obj):
  with open(path, 'wb') as f:
    pickle.dump(obj, f)
    print(" [*] save %s" % path)

def load_pkl(path):
  with open(path,'rb') as f:
    obj = pickle.load(f)
    print(" [*] load %s" % path)
    return obj

def save_npy(path, obj):
  np.save(path, obj)
  print(" [*] save %s" % path)

def load_npy(path):
  obj = np.load(path)
  print(" [*] load %s" % path)
  return obj

In [221]:
vocab = load_pkl(DATA_PATH + '/vocab.pkl')
TOTAL_NUM_CODES = len(vocab)
TOTAL_NUM_CODES

 [*] load ./resource/vocab.pkl


490

In [222]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    
    def __init__(self):
        self._data = load_pkl(DATA_PATH + '/data.pkl')
        self._label = load_pkl(DATA_PATH + '/label.pkl')


    
    def __len__(self):
        """ return the number of samples (i.e. patients). """
        return len(self._data)
    
    def __getitem__(self, index):
        data = self._data[index]
        label = self._label[index]
        return data, label

In [224]:
dataset = CustomDataset()
print('Size of dataset:', len(dataset))

 [*] load ./resource/data.pkl
 [*] load ./resource/label.pkl
Size of dataset: 3000


In [225]:
def collate_fn(data):

    sequences, labels = zip(*data)

    num_patients = len(sequences)
    num_visits = [len(patient) for patient in sequences]
    num_codes = [len(visit) for patient in sequences for visit in patient]

    max_num_visits = max(num_visits)
    max_num_codes = max(num_codes)

    y = torch.zeros((num_patients, max_num_visits), dtype=torch.float)

    x = torch.zeros((num_patients, max_num_visits, max_num_codes), dtype=torch.long)
    rev_x = torch.zeros((num_patients, max_num_visits, max_num_codes), dtype=torch.long)
    masks = torch.zeros((num_patients, max_num_visits, max_num_codes), dtype=torch.bool)
    rev_masks = torch.zeros((num_patients, max_num_visits, max_num_codes), dtype=torch.bool)
    l = torch.zeros((num_patients), dtype=torch.long)
    for i_patient, patient in enumerate(sequences):
        for j_visit, visit in enumerate(patient):
            x[i_patient,j_visit,0:len(visit)] = torch.Tensor(visit)
            masks[i_patient,j_visit,0:len(visit)] = torch.ones(len(visit))
            rev_j = len(patient) - j_visit - 1
            rev_x[i_patient,rev_j,0:len(visit)] = torch.Tensor(visit)
            rev_masks[i_patient,rev_j,0:len(visit)] = torch.ones(len(visit))
            y[i_patient,j_visit] = labels[i_patient][j_visit]
        l[i_patient] = len(patient)

    
    return x, masks, rev_x, rev_masks, y, l

In [234]:
from torch.utils.data.dataset import random_split

split = int(len(dataset)*0.8)

lengths = [split, len(dataset) - split]
train_dataset, val_dataset = random_split(dataset, lengths)

print("Length of train dataset:", len(train_dataset))
print("Length of val dataset:", len(val_dataset))

Length of train dataset: 2400
Length of val dataset: 600


In [235]:
from torch.utils.data import DataLoader

def load_data(train_dataset, val_dataset, collate_fn):
    
     
    batch_size = 32
    train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)

    
    return train_loader, val_loader


train_loader, val_loader = load_data(train_dataset, val_dataset, collate_fn)

In [228]:
def sum_embeddings_with_mask(x, masks):
    x_masked = x * torch.unsqueeze(masks,3)
    return torch.sum(x_masked, axis=2)

In [229]:
def applyMask(hidden_states, masks):
    a = torch.sum(masks, axis = 2)
    a = a>0
    a = a.unsqueeze(-1)

    return torch.mul(hidden_states,a)

In [230]:
HIDDEN_SIZE=128
EMBEDDING_DIM=256


class EmbRNN(nn.Module):
    
    def __init__(self, num_codes):
        super().__init__()
        self.embedding = None
        self.rnn = None
        self.rev_rnn = None
        self.fc = None
        self.sigmoid = None
        
        # your code here
        self.embedding = nn.Embedding(num_embeddings=num_codes, embedding_dim=EMBEDDING_DIM)
        self.rnn = nn.GRU(input_size=EMBEDDING_DIM,hidden_size=HIDDEN_SIZE, batch_first=True)
        self.rev_rnn = nn.GRU(input_size=HIDDEN_SIZE,hidden_size=HIDDEN_SIZE, batch_first=True)
        self.fc = nn.Linear(in_features=2*HIDDEN_SIZE,out_features=1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x, masks, rev_x, rev_masks):
        """
        Arguments:
            x: the diagnosis sequence of shape (batch_size, # visits, # diagnosis codes)
            masks: the padding masks of shape (batch_size, # visits, # diagnosis codes)

        Outputs:
            probs: probabilities of shape (batch_size)
        """
        
        batch_size = x.shape[0]
        
        # 1. Pass the sequence through the embedding layer;
        x = self.embedding(x)
        # 2. Sum the embeddings for each diagnosis code up for a visit of a patient.
        x = sum_embeddings_with_mask(x, masks)
        
        # 3. Pass the embegginds through the RNN layer;
        output, _ = self.rnn(x)
        # 4. Obtain the hidden state at the last visit.
        true_h_n= applyMask(output,masks)
        
        true_h_n_rev = None
        rev_x = self.embedding(rev_x)
        rev_x = sum_embeddings_with_mask(rev_x, rev_masks)
        rev_output, _ = self.rnn(rev_x)
        true_h_n_rev = applyMask(rev_output,rev_masks)
        
        # 6. Pass the hidden state through the linear and activation layers.
        logits = self.fc(torch.cat([true_h_n, true_h_n_rev], 2))  
        probs = self.sigmoid(logits)
        return probs.squeeze(-1)
    

# load the model here
model = EmbRNN(num_codes = TOTAL_NUM_CODES)
model

EmbRNN(
  (embedding): Embedding(490, 256)
  (rnn): GRU(256, 128, batch_first=True)
  (rev_rnn): GRU(128, 128, batch_first=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [233]:
LEARNING_RATE = 0.01
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(rnn.parameters(),lr=LEARNING_RATE)
from sklearn.metrics import *

#input: Y_score,Y_pred,Y_true
#output: accuracy, auc, precision, recall, f1-score
def classification_metrics(Y_score, Y_pred, Y_true):
    acc, auc, precision, recall, f1score = accuracy_score(Y_true, Y_pred), \
                                           roc_auc_score(Y_true, Y_score), \
                                           precision_score(Y_true, Y_pred), \
                                           recall_score(Y_true, Y_pred), \
                                           f1_score(Y_true, Y_pred)
    return acc, auc, precision, recall, f1score


#input: model, loader
def evaluate(model, loader):
    model.eval()
    all_y_true = torch.LongTensor()
    all_y_pred = torch.LongTensor()
    all_y_score = torch.FloatTensor()
    for x, masks, rev_x, rev_masks, y, l in loader:
        # pass the input through the model
        y_hat = model(x, masks, rev_x, rev_masks)
        y_pred = (y_hat > 0.5).type(torch.float)
        for i in range(y.shape[0]):
            all_y_true = torch.cat((all_y_true, y[i,:l[i]].to('cpu').flatten()), dim=0)
            all_y_pred = torch.cat((all_y_pred,  y_pred[i,:l[i]].to('cpu').flatten()), dim=0)
            all_y_score = torch.cat((all_y_score,  y_hat[i,:l[i]].to('cpu').flatten()), dim=0)
        
    acc, auc, precision, recall, f1 = classification_metrics(all_y_score.detach().numpy(), 
                                                             all_y_pred.detach().numpy(), 
                                                             all_y_true.detach().numpy())
    print(f"acc: {acc:.3f}, auc: {auc:.3f}, precision: {precision:.3f}, recall: {recall:.3f}, f1: {f1:.3f}")
    return

In [236]:
import time
import resource 

n_epochs = 6
time_start = time.perf_counter()


for epoch in range(n_epochs):

    model.train()

    train_loss = 0
    for x, masks, rev_x, rev_masks, y, l in train_loader:
        optimizer.zero_grad()
        y_hat = model(x, masks, rev_x, rev_masks)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        
    train_loss = train_loss / len(train_loader)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))
    evaluate(model, train_loader)
    evaluate(model, val_loader)
    
time_elapsed = (time.perf_counter() - time_start)
memMb=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024.0/1024.0
print ("%5.1f secs %5.1f MByte" % (time_elapsed,memMb))

Epoch: 1 	Training Loss: 0.665273
acc: 0.628, auc: 0.498, precision: 0.235, recall: 0.281, f1: 0.256
acc: 0.637, auc: 0.505, precision: 0.236, recall: 0.276, f1: 0.254
Epoch: 2 	Training Loss: 0.665340
acc: 0.628, auc: 0.498, precision: 0.235, recall: 0.281, f1: 0.256
acc: 0.637, auc: 0.505, precision: 0.236, recall: 0.276, f1: 0.254
Epoch: 3 	Training Loss: 0.665327
acc: 0.628, auc: 0.498, precision: 0.235, recall: 0.281, f1: 0.256
acc: 0.637, auc: 0.505, precision: 0.236, recall: 0.276, f1: 0.254
Epoch: 4 	Training Loss: 0.665236
acc: 0.628, auc: 0.498, precision: 0.235, recall: 0.281, f1: 0.256
acc: 0.637, auc: 0.505, precision: 0.236, recall: 0.276, f1: 0.254
Epoch: 5 	Training Loss: 0.665427
acc: 0.628, auc: 0.498, precision: 0.235, recall: 0.281, f1: 0.256
acc: 0.637, auc: 0.505, precision: 0.236, recall: 0.276, f1: 0.254
Epoch: 6 	Training Loss: 0.665294
acc: 0.628, auc: 0.498, precision: 0.235, recall: 0.281, f1: 0.256
acc: 0.637, auc: 0.505, precision: 0.236, recall: 0.276, f1