# Define and Train BiLSTM Model

Here we will define and train the BiLSTM model for input data obtained by using Word2Vec for symptom embeddings.

In [65]:
import pandas as pd
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import re
import gensim
import os

### Load Symptom <-> Disease Data and Word2Vec Model

We have prepared input output pairs for symptoms and diseases [(x1, y1), (x2, y2).....], and stored them as a dictionary. Here we will load that data which will act as training and testing dataset. Also, we will load the trained Word2Vec model which will be used to get the symptom embeddings.

In [66]:
cwd = os.getcwd()
data_dir = cwd + "/../data/"
model_dir = cwd + "/model/"
print(f"Current working directory : {cwd}")
print(f"Data directory : {data_dir}")
print(f"Model directory : {model_dir}")

# Load the symptom disease dictionary
with open(data_dir + "symptom_disease_dict.csv", 'r') as f:
    symptom_disease_dict = json.load(f)
len(symptom_disease_dict)

with open(data_dir + "icd9_dict.csv", 'r') as f:
    icd9_dict = json.load(f)

# Load a saved word2vec model 
word2vec_model = gensim.models.Word2Vec.load(model_dir + 'word2vec_model')

Current working directory : /Users/ratanbajpai/Education/UIUC/DLH/project/src
Data directory : /Users/ratanbajpai/Education/UIUC/DLH/project/src/../data/
Model directory : /Users/ratanbajpai/Education/UIUC/DLH/project/src/model/


In [67]:
# Convert dictionary to list of values i.e. [(x1, y1), (x2, y2).....]
symptom_disease_list = list(symptom_disease_dict.values())
len(symptom_disease_list)

43651

### Define Custom Dataset

In [78]:
# Create custom dataset

MAX_SYMPTOMS = 50
MAX_DISEASE = 50
NUM_FEATURES = 128


class CustomDataset(Dataset):
    
    def __init__(self, xypair_list, icd9_dict, model_w2v):
        
        # Set data and model variables
        self.xypair_list = xypair_list
        self.model_w2v = model_w2v
        self.icd9_dict = icd9_dict
        
    def __len__(self):
        return len(self.xypair_list)
    
    def create_symp_vec(self, symptom):
        symp_words = symptom.split()
        symp_vec = np.zeros(NUM_FEATURES, dtype=np.float64)
        for word in symp_words:
            # Get the word embedding
            key = re.sub(r"[^a-zA-Z0-9]","", word.lower())
            if key in self.model_w2v:
                symp_vec = symp_vec + self.model_w2v[key]
        symp_vec = symp_vec / len(symp_words)
        return symp_vec
    
    def __getitem__(self, index):
        """
            Output:
            symptpm vector (X) : length of each symptom vector (128) x max number of symptoms (50)
            diagnoses vector (Y) = diagnosed disease vector for these symptoms (50)
            symptom count = number of symtoms for current record
        """
        symptom_list = self.xypair_list[index][0]
        disease_list = self.xypair_list[index][1]
        
        # Create a 2d vector of symptoms (X)
        symp_vec = np.zeros((NUM_FEATURES, MAX_SYMPTOMS), dtype=np.float64)
        
        # We take max of 50 symptoms from each discharge summaries note
        for col, symptom in enumerate(symptom_list[:50]):
            symp_vec[:, col] = self.create_symp_vec(symptom)
            
        # Create a 1d vector of diseases (Y)
        disease_vec = np.zeros(MAX_DISEASE, dtype=np.float32)
        for disease in disease_list:
            if disease in self.icd9_dict.keys():
                index = self.icd9_dict[disease]
                disease_vec[index] = 1
        
        return torch.tensor(symp_vec.T, dtype=torch.float), torch.tensor(disease_vec, dtype=torch.float), len(symptom_list[:50])
    

In [79]:
# Instantiate custom dataset
dataset = CustomDataset(symptom_disease_list, icd9_dict, word2vec_model)
print('Dataset size: ', len(dataset))

# Define train and test sizes
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

# Define data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

Dataset size:  43651


In [80]:
train_dataset[0]

  if key in self.model_w2v:
  symp_vec = symp_vec + self.model_w2v[key]


(tensor([[ 0.1303, -0.0605,  0.1260,  ...,  0.0597,  0.0214, -0.2647],
         [-0.0374, -0.3420, -0.0076,  ..., -0.1074,  0.1338, -0.1693],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]),
 tensor([0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 11)

In [81]:
print(f"Dataset size : {len(dataset)}")
symptom_item, diag_item, symptom_len = dataset[0]
print(f"symptom_item : {symptom_item.shape}")
print(f"diag_item : {diag_item}")
symptom_item

Dataset size : 43651
symptom_item : torch.Size([50, 128])
diag_item : tensor([1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])


  if key in self.model_w2v:
  symp_vec = symp_vec + self.model_w2v[key]


tensor([[ 0.0763, -0.1303, -0.2874,  ...,  0.3420, -0.5229, -0.1654],
        [-0.1053, -0.2726, -0.3009,  ...,  0.2681, -0.2480, -0.0882],
        [ 0.4527, -0.0143,  0.0306,  ..., -0.3816,  0.1335,  0.2715],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

### Define BiLSTM Model

In [82]:
class DiseaseSymptomLstm(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(DiseaseSymptomLstm, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        
        self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True, dropout=0.8)
        self.fc = nn.Linear(hidden_size * 2, num_classes)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, symp_length):
        # Perform forward step
        out, (ht, ct) = self.bilstm(x)
        
        # print(f"out.shape : {out.shape}, ht.shape : {ht.shape}, ct.shape : {ct.shape}")
        # returning last layer of output from hidden state
        out = self.fc(out[:,-1,:])
        out = self.sigmoid(out)
        # Return model output
        return out

model = DiseaseSymptomLstm(128, 256, 2, 50)

In [83]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

### Define Evaluation Metrics

In [84]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
DISEASE_THRESHOLD = 0.20

def eval(model, test_loader):
    
    """    
    INPUT:
        model: model
        test_loader: dataloader
        
    OUTPUT:
        precision: overall micro precision score
        recall: overall micro recall score
        f1: overall micro f1 score
        
    REFERENCE: checkout https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
    """

    model.eval()
    y_pred = torch.LongTensor()
    y_true = torch.LongTensor()
    model.eval()
    for sequences, labels, symp_len in test_loader:
        # your code here
        y_prob = model(sequences, symp_len)
        y_hat = (y_prob > DISEASE_THRESHOLD).int()
        # print(f"y_prob: {y_hat}")
        # print(f"labels: {labels}")
        y_pred = torch.cat((y_pred,  y_hat.detach().to('cpu')), dim=0)
        y_true = torch.cat((y_true, labels.detach().to('cpu')), dim=0)
    
    p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average='micro')
    auc = roc_auc_score(y_true, y_pred, average='micro')
    return p, r, f, auc

### Train BiLSTM Model

In [None]:
def train(model, train_loader, test_loader, n_epochs):
    """    
    INPUT:
        model: the model
        train_loader: dataloder
        val_loader: dataloader
        n_epochs: total number of epochs
    """
    for epoch in range(n_epochs):
        model.train()
        train_loss = 0
        for sequences, y_true, symp_len in train_loader:
            optimizer.zero_grad()
            # print(f"sequence.shape {sequences.shape}, len: {symp_len}")
            y_hat = model(sequences, symp_len)
#             print(f"y_hat : {y_hat}")
#             print(f"y_true : {y_true}")
            
            loss = criterion(y_hat, y_true)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_loader)
        print('Epoch: {} \t Training Loss: {:.6f}'.format(epoch+1, train_loss))
        p, r, f, auc = eval(model, test_loader)
        print('Epoch: {} \t Validation p: {:.2f}, r:{:.2f}, f: {:.2f}, auc: {:.2f}'.format(epoch+1, p, r, f, auc))

    
# number of epochs to train the model
n_epochs = 5

train(model, train_loader, test_loader, n_epochs)

  if key in self.model_w2v:
  symp_vec = symp_vec + self.model_w2v[key]


Epoch: 1 	 Training Loss: 0.291704
Epoch: 1 	 Validation p: 0.39, r:0.57, f: 0.46, auc: 0.72


  if key in self.model_w2v:
  symp_vec = symp_vec + self.model_w2v[key]
