<h3> Training BiLSTM Model </h3>

In [15]:
import pandas as pd
import os
import numpy as np
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

<h4> Loading all the data </h4>

In [16]:
tag = "_v3"
disease_symptoms = pd.read_csv(f"data/top_N_filtered{tag}.csv")
with open(f"data/filtered_symptom_dict{tag}.csv", 'r') as f:
    symptom_dict = data = json.load(f)

with open(f"data/icd9_dict{tag}.csv", 'r') as f:
    icd9_dict = json.load(f)

tfidf_weights = pd.read_csv(f"data/weight_i_j{tag}.csv")
tfidf_weights.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7349,7350,7351,7352,7353,7354,7355,7356,7357,7358
0,64.037076,66.621906,0.0,0.0,0.0,0.0,0.0,21.012165,0.0,0.0,...,0.0,0.0,1.714798,0.0,1.347074,0.0,0.0,0.0,1.609438,1.609438
1,147.918974,151.587765,0.0,0.0,0.0,0.0,0.0,5.002897,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.609438,1.609438,0.0,0.0,0.0
2,53.280848,56.115823,0.0,0.0,0.0,0.0,0.0,17.259993,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.609438,1.609438
3,16.509559,16.926467,0.0,0.0,0.0,0.0,0.0,4.085699,0.0,0.0,...,1.609438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20.428494,20.762021,0.0,0.0,0.0,0.0,0.0,3.752172,0.0,0.0,...,0.0,1.832581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<h3> Define Custom DataLoader </h3>

In [17]:
from torch.utils.data import Dataset

MAX_SYMPTOMS = 50
MAX_DISEASE = 50


class CustomDataset(Dataset):
    
    def __init__(self, filename):        
        # read in the data files
        self.hadm_list = self.process_raw_data(filename)
        
        # TF-IDF Weights
        self.tfidf_weights = pd.read_csv(f"data/weight_i_j{tag}.csv")
        
        # Symptom dictionary
        with open(f"data/filtered_symptom_dict{tag}.csv", 'r') as f:
            self.symptom_dict = json.load(f)
        
        with open(f"data/icd9_dict{tag}.csv", 'r') as f:
            self.icd9_dict = json.load(f)
        
        
    def process_raw_data(self, filename):
        symptom_disease_data = pd.read_csv(filename)
        hadm_list = []
        hadm_id_map = {}
        hadm_index = 0
        # Collecting all records for one admission in one list
        for index, record in symptom_disease_data.iterrows():
            # print(f"processing {index} - {record}")
            hadm_id = record['HADM_ID']            
            if hadm_id not in hadm_id_map:
                hadm_id_map[hadm_id] = hadm_index
                hadm_index += 1
                hadm_list.append([])

            idx = hadm_id_map[hadm_id]
            hadm_list[idx].append(record)
        
        return hadm_list
        
    def __len__(self):
        return len(self.hadm_list)
    
    def __getitem__(self, index):
        """
            Output:
            symptpm_vector : max_number_of_symptoms (50) x number_of_diagnoses (50)
            diagnoses_vector = number_of_diagnoses
            symptom_count = number of symtoms for current record
        """
        list_of_records = self.hadm_list[index]
        symptom_string = list_of_records[0]['SYMPTOMS']
        # print(f"length of records : {len(list_of_records)}")
        symptom_vector = np.zeros((MAX_SYMPTOMS, MAX_DISEASE))
        # print(f"Processing item with symptom vector of : {symptom_string}")
        diag_vector = np.zeros((MAX_DISEASE))
        symptom_list = self.create_symptom_vector(symptom_string, self.symptom_dict)
        # print(f"symptom_list : {symptom_list}")
        # Populate Symptom Vector by getting corresponding embeddings from TF-IDF vector
        for index, symptom_idx in enumerate(symptom_list):
            # print(f"Symptom vector index: {index}, symptom index : {symptom_idx} \n {self.tfidf_weights.iloc[:,symptom_idx]}")
            symptom_vector[index] = self.tfidf_weights.iloc[:,symptom_idx]
            
        # Populate disease vector   
        for index, record in enumerate(list_of_records):
            icd_code = record['ICD9_3CHAR']
            if icd_code in self.icd9_dict:
                
                diagnosis_index = self.icd9_dict[icd_code]
                diag_vector[diagnosis_index] = 1
                # print(f"icd code : {icd_code}, diagnosis_index : {diagnosis_index}")
        return torch.tensor(symptom_vector, dtype=torch.float), torch.tensor(diag_vector, dtype=torch.float), len(symptom_list)
    
    def create_symptom_vector(self, symptoms, filtered_symptom_dict):
        symp_index_list = []
        symp_list = str(symptoms).split("|")    
        # only consider notes with symptoms count more than 1
        if len(symp_list) > 1:
            for symptom in symp_list[:50]:
                if symptom in filtered_symptom_dict:
                    symp_index_list.append(filtered_symptom_dict[symptom])
        return symp_index_list


In [27]:
dataset = CustomDataset(f"data/top_N_filtered{tag}.csv")
train_size = int(len(dataset)*0.8)
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=400, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=400)

In [28]:
print(f"Dataset size : {len(dataset)}")
symptom_item, diag_item, symptom_len = dataset[8]
print(f"symptom_item : {symptom_item.shape}")
print(f"diag_item : {diag_item}")
symptom_item

Dataset size : 44047
symptom_item : torch.Size([50, 50])
diag_item : tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [9]:
len(test_dataset)

8810

<h3> Define BiLSRM Model </h3>

In [63]:
class DiseaseSymptomLstm(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(DiseaseSymptomLstm, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        
        self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True, dropout=0.8)
        self.fc = nn.Linear(hidden_size*2, num_classes)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, symp_length):
        # h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)
        # c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)
        # print(f"x.shape : {x.shape}") 
        # x.shape : torch.Size([32, 50, 50])
        
        out, (ht, ct) = self.bilstm(x)
        
        # print(f"out.shape : {out.shape}, ht.shape : {ht.shape}, ct.shape : {ct.shape}")
        # returning last layer of output from hidden state
        out = self.fc(out[:,-1,:])
        out = self.sigmoid(out)
        
        return out

model = DiseaseSymptomLstm(50, 100, 2, 50)

<h3> Training and inferencing </h3>

In [64]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [65]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
DISEASE_THRESHOLD = 0.20

def eval(model, test_loader):
    
    """    
    INPUT:
        model: model
        test_loader: dataloader
        
    OUTPUT:
        precision: overall micro precision score
        recall: overall micro recall score
        f1: overall micro f1 score
        
    REFERENCE: checkout https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
    """

    model.eval()
    y_pred = torch.LongTensor()
    y_true = torch.LongTensor()
    model.eval()
    for sequences, labels, symp_len in test_loader:
        # your code here
        y_prob = model(sequences, symp_len)
        y_hat = (y_prob > DISEASE_THRESHOLD).int()
        # print(f"y_prob: {y_hat}")
        # print(f"labels: {labels}")
        y_pred = torch.cat((y_pred,  y_hat.detach().to('cpu')), dim=0)
        y_true = torch.cat((y_true, labels.detach().to('cpu')), dim=0)
    
    p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average='micro')
    auc = roc_auc_score(y_true, y_pred, average='micro')
    return p, r, f, auc

In [66]:
def train(model, train_loader, test_loader, n_epochs):
    """    
    INPUT:
        model: the model
        train_loader: dataloder
        val_loader: dataloader
        n_epochs: total number of epochs
    """
    for epoch in range(n_epochs):
        model.train()
        train_loss = 0
        for sequences, y_true, symp_len in train_loader:
            optimizer.zero_grad()
            # print(f"sequence.shape {sequences.shape}, len: {symp_len}")
            y_hat = model(sequences, symp_len)
#             print(f"y_hat : {y_hat}")
#             print(f"y_true : {y_true}")
            
            loss = criterion(y_hat, y_true)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_loader)
        print('Epoch: {} \t Training Loss: {:.6f}'.format(epoch+1, train_loss))
        p, r, f, auc = eval(model, test_loader)
        print('Epoch: {} \t Validation p: {:.2f}, r:{:.2f}, f: {:.2f}, auc: {:.2f}'.format(epoch+1, p, r, f, auc))

    
# number of epochs to train the model
n_epochs = 10

train(model, train_loader, test_loader, n_epochs)

Epoch: 1 	 Training Loss: 0.402175
Epoch: 1 	 Validation p: 0.29, r:0.43, f: 0.34, auc: 0.64
Epoch: 2 	 Training Loss: 0.341957
Epoch: 2 	 Validation p: 0.32, r:0.50, f: 0.39, auc: 0.67
Epoch: 3 	 Training Loss: 0.324790
Epoch: 3 	 Validation p: 0.34, r:0.50, f: 0.41, auc: 0.68
Epoch: 4 	 Training Loss: 0.315602
Epoch: 4 	 Validation p: 0.36, r:0.50, f: 0.42, auc: 0.68
Epoch: 5 	 Training Loss: 0.306922
Epoch: 5 	 Validation p: 0.39, r:0.50, f: 0.44, auc: 0.69
Epoch: 6 	 Training Loss: 0.301950
Epoch: 6 	 Validation p: 0.38, r:0.53, f: 0.44, auc: 0.70
Epoch: 7 	 Training Loss: 0.299680
Epoch: 7 	 Validation p: 0.38, r:0.53, f: 0.44, auc: 0.70
Epoch: 8 	 Training Loss: 0.298493
Epoch: 8 	 Validation p: 0.41, r:0.49, f: 0.45, auc: 0.69
Epoch: 9 	 Training Loss: 0.297255
Epoch: 9 	 Validation p: 0.39, r:0.52, f: 0.45, auc: 0.70
Epoch: 10 	 Training Loss: 0.295887
Epoch: 10 	 Validation p: 0.39, r:0.53, f: 0.45, auc: 0.70


n_epochs = 2, learning rate = 0.001 <br>
<code>
Epoch: 1 	 Training Loss: 0.332956
Epoch: 1 	 Validation p: 0.35, r:0.51, f: 0.42, auc: 0.69
Epoch: 2 	 Training Loss: 0.303654
Epoch: 2 	 Validation p: 0.40, r:0.51, f: 0.45, auc: 0.70
</code>
         <code>
Epoch: 1 	 Training Loss: 0.298307
Epoch: 1 	 Validation p: 0.40, r:0.53, f: 0.45, auc: 0.71
Epoch: 2 	 Training Loss: 0.294352
Epoch: 2 	 Validation p: 0.38, r:0.57, f: 0.46, auc: 0.72
Epoch: 3 	 Training Loss: 0.290162
Epoch: 3 	 Validation p: 0.40, r:0.55, f: 0.46, auc: 0.72
Epoch: 4 	 Training Loss: 0.286585
Epoch: 4 	 Validation p: 0.39, r:0.57, f: 0.47, auc: 0.72
Epoch: 5 	 Training Loss: 0.283719
Epoch: 5 	 Validation p: 0.40, r:0.58, f: 0.47, auc: 0.73      
    </code>

<h5>n_epochs=5, learning_rate = 0.005</h5>
<code>
Epoch: 1 	 Training Loss: 0.292470
Epoch: 1 	 Validation p: 0.36, r:0.59, f: 0.45, auc: 0.72
Epoch: 2 	 Training Loss: 0.288039
Epoch: 2 	 Validation p: 0.37, r:0.62, f: 0.46, auc: 0.73
Epoch: 3 	 Training Loss: 0.284519
Epoch: 3 	 Validation p: 0.38, r:0.60, f: 0.47, auc: 0.73
Epoch: 4 	 Training Loss: 0.282809
Epoch: 4 	 Validation p: 0.37, r:0.63, f: 0.46, auc: 0.74
Epoch: 5 	 Training Loss: 0.281407
Epoch: 5 	 Validation p: 0.38, r:0.61, f: 0.47, auc: 0.74
</code>

With Batch size of 400, starting fresh!
<code>
Epoch: 1 	 Training Loss: 0.400608
Epoch: 1 	 Validation p: 0.28, r:0.40, f: 0.33, auc: 0.63
Epoch: 2 	 Training Loss: 0.347110
Epoch: 2 	 Validation p: 0.35, r:0.36, f: 0.35, auc: 0.63
Epoch: 3 	 Training Loss: 0.326768
Epoch: 3 	 Validation p: 0.37, r:0.44, f: 0.40, auc: 0.67
Epoch: 4 	 Training Loss: 0.319100
Epoch: 4 	 Validation p: 0.37, r:0.46, f: 0.41, auc: 0.67
Epoch: 5 	 Training Loss: 0.312817
Epoch: 5 	 Validation p: 0.36, r:0.54, f: 0.43, auc: 0.70
    
Epoch: 1 	 Training Loss: 0.304074
Epoch: 1 	 Validation p: 0.38, r:0.52, f: 0.44, auc: 0.70
Epoch: 2 	 Training Loss: 0.300410
Epoch: 2 	 Validation p: 0.39, r:0.51, f: 0.44, auc: 0.70
Epoch: 3 	 Training Loss: 0.299053
Epoch: 3 	 Validation p: 0.38, r:0.54, f: 0.45, auc: 0.71
Epoch: 4 	 Training Loss: 0.297172
Epoch: 4 	 Validation p: 0.40, r:0.52, f: 0.45, auc: 0.70
Epoch: 5 	 Training Loss: 0.294421
Epoch: 5 	 Validation p: 0.38, r:0.56, f: 0.45, auc: 0.71
</code>

Number of layers in BiLSRT as 4. Very slow as well!
<code>
Epoch: 1 	 Training Loss: 0.691968
Epoch: 1 	 Validation p: 0.13, r:1.00, f: 0.23, auc: 0.50
Epoch: 2 	 Training Loss: 0.691965
Epoch: 2 	 Validation p: 0.13, r:1.00, f: 0.23, auc: 0.50
</code>

Number of layers in BiLSRT as 3. Very slow as well!
<code>
Epoch: 1 	 Training Loss: 0.329327
Epoch: 1 	 Validation p: 0.30, r:0.56, f: 0.39, auc: 0.69
Epoch: 2 	 Training Loss: 0.320431
Epoch: 2 	 Validation p: 0.33, r:0.53, f: 0.41, auc: 0.69
Epoch: 3 	 Training Loss: 0.317443
Epoch: 3 	 Validation p: 0.35, r:0.49, f: 0.41, auc: 0.68
Epoch: 4 	 Training Loss: 0.315925
Epoch: 4 	 Validation p: 0.34, r:0.52, f: 0.41, auc: 0.69
Epoch: 5 	 Training Loss: 0.311251
Epoch: 5 	 Validation p: 0.38, r:0.49, f: 0.43, auc: 0.69
</code>