In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch-summary

Collecting torch-summary
  Downloading https://files.pythonhosted.org/packages/ca/db/93d18c84f73b214acfa4d18051d6f4263eee3e044c408928e8abe941a22c/torch_summary-1.4.5-py3-none-any.whl
Installing collected packages: torch-summary
Successfully installed torch-summary-1.4.5


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
from ast import literal_eval
from torchsummary import summary
from tqdm import tqdm

In [4]:
from collections import Counter

In [5]:
import torch 
import torch.nn as nn
from torchtext import data
from torch.utils.data import Dataset, DataLoader, random_split, SubsetRandomSampler

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
device

device(type='cuda')

In [8]:
!nvidia-smi

Thu May 27 17:34:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    26W / 300W |      2MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
SEED = 2020
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(SEED)

In [10]:
train_diagnosis = pd.read_csv('/content/drive/MyDrive/AlgoIntern/Data/Data2.0/discharge_diagnosis_icd_train')
test_diagnosis = pd.read_csv('/content/drive/MyDrive/AlgoIntern/Data/Data2.0/discharge_diagnosis_icd_test')

In [11]:
train_diagnosis

Unnamed: 0,HADM_ID,discharge_diagnosis,ICD9_CODE,ICD10,ICD9_CATEGORY,ICD10_CATEGORY
0,138999.0,",anoxic brain injury,vericeal bleeding,possibl...","['570', '311', '48242', '3481', '2760', '2874'...","['R569', 'F329', 'E870', 'I498', 'J9600', 'J96...","['518', '054', '570', '456', 'E915', '311', '2...","['R56', 'I49', 'F32', 'J96', 'E87', 'R00']"
1,154986.0,",1.rectosigmoid carcinoma,2.esophagitis/small ...","['V4577', 'V1582', '5533', '1540', '4019', '53...","['Z951', 'Z87891', 'J45909', 'E780', 'J45998',...","['578', '493', '530', '401', 'V15', '154', '27...","['J45', 'Z87', 'Z95', 'I10', 'E78']"
2,151638.0,",likely chronic eosinophilic pneumonia,copd, h...","['4280', '496', '4019', '41401', '5183', 'E879...","['I2510', 'I509', 'J449', 'Y848', 'E780', 'I95...","['458', 'E879', '496', '401', '346', '473', '4...","['I50', 'Y84', 'I95', 'I25', 'J44', 'I10', 'E78']"
3,160956.0,",hypotension secondary to hypovolemia,.,psoria...","['6960', '2550', '32723', '25000', '5718', '27...","['E860', 'E785', 'N179', 'E784', 'I952', 'I958...","['V58', '255', '696', '401', '272', 'E944', '5...","['E86', 'I95', 'E11', 'N17', 'Z79', 'G47', 'I1..."
4,157693.0,",?partial small bowel obstruction,lower gi ble...","['30590', 'V652', '4019', '5789', '5368', '346...","['D62', 'K922', 'J45909', 'D696', 'J45998', 'I...","['536', '578', '305', '493', '401', '346', 'V4...","['D62', 'J45', 'D69', 'K92', 'I10']"
...,...,...,...,...,...,...
33882,146688.0,",acute blood loss anemia/gi bleed,pancytopenia...","['20080', '72981', '28800', 'E8497', 'V1005', ...","['D62', 'N390', 'K922', 'D696', 'J918', 'J9600...","['200', 'E888', '427', '599', '578', '284', '4...","['D62', 'D69', 'N39', 'J91', 'I48', 'K92', 'J9..."
33883,177161.0,",- non-st elevation myocardial infarction,- ac...","['4240', '311', '99664', '7140', '2760', '2500...","['I5032', 'N170', 'Z66', 'I348', 'F329', 'N390...","['806', 'V58', '311', '410', '599', 'E878', '4...","['I34', 'Z66', 'I50', 'I95', 'N39', 'F32', 'N1..."
33884,177705.0,",idiopathic pericarditis w/ pericardial and pl...","['V1301', '49381', '4239', '2724', '5119', '42...","['E785', 'E784', 'J918']","['493', '420', '423', 'V13', '272', '511']","['J91', 'E78']"
33885,195465.0,",subdural hematomas,supratherapeutic inr,atria...","['V4572', '5853', 'V5861', 'E9177', '2449', '7...","['Z7901', 'I4891', 'I129', 'K219', 'E785', 'D6...","['244', '530', 'V58', 'E917', '427', '852', '2...","['D64', 'I12', 'I48', 'N17', 'K21', 'Z79', 'E0..."


In [12]:
test_diagnosis

Unnamed: 0,HADM_ID,discharge_diagnosis,ICD9_CODE,ICD10,ICD9_CATEGORY,ICD10_CATEGORY
0,100009.0,",coronary artery disease,cad-(ami UNK, UNK UNK...","['44021', 'V1582', '27800', '4019', '4262', '4...","['I2510', 'E669', 'Z87891', 'I200', 'E780', 'D...","['278', '401', 'V58', 'V85', '250', '440', 'V1...","['D64', 'E66', 'I20', 'E11', 'Z87', 'Z98', 'I2..."
1,100034.0,",coronary artery disease s/p coronary artery b...","['V1582', '6961', '4019', '41401', '412', '250...","['I2510', 'Z87891', 'E785', 'E784', 'I252', 'E...","['401', '250', '412', '413', 'V15', '414', '27...","['E11', 'Z87', 'I25', 'I10', 'E78']"
2,100044.0,", prematurity, respiratory distress syndrome,,...","['V3001', '7793', 'V053', '76519', '7717', '77...","['P002', 'P590', 'Z412', 'Z3801', 'P220', 'Z23']","['771', '769', 'V50', '779', 'V05', 'V29', 'V3...","['P22', 'Z41', 'Z23', 'P00', 'Z38', 'P59']"
3,100050.0,",aortic stenosis s/p aortic valve replacement,...","['4280', '41401', '53081', '99791', '5990', '9...","['I2510', 'N390', 'I509', 'I358', 'Z950', 'I35...","['997', '530', '424', '414', '998', '287', 'V4...","['I50', 'D69', 'N39', 'I35', 'I48', 'K21', 'Z9..."
4,100065.0,",diabetes mellitus ii with ketoacidosis,.,hype...","['3659', '4019', '25012', '5849', '32723']","['N179', 'I10', 'G4733']","['401', '250', '327', '365', '584']","['G47', 'I10', 'N17']"
...,...,...,...,...,...,...
14518,199987.0,",s/p motor vehicle crash,injuries:,multiple le...","['8056', '5180', '9190', '8082', '8604', '8070...","['J9811', 'J9819']","['808', 'E812', '805', '807', '860', '919', '5...",['J98']
14519,199988.0,",1. rectal cancer.,2. status post proctocole...","['E8792', '7070', '0389', '5672', '99859', '27...","['E780', 'A419']","['707', '997', '038', '998', '272', '154', 'E8...","['A41', 'E78']"
14520,199992.0,", aspiration pneumonia, dysphagia, hypothermia...","['E9352', '78720', '5070', '44382', '0539', '2...","['E870', 'I498', 'J690', 'N179', 'D649', 'G473...","['244', 'E936', 'E939', '507', '327', 'E935', ...","['J69', 'D64', 'I49', 'N17', 'E87', 'G47', 'R0..."
14521,199993.0,",1. cad status post redo cabg times five and ...","['41031', '4271', '5180', '4240', '42821', '51...","['I472', 'I348', 'J9819', 'E870', 'I340', 'J98...","['424', '276', '427', '428', '511', '518', '410']","['I34', 'J91', 'I48', 'J98', 'I47', 'E87']"


In [13]:
train_diagnosis['ICD9_CODE'] = train_diagnosis['ICD9_CODE'].apply(literal_eval)
train_diagnosis['ICD9_CATEGORY'] = train_diagnosis['ICD9_CATEGORY'].apply(literal_eval)
train_diagnosis['ICD10'] = train_diagnosis['ICD10'].apply(literal_eval)
train_diagnosis['ICD10_CATEGORY'] = train_diagnosis['ICD10_CATEGORY'].apply(literal_eval)

test_diagnosis['ICD9_CODE'] = test_diagnosis['ICD9_CODE'].apply(literal_eval)
test_diagnosis['ICD9_CATEGORY'] = test_diagnosis['ICD9_CATEGORY'].apply(literal_eval)
test_diagnosis['ICD10'] = test_diagnosis['ICD10'].apply(literal_eval)
test_diagnosis['ICD10_CATEGORY'] = test_diagnosis['ICD10_CATEGORY'].apply(literal_eval)


In [14]:
frequent_icd9category = ['401','427','276','414','272','250','428','518','285','584']
frequent_icd9code = ['4019', '4280', '42731', '41401', '5849', '25000', '2724', '51881', '5990', '53081']
frequent_icd10category = ['I10', 'I25', 'E78', 'I50', 'I48', 'N17', 'E87', 'E11', 'J96', 'N39']
frequent_icd10code = ['I10', 'I2510', 'I509', 'I4891', 'N179', 'E119', 'E784', 'E785', 'J9690', 'J9600']

## Character Embedding


In [15]:
vocabulary = list("""abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'"/\|_@#$%^&*~`+-=<>()[]{}""")

In [16]:
len(vocabulary)

68

In [17]:
print(vocabulary)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ',', ';', '.', '!', '?', ':', "'", '"', '/', '\\', '|', '_', '@', '#', '$', '%', '^', '&', '*', '~', '`', '+', '-', '=', '<', '>', '(', ')', '[', ']', '{', '}']


In [18]:
def character_index(sentence, vocabulary, sequence_length = 500):
  index_list = []
  for i in range(len(sentence)):
    if i > sequence_length-1:
      break
    else:
      if sentence[i] in vocabulary:
        index_list.append(vocabulary.index(sentence[i]) + 1)
      else :
        index_list.append(len(vocabulary)+1)
  if len(index_list) == sequence_length:
    return index_list
  else:
    index_list.extend([0]*(sequence_length-len(index_list)))
    return index_list

In [19]:
character_index('mrityunjay ', vocabulary, 11)

[13, 18, 9, 20, 25, 21, 14, 10, 1, 25, 69]

In [20]:
def character_embedding(index_list):
  embedding_weights = []
  for index,i in enumerate(index_list):
    one_hot = np.zeros(len(vocabulary)+1)
    if i != 0:
      one_hot[i-1] = 1
    embedding_weights.append(one_hot)
  return np.array(embedding_weights,dtype = 'float32').T



In [21]:
character_embedding(character_index('mrityunjay', vocabulary, 11))

array([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [22]:
def labeltarget(x,frequent_list):
  target=np.zeros(10,dtype="float32")
  for index,code in enumerate(frequent_list):
    if code in x :
      target[index]=1
  return target

In [23]:
labeltarget(['E81', '919', '807', '518', '805', '860', '276'], frequent_icd9category)

array([0., 0., 1., 0., 0., 0., 0., 1., 0., 0.], dtype=float32)

In [24]:
character_length=train_diagnosis['discharge_diagnosis'].apply(lambda x:len([i for i in x])).to_list()

In [25]:
character_length[:10]

[171, 73, 75, 178, 150, 107, 206, 20, 134, 141]

In [26]:
max(character_length)

2609

In [27]:
np.median(character_length)

100.0

In [28]:
np.mean(character_length)

137.43686369404196

In [29]:
sum(character_length)/len(character_length)

137.43686369404196

In [30]:
device

device(type='cuda')

In [31]:
class cnndataset(Dataset):
  def __init__(self, train_df, test_df, train = True):
    self.train_df = train_df
    self.test_df = test_df
    self.nsamples_train = len(train_df)
    self.nsamples_test = len(test_df)
    self.train = train
    

  def __getitem__(self,index):
    if self.train:
      x = torch.from_numpy(character_embedding(character_index(self.train_df['discharge_diagnosis'].iloc[index], vocabulary, 140)))
      y = {}
      y['icd9code'] = torch.from_numpy(labeltarget(self.train_df["ICD9_CODE"].iloc[index], frequent_icd9code))
      y['icd9cat'] = torch.from_numpy(labeltarget(self.train_df["ICD9_CATEGORY"].iloc[index], frequent_icd9category))
      y['icd10code'] = torch.from_numpy(labeltarget(self.train_df["ICD10"].iloc[index], frequent_icd10code))
      y['icd10cat'] = torch.from_numpy(labeltarget(self.train_df["ICD10_CATEGORY"].iloc[index], frequent_icd10category))
      return x, y
    
    x = torch.from_numpy(character_embedding(character_index(self.test_df['discharge_diagnosis'].iloc[index], vocabulary, 140)))
    y = {}
    y['icd9code'] = torch.from_numpy(labeltarget(self.test_df["ICD9_CODE"].iloc[index], frequent_icd9code))
    y['icd9cat'] = torch.from_numpy(labeltarget(self.test_df["ICD9_CATEGORY"].iloc[index], frequent_icd9category))
    y['icd10code'] = torch.from_numpy(labeltarget(self.test_df["ICD10"].iloc[index], frequent_icd10code))
    y['icd10cat'] = torch.from_numpy(labeltarget(self.test_df["ICD10_CATEGORY"].iloc[index], frequent_icd10category))
    return x, y

  def __len__(self):
    if self.train:
      return self.nsamples_train
    return self.nsamples_test


In [32]:
cnn_train_dataset = cnndataset(train_diagnosis, test_diagnosis, train = True)
cnn_test_dataset = cnndataset(train_diagnosis, test_diagnosis, train = False)

In [33]:
def split_indices(dataset, validation_split, shuffle_dataset = True, random_seed = 2021):
  dataset_size = len(dataset)
  indices = list(range(dataset_size))
  split = int(np.floor(validation_split * dataset_size))
  if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
  return indices[split:], indices[:split]

In [34]:
train_indices, val_indices = split_indices(cnn_train_dataset, validation_split=2/7)

In [35]:
batch_size = 64
train_sampler=SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

cnn_train_loader = DataLoader(cnn_train_dataset, batch_size = batch_size, sampler=train_sampler)
cnn_val_loader = DataLoader(cnn_train_dataset, batch_size = batch_size, sampler=val_sampler)



In [36]:
cnn_test_loader = DataLoader(cnn_test_dataset, batch_size= batch_size)

In [37]:
for x,y in cnn_train_loader:
  print(x.shape)
  print(y)
  
  break

torch.Size([64, 69, 140])
{'icd9code': tensor([[0., 1., 0., 0., 1., 0., 0., 1., 0., 0.],
        [0., 1., 0., 1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 1., 1., 0., 1., 1., 0., 0., 1.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 1., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 1., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 1., 0., 0., 0., 1.],
        [0., 1., 0., 0., 1., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 1., 0., 0., 1., 0., 0.],
        [1., 1., 0., 1., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 1., 0., 1., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0.

In [38]:
# sequential1=nn.Sequential(nn.Conv1d(65,128,3),nn.ReLU(),nn.MaxPool1d(3))

In [39]:
# x_in=torch.randn(128,65,150)

In [40]:
# x_out=sequential1(x_in)

In [41]:
# x_out.shape

In [42]:
# layer4=nn.Flatten()

In [43]:
# layer4(x_out).shape

In [44]:
import sklearn
from sklearn.metrics import accuracy_score,hamming_loss,precision_score,recall_score,f1_score,classification_report

In [45]:
def calculate_metrics(pred, target, threshold=0.5):
  pred = np.array(pred > threshold, dtype="float32")
  
  return {'micro/precision': precision_score(y_true=target, y_pred=pred, average='micro'),
            'micro/recall': recall_score(y_true=target, y_pred=pred, average='micro'),
            'micro/f1': f1_score(y_true=target, y_pred=pred, average='micro'),
            # 'macro/f1': f1_score(y_true=target, y_pred=pred, average='macro'),
            # 'samples/f1': f1_score(y_true=target, y_pred=pred, average='samples'),
          'hammingloss':hamming_loss(target,pred)       
             }


In [46]:
def train_metric(y_pred, y_test, threshold=0.5):
  num_classes = y_pred.shape[1]
  y_pred_tags = (y_pred>0.5).float()

  correct_pred = (y_pred_tags == y_test).float()
  accuracy = (correct_pred.sum(dim=1) == num_classes).float().sum() / len(correct_pred)

  hammingloss = hamming_loss(y_test.cpu().numpy(), y_pred_tags.cpu().numpy())

  f1score = f1_score(y_true=y_test.cpu().numpy(), y_pred=y_pred_tags.cpu().numpy(), average='micro')
  return accuracy, hammingloss, f1score

In [58]:
def fit(epochs,model,train_loader,val_loader, icdtype, opt_fn,loss_fn, learning_rate):
  optimizer = opt_fn(model.parameters(), lr=learning_rate)
  print('-'*10 + icdtype + '-'*10)
  for epoch in range(1,epochs+1):

    model.train()

    train_epoch_loss=0
    train_epoch_accuracy=0
    train_epoch_hammingloss=0
    train_epoch_f1score=0

    val_epoch_loss=0
    val_epoch_accuracy=0
    val_epoch_hammingloss=0
    val_epoch_f1score=0

    
    for x, y_dict in train_loader:

      x = x.to(device)

      y = y_dict[icdtype]
      y = y.to(device)

      
      preds=model(x)

      optimizer.zero_grad()
      loss=loss_fn(preds,y)
      loss.backward()
      optimizer.step()
      
      accuracy, hammingloss, f1score  = train_metric(preds,y)

      train_epoch_loss+=loss.item()
      train_epoch_accuracy+=accuracy.item()
      train_epoch_hammingloss+=hammingloss
      train_epoch_f1score+=f1score
    
    model.eval()
    with torch.no_grad():
      for x,y_dict in val_loader:
        
        x=x.to(device)

        y = y_dict[icdtype]
        y = y.to(device)

        
        preds=model(x)

        loss=loss_fn(preds,y)
        accuracy, hammingloss, f1score  = train_metric(preds,y)
        val_epoch_loss+=loss.item()
        val_epoch_accuracy+=accuracy.item()
        val_epoch_hammingloss+=hammingloss
        val_epoch_f1score+=f1score

    
  
    print("\n")
    print('-'*100)
    print('Epoch = {}/{}:\n train_loss = {:.4f}, train_accuracy = {:.4f}, train_hammingloss = {:.4f}, train_f1score = {:.4f}\n val_loss = {:.4f}, val_accuracy = {:.4f}, val_hammmingloss = {:.4f}, val_f1score = {:.4f}'.format(epoch
                                                              ,epochs
                                                              ,train_epoch_loss/len(train_loader)
                                                              ,train_epoch_accuracy/len(train_loader)
                                                              ,train_epoch_hammingloss/len(train_loader)
                                                              ,train_epoch_f1score/len(train_loader)
                                                              ,val_epoch_loss/len(val_loader)
                                                              ,val_epoch_accuracy/len(val_loader)
                                                              ,val_epoch_hammingloss/len(val_loader)
                                                              ,val_epoch_f1score/len(val_loader)
                                                              ))
    print('-'*100)
    print("\n")

In [48]:
def test_results(model, test_loader, icdtype):

  model.eval()
  with torch.no_grad():
    model_result = []
    targets = []
    for x_test, batch_targets in test_loader:
      x_test = x_test.to(device)
      model_batch_result = model(x_test)
      model_result.extend(model_batch_result.cpu().numpy())
      targets.extend(batch_targets[icdtype].cpu().numpy())
  result = calculate_metrics(np.array(model_result), np.array(targets))
  print('-'*10 + icdtype + '-'*10)
  print(result)

In [49]:
# class characterCNN(nn.Module):
#   def __init__(self):
#     super().__init__()
#     # layers of NN
#     self.conv=nn.Sequential(nn.Conv1d(65,128,kernel_size=3,padding=0),nn.ReLU(),nn.MaxPool1d(3))
#     self.flatten=nn.Flatten()
#     self.fc = nn.Linear(6272, 10)
#     # activation 
#     self.act = nn.Sigmoid()
  
#   # forward

#   def forward(self, x):
#     out = self.conv(x)
#     out = self.flatten(out)
#     out = self.fc(out)
#     out = self.act(out)
#     return out


    

In [65]:
class CharacterLevelCNN(nn.Module):
    def __init__(self, number_of_classes):
        super(CharacterLevelCNN, self).__init__()

        # define conv layers

        # self.dropout_input = nn.Dropout2d(args.dropout_input)

        self.conv1 = nn.Sequential(nn.Conv1d(69,
                                             128,
                                             kernel_size=5,
                                             padding=0),
                                   nn.ReLU(),
                                   nn.MaxPool1d(5)
                                   )

        self.conv2 = nn.Sequential(nn.Conv1d(128, 128, kernel_size=5, padding=0),
                                   nn.ReLU(),
                                   nn.MaxPool1d(3)
                                   )

        self.conv3 = nn.Sequential(nn.Conv1d(128, 128, kernel_size=5, padding=0),
                                   nn.ReLU()
                                   )

        # self.conv4 = nn.Sequential(nn.Conv1d(128, 128, kernel_size=3, padding=0),
        #                            nn.ReLU()
        #                            )

        # self.conv5 = nn.Sequential(nn.Conv1d(256, 256, kernel_size=3, padding=0),
        #                            nn.ReLU()
        #                            )

        # self.conv6 = nn.Sequential(nn.Conv1d(256, 256, kernel_size=3, padding=0),
        #                            nn.ReLU(),
        #                            nn.MaxPool1d(3)
        #                            )

        # compute the  output shape after forwarding an input to the conv layers

        input_shape = (128, 69, 140)
        self.output_dimension = self._get_conv_output(input_shape)

        # define linear layers

        self.fc1 = nn.Sequential(
            nn.Linear(self.output_dimension, 128),
            nn.ReLU(),
            nn.Dropout(0.5)
        )

        # self.fc2 = nn.Sequential(
        #     nn.Linear(512, 128),
        #     nn.ReLU(),
        #     nn.Dropout(0.5)
        # )

        self.fc3 = nn.Linear(128, number_of_classes)


        # activation
        self.act = nn.Sigmoid()

        # initialize weights

        # self._create_weights()

    # utility private functions

    # def _create_weights(self, mean=0.0, std=0.05):
    #     for module in self.modules():
    #         if isinstance(module, nn.Conv1d) or isinstance(module, nn.Linear):
    #             module.weight.data.normal_(mean, std)


    def _get_conv_output(self, shape):
        x = torch.rand(shape)
        # x = x.transpose(1, 2)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        # x = self.conv4(x)
        # x = self.conv5(x)
        # x = self.conv6(x)
        x = x.view(x.size(0), -1)
        output_dimension = x.size(1)
        return output_dimension

    # forward

    def forward(self, x):
        # x = self.dropout_input(x)
        # x = x.transpose(1, 2)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        # x = self.conv4(x)
        # x = self.conv5(x)
        # x = self.conv6(x)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        # x = self.fc2(x)
        x = self.fc3(x)
        x = self.act(x)
        return x

In [66]:
modelcnn=CharacterLevelCNN(number_of_classes=10).to(device)

In [67]:
modelcnn

CharacterLevelCNN(
  (conv1): Sequential(
    (0): Conv1d(69, 128, kernel_size=(5,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=5, stride=5, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv1d(128, 128, kernel_size=(5,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv1d(128, 128, kernel_size=(5,), stride=(1,))
    (1): ReLU()
  )
  (fc1): Sequential(
    (0): Linear(in_features=384, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
  )
  (fc3): Linear(in_features=128, out_features=10, bias=True)
  (act): Sigmoid()
)

In [68]:
summary(modelcnn, (69,140))

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 128, 27]             --
|    └─Conv1d: 2-1                       [-1, 128, 136]            44,288
|    └─ReLU: 2-2                         [-1, 128, 136]            --
|    └─MaxPool1d: 2-3                    [-1, 128, 27]             --
├─Sequential: 1-2                        [-1, 128, 7]              --
|    └─Conv1d: 2-4                       [-1, 128, 23]             82,048
|    └─ReLU: 2-5                         [-1, 128, 23]             --
|    └─MaxPool1d: 2-6                    [-1, 128, 7]              --
├─Sequential: 1-3                        [-1, 128, 3]              --
|    └─Conv1d: 2-7                       [-1, 128, 3]              82,048
|    └─ReLU: 2-8                         [-1, 128, 3]              --
├─Sequential: 1-4                        [-1, 128]                 --
|    └─Linear: 2-9                       [-1, 128]                 49,280

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 128, 27]             --
|    └─Conv1d: 2-1                       [-1, 128, 136]            44,288
|    └─ReLU: 2-2                         [-1, 128, 136]            --
|    └─MaxPool1d: 2-3                    [-1, 128, 27]             --
├─Sequential: 1-2                        [-1, 128, 7]              --
|    └─Conv1d: 2-4                       [-1, 128, 23]             82,048
|    └─ReLU: 2-5                         [-1, 128, 23]             --
|    └─MaxPool1d: 2-6                    [-1, 128, 7]              --
├─Sequential: 1-3                        [-1, 128, 3]              --
|    └─Conv1d: 2-7                       [-1, 128, 3]              82,048
|    └─ReLU: 2-8                         [-1, 128, 3]              --
├─Sequential: 1-4                        [-1, 128]                 --
|    └─Linear: 2-9                       [-1, 128]                 49,280

In [56]:
learning_rate = 1e-4
loss_fn = nn.BCELoss()
opt_fn = torch.optim.Adam


In [69]:
fit(100,modelcnn, cnn_train_loader, cnn_val_loader, 'icd9code', opt_fn,loss_fn, learning_rate)

----------icd9code----------


----------------------------------------------------------------------------------------------------
Epoch = 1/100:
 train_loss = 0.5261, train_accuracy = 0.1898, train_hammingloss = 0.2234, train_f1score = 0.0576
 val_loss = 0.4744, val_accuracy = 0.2196, val_hammmingloss = 0.2022, val_f1score = 0.0000
----------------------------------------------------------------------------------------------------




----------------------------------------------------------------------------------------------------
Epoch = 2/100:
 train_loss = 0.4746, train_accuracy = 0.2225, train_hammingloss = 0.2015, train_f1score = 0.0141
 val_loss = 0.4640, val_accuracy = 0.2199, val_hammmingloss = 0.2019, val_f1score = 0.0000
----------------------------------------------------------------------------------------------------




----------------------------------------------------------------------------------------------------
Epoch = 3/100:
 train_loss = 0.4642, train_accur

KeyboardInterrupt: ignored

In [70]:
test_results(modelcnn, cnn_test_loader, "icd9code")

----------icd9code----------
{'micro/precision': 0.6613989637305699, 'micro/recall': 0.26228553816650113, 'micro/f1': 0.375616095730855, 'hammingloss': 0.1753287888177374}


In [None]:
modelcnn=CharacterLevelCNN(number_of_classes=10).to(device)

In [None]:
fit(10,modelcnn, cnn_train_loader, cnn_val_loader, 'icd9cat', opt_fn,loss_fn, learning_rate)

----------icd9cat----------


--------------------------------------------------
Epoch = 5/10, train_loss = 0.4846, train_accuracy = 0.1725, val_loss = 0.5102, val_accuracy = 0.1666
--------------------------------------------------




--------------------------------------------------
Epoch = 10/10, train_loss = 0.4211, train_accuracy = 0.1963, val_loss = 0.5306, val_accuracy = 0.1511
--------------------------------------------------




In [None]:
test_results(modelcnn, cnn_test_loader, "icd9cat")

----------icd9cat----------
{'micro/precision': 0.5609482788808349, 'micro/recall': 0.3936355817668018, 'micro/f1': 0.4626292381447467, 'hammingloss': 0.25624182331474216}


In [None]:
modelcnn=CharacterLevelCNN(number_of_classes=10).to(device)

In [None]:
fit(10,modelcnn, cnn_train_loader, cnn_val_loader, 'icd10code', opt_fn,loss_fn, learning_rate)

----------icd10code----------


--------------------------------------------------
Epoch = 5/10, train_loss = 0.4096, train_accuracy = 0.2858, val_loss = 0.4262, val_accuracy = 0.2832
--------------------------------------------------




--------------------------------------------------
Epoch = 10/10, train_loss = 0.3417, train_accuracy = 0.3191, val_loss = 0.4809, val_accuracy = 0.2669
--------------------------------------------------




In [None]:
test_results(modelcnn, cnn_test_loader, "icd10code")

----------icd10code----------
{'micro/precision': 0.6130464480874317, 'micro/recall': 0.293925004093663, 'micro/f1': 0.3973436635307139, 'hammingloss': 0.18746126833298904}


In [None]:
modelcnn=CharacterLevelCNN(number_of_classes=10).to(device)

In [None]:
fit(10,modelcnn, cnn_train_loader, cnn_val_loader, 'icd10cat', opt_fn,loss_fn, learning_rate)

----------icd10cat----------


--------------------------------------------------
Epoch = 5/10, train_loss = 0.4312, train_accuracy = 0.2249, val_loss = 0.4641, val_accuracy = 0.2154
--------------------------------------------------




--------------------------------------------------
Epoch = 10/10, train_loss = 0.3759, train_accuracy = 0.2544, val_loss = 0.4763, val_accuracy = 0.2032
--------------------------------------------------




In [None]:
test_results(modelcnn, cnn_test_loader, "icd10cat")

----------icd10cat----------
{'micro/precision': 0.5789473684210527, 'micro/recall': 0.31150500088773153, 'micro/f1': 0.40506377820959266, 'hammingloss': 0.21292432692969773}


## Word Embedding

In [51]:
import nltk
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')).union(set(string.punctuation)).union(set(list(range(10)))) # stopwords + punctuation+ numbers

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [52]:
def preprocessing(text):
  words=word_tokenize(text)
  filtered_sentence = [] 
  # remove stopwords
  for word in words: 
    if word not in stop_words: 
        filtered_sentence.append(word) 
  
  # lemmatize
  lemma_word = []
  wordnet_lemmatizer = WordNetLemmatizer()
  for w in filtered_sentence:
    word1 = wordnet_lemmatizer.lemmatize(w, pos = "n")
    word2 = wordnet_lemmatizer.lemmatize(word1, pos = "v")
    word3 = wordnet_lemmatizer.lemmatize(word2, pos = ("a"))
    lemma_word.append(word3)
  return lemma_word



In [53]:
discharge_diagnosis_icd = pd.read_csv("/content/drive/MyDrive/AlgoIntern/Data/Data2.0/discharge_diagnosis_icd")

In [54]:
len(discharge_diagnosis_icd)

48410

In [55]:
len(train_diagnosis),len(test_diagnosis)

(33887, 14523)

In [56]:
# sum(discharge_diagnosis_icd['discharge_diagnosis'].progress_apply(lambda x: preprocessing(x)).apply(lambda x:len(x)).to_list())/len(discharge_diagnosis_icd)

In [57]:
# max(discharge_diagnosis_icd['discharge_diagnosis'].progress_apply(lambda x: preprocessing(x)).apply(lambda x:len(x)).to_list())

In [58]:
counts = Counter()
for _, row in tqdm(discharge_diagnosis_icd.iterrows(),total=len(discharge_diagnosis_icd), position=0, leave=True):
  counts.update(preprocessing(row['discharge_diagnosis']))
    

100%|██████████| 48410/48410 [00:27<00:00, 1745.98it/s]


In [59]:
counts

Counter({'diabetic': 813,
         'keotacidosis': 1,
         'hematemesis': 81,
         'blood': 1670,
         'vomit': 61,
         'hypertension': 7229,
         'chronic': 7854,
         'renal': 6350,
         'insufficiency': 1136,
         'peptic': 239,
         'ulcer': 1355,
         'gi': 1574,
         'bleed': 3187,
         ',1.': 10978,
         'multiple': 1231,
         'myeloma.,2': 2,
         'congestive': 2988,
         'obstructive': 1952,
         'pulmonary': 3749,
         'disease': 11171,
         'exacerbation.,3': 28,
         'embolism.,4': 4,
         'hyponatremia': 503,
         'small': 1195,
         'bowel': 853,
         'obstruction': 603,
         'internal': 606,
         'hernia': 1054,
         'necrotic': 49,
         'jejunum': 13,
         'pneumonia': 4393,
         'coronary': 9435,
         'artery': 10941,
         'cad-': 6,
         'ami': 7,
         'UNK': 9589,
         'lcx': 77,
         'cardiomyopathy-': 1,
         'ef,35-45

In [60]:
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

num_words before: 37763
num_words after: 15622


In [61]:
vocab2index = {"":0, "UNKNOWN":1}
words = ["", "UNKNOWN"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [62]:
len(vocab2index)

15624

In [63]:
def encode_sentence(text, vocab2index, N = 50):
  tokenized = preprocessing(text)
  encoded = np.zeros(N, dtype=int)
  enc1 = np.array([vocab2index.get(word, vocab2index["UNKNOWN"]) for word in tokenized])
  length = min(N, len(enc1))
  encoded[:length] = enc1[:length]
  return encoded

In [64]:
# diagnosis_rnn=diagnosis[['discharge diagnosis:','ICD9_CATEGORY_list']]

In [65]:
class rnndataset(Dataset):
  def __init__(self, train_df, test_df, train = True):
    self.train_df = train_df
    self.test_df = test_df
    self.nsamples_train = len(train_df)
    self.nsamples_test = len(test_df)
    self.train = train

  def __getitem__(self,index):
    if self.train:
      x = torch.from_numpy(np.array(encode_sentence(self.train_df['discharge_diagnosis'].iloc[index],vocab2index)))
      y = {}
      y['icd9code'] = torch.from_numpy(labeltarget(self.train_df["ICD9_CODE"].iloc[index], frequent_icd9code))
      y['icd9cat'] = torch.from_numpy(labeltarget(self.train_df["ICD9_CATEGORY"].iloc[index], frequent_icd9category))
      y['icd10code'] = torch.from_numpy(labeltarget(self.train_df["ICD10"].iloc[index], frequent_icd10code))
      y['icd10cat'] = torch.from_numpy(labeltarget(self.train_df["ICD10_CATEGORY"].iloc[index], frequent_icd10category))
      return x, y
    
    x = torch.from_numpy(np.array(encode_sentence(self.test_df['discharge_diagnosis'].iloc[index],vocab2index)))
    y = {}
    y['icd9code'] = torch.from_numpy(labeltarget(self.test_df["ICD9_CODE"].iloc[index], frequent_icd9code))
    y['icd9cat'] = torch.from_numpy(labeltarget(self.test_df["ICD9_CATEGORY"].iloc[index], frequent_icd9category))
    y['icd10code'] = torch.from_numpy(labeltarget(self.test_df["ICD10"].iloc[index], frequent_icd10code))
    y['icd10cat'] = torch.from_numpy(labeltarget(self.test_df["ICD10_CATEGORY"].iloc[index], frequent_icd10category))
    return x, y

  def __len__(self):
    if self.train:
      return self.nsamples_train
    return self.nsamples_test

In [66]:
rnn_train_dataset = rnndataset(train_diagnosis, test_diagnosis, train = True)
rnn_test_dataset = rnndataset(train_diagnosis, test_diagnosis, train = False)

In [67]:
rnn_train_loader = DataLoader(rnn_train_dataset, batch_size = batch_size, sampler=train_sampler)
rnn_val_loader = DataLoader(rnn_train_dataset, batch_size = batch_size, sampler=val_sampler)


In [68]:
rnn_test_loader = DataLoader(rnn_test_dataset, batch_size = batch_size)

In [69]:
for x,_ in rnn_test_loader:
  print(x.shape)
  break

torch.Size([64, 50])


In [79]:
class RNNmodel(nn.Module):
  def __init__(self, vocab_size, embedding_size, hidden_size, num_layers, num_classes = 10):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
    self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, batch_first=True)
    #x-->(batch_size,seq,input_size)
    self.fc1 = nn.Sequential(nn.Linear(hidden_size,128),
                             nn.ReLU()
    )
    self.fc2 = nn.Linear(128,10)

    self.act=nn.Sigmoid()

  def forward(self,x):
    x = self.embeddings(x)
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
    out,_=self.rnn(x,h0)
    #batch_size,seq_length,hidden_size
    out=out[:,-1,:]
    out = self.fc1(out)
    out = self.fc2(out)
    out=self.act(out)
    return out


In [80]:
class LSTMmodel(nn.Module) :
  def __init__(self, vocab_size, embedding_size, hidden_size, num_layers, num_classes = 10) :

    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
    self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)
    self.linear = nn.Linear(hidden_size, num_classes)
    self.act = nn.Sigmoid()
      
      
  def forward(self, x):
    x = self.embeddings(x)
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
    c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
    
    lstm_out, (ht, ct) = self.lstm(x,(h0,c0))
    out = self.linear(lstm_out[:,-1,:])
    return self.act(out)

In [81]:
rnnmodel = RNNmodel(vocab_size = 15624, embedding_size = 50, hidden_size = 256, num_layers = 2).to(device)
lstmmodel = LSTMmodel(vocab_size = 15624, embedding_size = 50, hidden_size = 100, num_layers = 2).to(device)

In [82]:
summary(rnnmodel,(50,),dtypes = [torch.long])

Layer (type:depth-idx)                   Output Shape              Param #
├─Embedding: 1-1                         [-1, 50, 50]              781,200
├─RNN: 1-2                               [-1, 50, 256]             210,432
├─Sequential: 1-3                        [-1, 128]                 --
|    └─Linear: 2-1                       [-1, 128]                 32,896
|    └─ReLU: 2-2                         [-1, 128]                 --
├─Linear: 1-4                            [-1, 10]                  1,290
├─Sigmoid: 1-5                           [-1, 10]                  --
Total params: 1,025,818
Trainable params: 1,025,818
Non-trainable params: 0
Total mult-adds (M): 1.06
Input size (MB): 0.00
Forward/backward pass size (MB): 0.12
Params size (MB): 3.91
Estimated Total Size (MB): 4.03


Layer (type:depth-idx)                   Output Shape              Param #
├─Embedding: 1-1                         [-1, 50, 50]              781,200
├─RNN: 1-2                               [-1, 50, 256]             210,432
├─Sequential: 1-3                        [-1, 128]                 --
|    └─Linear: 2-1                       [-1, 128]                 32,896
|    └─ReLU: 2-2                         [-1, 128]                 --
├─Linear: 1-4                            [-1, 10]                  1,290
├─Sigmoid: 1-5                           [-1, 10]                  --
Total params: 1,025,818
Trainable params: 1,025,818
Non-trainable params: 0
Total mult-adds (M): 1.06
Input size (MB): 0.00
Forward/backward pass size (MB): 0.12
Params size (MB): 3.91
Estimated Total Size (MB): 4.03

In [83]:
fit(50, rnnmodel, rnn_train_loader, rnn_val_loader, 'icd9code', opt_fn, loss_fn, learning_rate)

----------icd9code----------


--------------------------------------------------
Epoch = 1/50, train_loss = 0.4974, train_accuracy = 0.2231, val_loss = 0.4873, val_accuracy = 0.2207
--------------------------------------------------




--------------------------------------------------
Epoch = 2/50, train_loss = 0.4843, train_accuracy = 0.2304, val_loss = 0.4871, val_accuracy = 0.2204
--------------------------------------------------




--------------------------------------------------
Epoch = 3/50, train_loss = 0.4842, train_accuracy = 0.2301, val_loss = 0.4875, val_accuracy = 0.2204
--------------------------------------------------




--------------------------------------------------
Epoch = 4/50, train_loss = 0.4841, train_accuracy = 0.2301, val_loss = 0.4867, val_accuracy = 0.2202
--------------------------------------------------




--------------------------------------------------
Epoch = 5/50, train_loss = 0.4837, train_accuracy = 0.2288, val_loss = 0.4840, val_accurac

In [84]:
test_results(rnnmodel, rnn_test_loader, "icd9code")

----------icd9code----------
{'micro/precision': 0.6451300795527843, 'micro/recall': 0.2055066607307969, 'micro/f1': 0.3117159701841415, 'hammingloss': 0.18247607243682434}


In [85]:
fit(50, lstmmodel, rnn_train_loader, rnn_val_loader, 'icd9code', opt_fn, loss_fn, learning_rate)

----------icd9code----------


--------------------------------------------------
Epoch = 1/50, train_loss = 0.5243, train_accuracy = 0.1959, val_loss = 0.4857, val_accuracy = 0.2204
--------------------------------------------------




--------------------------------------------------
Epoch = 2/50, train_loss = 0.4748, train_accuracy = 0.2303, val_loss = 0.4679, val_accuracy = 0.2202
--------------------------------------------------




--------------------------------------------------
Epoch = 3/50, train_loss = 0.4607, train_accuracy = 0.2300, val_loss = 0.4602, val_accuracy = 0.2212
--------------------------------------------------




--------------------------------------------------
Epoch = 4/50, train_loss = 0.4542, train_accuracy = 0.2302, val_loss = 0.4588, val_accuracy = 0.2212
--------------------------------------------------




--------------------------------------------------
Epoch = 5/50, train_loss = 0.4492, train_accuracy = 0.2309, val_loss = 0.4519, val_accurac

In [86]:
test_results(lstmmodel, rnn_test_loader, "icd9code")

----------icd9code----------
{'micro/precision': 0.6555145362251962, 'micro/recall': 0.291873565973768, 'micro/f1': 0.4039049356680805, 'hammingloss': 0.1732217861323418}


In [None]:
# diagnosis_rnn

In [None]:
# diagnosis_rnn['target']=diagnosis_rnn['ICD9_CATEGORY_list'].apply(labeltarget)

In [None]:
# diagnosis_rnn

In [None]:
# total_tokens=diagnosis_rnn['tokenized'].to_list()

NameError: ignored

In [88]:
sent_tokens = discharge_diagnosis_icd['discharge_diagnosis'].apply(lambda x: preprocessing(x)).to_list()

In [89]:
sent_tokens[:2]

[['diabetic',
  'keotacidosis',
  'hematemesis',
  'blood',
  'vomit',
  'hypertension',
  'chronic',
  'renal',
  'insufficiency'],
 ['peptic', 'ulcer', 'gi', 'bleed']]

In [90]:
from gensim.models import Word2Vec

In [91]:
w2vmodel = Word2Vec(sent_tokens, min_count=1)

In [92]:
# summarize model
print(w2vmodel)

# summarize vocabulary
words = list(w2vmodel.wv.vocab)
print(words)


Word2Vec(vocab=37763, size=100, alpha=0.025)


In [93]:
w2vmodel.wv.most_similar('hypertension')[:5]

[(',hypertension', 0.8295971751213074),
 ('hypothyroidism', 0.7940075397491455),
 ('htn', 0.7615104913711548),
 ('gout', 0.7277899980545044),
 ('mellitus', 0.7221678495407104)]

In [94]:
def get_emb_matrix(word_vecs, word_counts, emb_size = 100):
  """ Creates embedding matrix from word vectors"""
  vocab_size = len(word_counts) + 2
  vocab_to_idx = {}
  vocab = ["", "UNKNOWN"]
  W = np.zeros((vocab_size, emb_size), dtype="float32")
  W[0] = np.zeros(emb_size, dtype='float32') # adding a vector for padding
  W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words 
  vocab_to_idx["UNKNOWN"] = 1
  i = 2
  for word in word_counts:
    if word in word_vecs:
      W[i] = word_vecs[word]
    else:
      W[i] = np.random.uniform(-0.25,0.25, emb_size)
    vocab_to_idx[word] = i
    vocab.append(word)
    i += 1   
  return W, np.array(vocab), vocab_to_idx

In [95]:
W,_,_ = get_emb_matrix(w2vmodel, counts)

  if sys.path[0] == '':
  del sys.path[0]


In [96]:
def create_emb_layer(weights_matrix, non_trainable=False):
  num_embeddings, embedding_dim = weights_matrix.shape
  emb_layer = nn.Embedding(num_embeddings, embedding_dim, padding_idx = 0)
  emb_layer.load_state_dict({'weight': torch.from_numpy(weights_matrix)})
  if non_trainable:
    emb_layer.weight.requires_grad = False

  return emb_layer, num_embeddings, embedding_dim

In [97]:
class LSTMw2vmodel(nn.Module) :
  def __init__(self, weights_matrix, hidden_size, num_layers, num_classes = 10) :

    super().__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.embeddings, num_embeddings, embedding_size = create_emb_layer(weights_matrix, True)
    self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)
    self.linear = nn.Linear(hidden_size, num_classes)
    self.act = nn.Sigmoid()
      
      
  def forward(self, x):     
    x = self.embeddings(x)
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
    c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
    lstm_out, (ht, ct) = self.lstm(x, (h0,c0))
    out = self.linear(lstm_out[:,-1,:])
    return self.act(out)

In [98]:
lstmw2vmodel = LSTMw2vmodel(weights_matrix = W, hidden_size = 100, num_layers = 2).to(device)

In [99]:
fit(50, lstmw2vmodel, rnn_train_loader, rnn_val_loader, 'icd9code', opt_fn, loss_fn, learning_rate)

----------icd9code----------


--------------------------------------------------
Epoch = 1/50, train_loss = 0.5288, train_accuracy = 0.1894, val_loss = 0.4835, val_accuracy = 0.2209
--------------------------------------------------




--------------------------------------------------
Epoch = 2/50, train_loss = 0.4620, train_accuracy = 0.2307, val_loss = 0.4597, val_accuracy = 0.2209
--------------------------------------------------




--------------------------------------------------
Epoch = 3/50, train_loss = 0.4465, train_accuracy = 0.2326, val_loss = 0.4467, val_accuracy = 0.2247
--------------------------------------------------




--------------------------------------------------
Epoch = 4/50, train_loss = 0.4411, train_accuracy = 0.2350, val_loss = 0.4424, val_accuracy = 0.2275
--------------------------------------------------




--------------------------------------------------
Epoch = 5/50, train_loss = 0.4362, train_accuracy = 0.2381, val_loss = 0.4364, val_accurac

In [100]:
test_results(lstmw2vmodel, rnn_test_loader, "icd9code")

----------icd9code----------
{'micro/precision': 0.7110295674264343, 'micro/recall': 0.34505667614122804, 'micro/f1': 0.464631559531495, 'hammingloss': 0.15988432142119396}


# Hybrid Model

In [113]:
class hybriddataset(Dataset):
  def __init__(self, train_df, test_df, train = True):
    self.train_df = train_df
    self.test_df = test_df
    self.nsamples_train = len(train_df)
    self.nsamples_test = len(test_df)
    self.train = train

  def __getitem__(self,index):
    if self.train:
      rnn_x = torch.from_numpy(np.array(encode_sentence(self.train_df['discharge_diagnosis'].iloc[index],vocab2index)))
      cnn_x = torch.from_numpy(character_embedding(character_index(self.train_df['discharge_diagnosis'].iloc[index], vocabulary, 140)))
      y = {}
      y['icd9code'] = torch.from_numpy(labeltarget(self.train_df["ICD9_CODE"].iloc[index], frequent_icd9code))
      y['icd9cat'] = torch.from_numpy(labeltarget(self.train_df["ICD9_CATEGORY"].iloc[index], frequent_icd9category))
      y['icd10code'] = torch.from_numpy(labeltarget(self.train_df["ICD10"].iloc[index], frequent_icd10code))
      y['icd10cat'] = torch.from_numpy(labeltarget(self.train_df["ICD10_CATEGORY"].iloc[index], frequent_icd10category))
      return rnn_x,cnn_x, y
    
    rnn_x = torch.from_numpy(np.array(encode_sentence(self.test_df['discharge_diagnosis'].iloc[index],vocab2index)))
    cnn_x = torch.from_numpy(character_embedding(character_index(self.test_df['discharge_diagnosis'].iloc[index], vocabulary, 140)))
    y = {}
    y['icd9code'] = torch.from_numpy(labeltarget(self.test_df["ICD9_CODE"].iloc[index], frequent_icd9code))
    y['icd9cat'] = torch.from_numpy(labeltarget(self.test_df["ICD9_CATEGORY"].iloc[index], frequent_icd9category))
    y['icd10code'] = torch.from_numpy(labeltarget(self.test_df["ICD10"].iloc[index], frequent_icd10code))
    y['icd10cat'] = torch.from_numpy(labeltarget(self.test_df["ICD10_CATEGORY"].iloc[index], frequent_icd10category))
    return rnn_x, cnn_x, y

  def __len__(self):
    if self.train:
      return self.nsamples_train
    return self.nsamples_test

In [114]:
hybrid_train_dataset = hybriddataset(train_diagnosis, test_diagnosis, train = True)
hybrid_test_dataset = hybriddataset(train_diagnosis, test_diagnosis, train = False)

In [115]:
hybrid_train_loader = DataLoader(hybrid_train_dataset, batch_size = batch_size, sampler=train_sampler)
hybrid_val_loader = DataLoader(hybrid_train_dataset, batch_size = batch_size, sampler=val_sampler)

In [116]:
hybrid_test_loader = DataLoader(hybrid_test_dataset, batch_size = batch_size)

In [117]:
class hybrid(nn.Module):
  def __init__(self, rnnmodel, cnnmodel, weights_matrix, hidden_size, num_layers=2, num_classes=10):
    super().__init__()

    self.rnnmodel = rnnmodel(weights_matrix, hidden_size, num_classes).to(device)
    self.cnnmodel = cnnmodel(num_classes).to(device)

    self.fc = nn.Linear(20,num_classes)

    self.act = nn.Sigmoid()
  
  def forward(self,rnninput,cnninput):
    rnn_out = self.rnnmodel(rnninput)
    cnn_out = self.cnnmodel(cnninput)
    x = torch.cat((rnn_out,cnn_out),dim=1)
    out = self.fc(x)
    out = self.act(out)
    return out


In [118]:
summary(hybrid, (50,), (69,140))

AttributeError: ignored

In [119]:
hybridmodel = hybrid(LSTMw2vmodel, CharacterLevelCNN, W, 100).to(device)

In [120]:
def fit(epochs, model, hybrid_train_loader, hybrid_val_loader, icdtype, opt_fn,loss_fn, learning_rate):
  optimizer = opt_fn(model.parameters(), lr=learning_rate)
  print('-'*10 + icdtype + '-'*10)
  for epoch in range(1,epochs+1):

    model.train()

    train_epoch_loss=0
    train_epoch_accuracy=0

    val_epoch_loss=0
    val_epoch_accuracy=0

    
    for rnn_x, cnn_x, y_dict in hybrid_train_loader:

      rnn_x = rnn_x.to(device)
      cnn_x = cnn_x.to(device)

      y = y_dict[icdtype]
      y = y.to(device)
      

      
      preds=model(rnn_x, cnn_x)

      optimizer.zero_grad()
      loss=loss_fn(preds,y)
      loss.backward()
      optimizer.step()
      
      accuracy = multi_acc(preds,y)

      train_epoch_loss+=loss.item()
      train_epoch_accuracy+=accuracy.item()
    
    model.eval()
    with torch.no_grad():
      for rnn_x, cnn_x, y_dict in hybrid_val_loader:
        
        rnn_x = rnn_x.to(device)
        cnn_x = cnn_x.to(device)

        y = y_dict[icdtype]
        y = y.to(device)
        
        preds=model(rnn_x, cnn_x)

        loss=loss_fn(preds,y)
        accuracy = multi_acc(preds,y)

        val_epoch_loss+=loss.item()
        val_epoch_accuracy+=accuracy.item()

    
    
    print("\n")
    print('-'*50)
    print('Epoch = {}/{}, train_loss = {:.4f}, train_accuracy = {:.4f}, val_loss = {:.4f}, val_accuracy = {:.4f}'.format(epoch
                                                              ,epochs
                                                              ,train_epoch_loss/len(hybrid_train_loader)
                                                              ,train_epoch_accuracy/len(hybrid_train_loader)
                                                              ,val_epoch_loss/len(hybrid_val_loader)
                                                              ,val_epoch_accuracy/len(hybrid_val_loader)
                                                              ))
    print('-'*50)
    print("\n")
    

In [121]:
def test_results(model, hybrid_test_loader, icdtype):

  model.eval()
  with torch.no_grad():
    model_result = []
    targets = []
    for rnn_x, cnn_x, batch_targets in hybrid_test_loader:
      rnn_x = rnn_x.to(device)
      cnn_x = cnn_x.to(device)

      model_batch_result = model(rnn_x, cnn_x)
      model_result.extend(model_batch_result.cpu().numpy())
      targets.extend(batch_targets[icdtype].cpu().numpy())

  result = calculate_metrics(np.array(model_result), np.array(targets))
  print('-'*10 + icdtype + '-'*10)
  print(result)

In [124]:
fit(50, hybridmodel, hybrid_train_loader, hybrid_val_loader, "icd9code", opt_fn, loss_fn, learning_rate)

----------icd9code----------


--------------------------------------------------
Epoch = 1/50, train_loss = 0.3649, train_accuracy = 0.2645, val_loss = 0.4568, val_accuracy = 0.2222
--------------------------------------------------




--------------------------------------------------
Epoch = 2/50, train_loss = 0.3639, train_accuracy = 0.2660, val_loss = 0.4510, val_accuracy = 0.2163
--------------------------------------------------




--------------------------------------------------
Epoch = 3/50, train_loss = 0.3623, train_accuracy = 0.2664, val_loss = 0.4509, val_accuracy = 0.2107
--------------------------------------------------




--------------------------------------------------
Epoch = 4/50, train_loss = 0.3606, train_accuracy = 0.2690, val_loss = 0.4499, val_accuracy = 0.2142
--------------------------------------------------




--------------------------------------------------
Epoch = 5/50, train_loss = 0.3601, train_accuracy = 0.2704, val_loss = 0.4510, val_accurac

KeyboardInterrupt: ignored

In [125]:
test_results(hybridmodel, hybrid_test_loader, 'icd9code')

----------icd9code----------
{'micro/precision': 0.56279538723297, 'micro/recall': 0.3058456902160885, 'micro/f1': 0.39631684047037935, 'hammingloss': 0.18734421262824485}
