In [1]:
!pip install transformers
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m99.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m114.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [2]:
import os
import re
import string

import numpy as np
import pandas as pd
from sklearn import metrics

import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize import word_tokenize, regexp_tokenize

import tensorflow as tf




In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
dfbert = pd.read_csv('/content/drive/MyDrive/df_bert.csv').reset_index(drop=True).rename(columns={'discharge_instruction': 'Text'}).dropna()
dfbert.head()
dfbert.shape  #(263941, 12 without dropna() and 261639 after dropna() lost 2302 rows)

(261639, 12)

In [5]:
dfbert = dfbert.sample(25000, random_state=42) #we added the random_state so that we can compare the models across different logics
dfbert.shape

(25000, 12)

In [6]:
df_train, df_dev = train_test_split(dfbert, random_state=42, test_size=0.33, shuffle=True)

In [7]:
print('train has (columns, variables);', df_train.shape, 'and test has (columns, variables):', df_dev.shape)
print('train head;', df_train.head(5), '\n -----------------\n test head', df_dev.head(5))

train has (columns, variables); (16750, 12) and test has (columns, variables): (8250, 12)
train head;         subject_id   hadm_id  \
161089    17887416  26347194   
218175    13577943  29183660   
148475    15718099  27065743   
40323     17145096  22472768   
192795    11907503  24738613   

                                                     Text  icd_E11  icd_E78  \
161089  you were admitted after having a cardiac cathe...        0        1   
218175  you have undergone the following operation ant...        0        0   
148475  you came to the hospital after staff at  were ...        0        1   
40323    you were admitted to  with constipation and d...        1        1   
192795  dear    it was a privilege caring for you at  ...        0        0   

        icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  icd_Z85  
161089        0        0        0        0        0        0        0  
218175        0        0        1        0        0        0        0  
148475        

In [None]:
# df_train[df_train['Text']=='']

In [26]:
print(df_train.shape, df_train.columns)
print(df_dev.shape)

(16750, 12) Index(['subject_id', 'hadm_id', 'Text', 'icd_E11', 'icd_E78', 'icd_E87',
       'icd_F32', 'icd_I16', 'icd_I50', 'icd_N17', 'icd_Y92', 'icd_Z85'],
      dtype='object')
(8250, 12)


In [27]:
df_train = df_train.rename(columns={'discharge_instruction': 'Text'}).reset_index(drop=True).dropna()
df_dev = df_dev.rename(columns={'discharge_instruction': 'Text'}).reset_index(drop=True).dropna()

In [28]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [29]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 2e-5
# tokenizer = AutoTokenizer.from_pretrained('roberta-base') 
# tokenizer = AutoTokenizer.from_pretrained('bert-based-uncased') 
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
#bert-base-uncased roberta-base
#change in the mdoel also

In [30]:
target_cols = [col for col in df_train.columns if col not in ['subject_id', 'hadm_id', 'Text']]
target_cols

['icd_E11',
 'icd_E78',
 'icd_E87',
 'icd_F32',
 'icd_I16',
 'icd_I50',
 'icd_N17',
 'icd_Y92',
 'icd_Z85']

In [31]:
class BERTDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.text = df.Text
        self.tokenizer = tokenizer
        self.targets = df[target_cols].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [32]:
train_dataset = BERTDataset(df_train, tokenizer, MAX_LEN)
valid_dataset = BERTDataset(df_dev, tokenizer, MAX_LEN)

In [33]:
train_dataset.text.loc[3000]

'dear  thank you for choosing  your site of carewhy was i admitted to the hospitalyou were admitted because you had elevated liver values what was done for me while i was in the hospitalyour liver values and mental status was monitored throughout your hospitalization your mental status remained unchanged and your liver values improvedyou had a liver biopsy which demonstrated that it was likely your metformin that caused the liver injuryyou had a headache imaging of your head did not demonstrate any bleeding in your head it was suspected that the headache was from poor sleep and neck tightness and it improved with time and fluidsyour blood counts decreased after your liver biopsy you were monitored for bleeding and none was foundyour kidney value increased likely due to dehydration you were given fluids through your iv with improvement in kidney value what should i do when i leave the hospitalplease keep well hydrated by taking in good amounts of water and broth avoid sugary juices for 

In [34]:
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, 
                          num_workers=4, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, 
                          num_workers=4, shuffle=False, pin_memory=True)

In [35]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        # self.roberta = AutoModel.from_pretrained('roberta-base')
        # elf.bertbase = AutoModel.from_pretrained('bert-base-uncased')
        self.clnbert = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
#         self.l2 = torch.nn.Dropout(0.3)
        self.fc = torch.nn.Linear(768, len(target_cols))
    
    def forward(self, ids, mask, token_type_ids):
        # _, features = self.roberta(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        # _, features = self.bertbase(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        _, features = self.clnbert(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output_1)
        output = self.fc(features)
        return output

model = BERTClass()
model.to(device);

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [37]:
optimizer = AdamW(params =  model.parameters(), lr=LEARNING_RATE, weight_decay=1e-6)

In [38]:
def train(epoch):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)
        print('made it here')

        loss = loss_fn(outputs, targets)
        if _%500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [39]:
for epoch in range(EPOCHS):
    train(epoch)

made it here
Epoch: 0, Loss:  0.7024855017662048
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
ma

In [25]:
# import gc
# # # model.cpu()
# del model
# gc.collect()
# torch.cuda.empty_cache()

In [40]:
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(valid_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [41]:
outputs, targets = validation()
outputs = np.array(outputs) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.08375757575757575
F1 Score (Micro) = 0.42436028027286476
F1 Score (Macro) = 0.39220560706440794


In [42]:
outputs.astype(int)

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [43]:
valid_dataset.targets

array([[1, 0, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 1, 0, ..., 0, 1, 0],
       [0, 1, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [44]:
y_true = valid_dataset.targets
y_pred = outputs.astype(int)

In [45]:
def Hamming_Loss(y_true, y_pred):
    temp=0
    for i in range(y_true.shape[0]):
        temp += np.size(y_true[i] == y_pred[i]) - np.count_nonzero(y_true[i] == y_pred[i])
    return temp/(y_true.shape[0] * y_true.shape[1])
    
# print('Hamming_loss with the discharge_instruction as variable is', Hamming_Loss(y_true, y_pred))

In [47]:
print('Hamming_loss with the discharge_instruction as variable is 25000 observation:', Hamming_Loss(y_true, y_pred))

Hamming_loss with the discharge_instruction as variable is 25000 observation: 0.25116498316498315


In [48]:
print(target_cols, (abs(np.diff([y_true, y_pred], axis=0)).sum(axis=1)/len(y_true))*100)

['icd_E11', 'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50', 'icd_N17', 'icd_Y92', 'icd_Z85'] [[31.96363636 41.32121212 21.75757576 24.94545455 31.13939394 16.12121212
  18.19393939 19.44242424 21.16363636]]


In [49]:
100-(abs(np.diff([y_true, y_pred], axis=0)).sum(axis=1)/len(y_true))*100

array([[68.03636364, 58.67878788, 78.24242424, 75.05454545, 68.86060606,
        83.87878788, 81.80606061, 80.55757576, 78.83636364]])

In [None]:
torch.save(model.state_dict(), 'model.bin')