In [1]:
!pip install transformers
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m96.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2
Looking in in

In [2]:
import os
import re
import string

import numpy as np
import pandas as pd
from sklearn import metrics

import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize import word_tokenize, regexp_tokenize

import tensorflow as tf




In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [70]:
dfbert = pd.read_csv('/content/drive/MyDrive/df_bert.csv').reset_index(drop=True).rename(columns={'discharge_instruction': 'Text'}).dropna()
dfbert.head()
dfbert.shape  #(263941, 12 without dropna() and 261639 after dropna() lost 2302 rows)

(261639, 12)

In [71]:
dfbert = dfbert.sample(10000)
dfbert.shape

(10000, 12)

In [72]:
df_train, df_dev = train_test_split(dfbert, random_state=42, test_size=0.33, shuffle=True)

In [73]:
print('train has (columns, variables);', df_train.shape, 'and test has (columns, variables):', df_dev.shape)
print('train head;', df_train.head(5), '\n -----------------\n test head', df_dev.head(5))

train has (columns, variables); (6700, 12) and test has (columns, variables): (3300, 12)
train head;         subject_id   hadm_id  \
155098    16889089  25347452   
29674     15275541  27660589   
243207    17083592  20333067   
5478      11021643  21279207   
15426     12717676  21112839   

                                                     Text  icd_E11  icd_E78  \
155098                                              none         0        1   
29674    you were sent to the hospital from clinic bec...        0        0   
243207  dear you were admitted to the acute care surge...        0        0   
5478    dear  it was a pleasure taking care of you at ...        1        1   
15426   you were transfered to  for cardiac catheteriz...        1        0   

        icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  icd_Z85  
155098        0        0        0        0        0        1        0  
29674         0        0        0        1        0        0        0  
243207        0

In [74]:
df_train[df_train['Text']=='']

Unnamed: 0,subject_id,hadm_id,Text,icd_E11,icd_E78,icd_E87,icd_F32,icd_I16,icd_I50,icd_N17,icd_Y92,icd_Z85


In [75]:
print(df_train.shape, df_train.columns)
print(df_dev.shape)

(6700, 12) Index(['subject_id', 'hadm_id', 'Text', 'icd_E11', 'icd_E78', 'icd_E87',
       'icd_F32', 'icd_I16', 'icd_I50', 'icd_N17', 'icd_Y92', 'icd_Z85'],
      dtype='object')
(3300, 12)


In [76]:
df_train = df_train.rename(columns={'discharge_instruction': 'Text'}).reset_index(drop=True).dropna()
df_dev = df_dev.rename(columns={'discharge_instruction': 'Text'}).reset_index(drop=True).dropna()

In [77]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [78]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 2e-5
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [79]:
target_cols = [col for col in df_train.columns if col not in ['subject_id', 'hadm_id', 'Text']]
target_cols

['icd_E11',
 'icd_E78',
 'icd_E87',
 'icd_F32',
 'icd_I16',
 'icd_I50',
 'icd_N17',
 'icd_Y92',
 'icd_Z85']

In [80]:
class BERTDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.text = df.Text
        self.tokenizer = tokenizer
        self.targets = df[target_cols].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [81]:
train_dataset = BERTDataset(df_train, tokenizer, MAX_LEN)
valid_dataset = BERTDataset(df_dev, tokenizer, MAX_LEN)

In [82]:
train_dataset.text.loc[3000]

'division of vascular and endovascular surgerycarotid endarterectomy surgery discharge instructions'

In [83]:
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, 
                          num_workers=4, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, 
                          num_workers=4, shuffle=False, pin_memory=True)

In [84]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.roberta = AutoModel.from_pretrained('roberta-base')
#         self.l2 = torch.nn.Dropout(0.3)
        self.fc = torch.nn.Linear(768, len(target_cols))
    
    def forward(self, ids, mask, token_type_ids):
        _, features = self.roberta(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output_1)
        output = self.fc(features)
        return output

model = BERTClass()
model.to(device);

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [85]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [86]:
optimizer = AdamW(params =  model.parameters(), lr=LEARNING_RATE, weight_decay=1e-6)

In [87]:
def train(epoch):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)
        print('made it here')

        loss = loss_fn(outputs, targets)
        if _%500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [88]:
for epoch in range(EPOCHS):
    train(epoch)

made it here
Epoch: 0, Loss:  0.7133865356445312
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
made it here
ma

In [49]:
# import gc
# # model.cpu()
# #del model
# gc.collect()
# torch.cuda.empty_cache()

In [89]:
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(valid_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [90]:
outputs, targets = validation()
outputs = np.array(outputs) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.0693939393939394
F1 Score (Micro) = 0.4169666614493661
F1 Score (Macro) = 0.3692548966992083


In [91]:
outputs.astype(int)

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [92]:
valid_dataset.targets

array([[1, 0, 0, ..., 1, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 1, ..., 1, 1, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [93]:
y_true = valid_dataset.targets
y_pred = outputs.astype(int)

In [94]:
def Hamming_Loss(y_true, y_pred):
    temp=0
    for i in range(y_true.shape[0]):
        temp += np.size(y_true[i] == y_pred[i]) - np.count_nonzero(y_true[i] == y_pred[i])
    return temp/(y_true.shape[0] * y_true.shape[1])
    
# print('Hamming_loss with the discharge_instruction as variable is', Hamming_Loss(y_true, y_pred))

In [95]:
print('Hamming_loss with the discharge_instruction as variable is 10000 observation:', Hamming_Loss(y_true, y_pred))

Hamming_loss with the discharge_instruction as variable is 10000 observation: 0.25084175084175087


In [None]:
torch.save(model.state_dict(), 'model.bin')