In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
% cd /content/drive/MyDrive

/content/drive/MyDrive


In [4]:
%pwd

'/content/drive/MyDrive'

In [5]:
import sys
env_dir = '/content/drive/MyDrive/env_nlp'
sys.path.append(env_dir)

### **Change runtime to GPU PRIOR to executing this code**


In [6]:
import os
import sys
from getpass import getpass
import urllib
import joblib
from tqdm import tqdm

In [7]:
# Load Training dependencies
import torch
from torch.optim import AdamW
import numpy as np
from sklearn import model_selection
from transformers import get_linear_schedule_with_warmup

In [8]:
# check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

device: cuda


In [None]:
# 
#!pip install --target=$env_dir spacy

In [9]:
user = input('User name: ')
password = getpass('Password: ')
password = urllib.parse.quote(password) # your password is converted into url format
repo_name = input('Repo name: ')

cmd_string = 'git clone https://{0}:{1}@github.com/{0}/{2}.git'.format(user, password, repo_name)

User name: sradical
Password: ··········
Repo name: ClinicalBert-Entity-Extraction


In [10]:
os.system(cmd_string)
cmd_string, password = "", "" # removing the password from the variable

In [11]:
# CHANGE directory to repository 
% cd ClinicalBert-Entity-Extraction

/content/drive/MyDrive/ClinicalBert-Entity-Extraction


In [12]:
sys.path.insert(0, 'ClinicalBert-Entity-Extraction/src/')
sys.path.insert(0, 'ClinicalBert-Entity-Extraction/input/')

In [13]:
% cd src

/content/drive/MyDrive/ClinicalBert-Entity-Extraction/src


In [15]:
import config
import process_input
import dataset
import model
import engine

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

In [16]:
print(config.NCBI_TRAINING_FILE)

../input/NCBITraining_tagged.pkl


In [17]:
sentences, pos, tag, enc_pos, enc_tag = process_input.inputdata(config.NCBI_TRAINING_FILE)

In [18]:
(train_sentences, 
 test_sentences, 
 train_pos, 
 test_pos, 
 train_tag, 
 test_tag ) = model_selection.train_test_split(sentences, pos, tag, random_state=42, test_size=0.1)

In [19]:
train_dataset = dataset.EntityDataset(train_sentences, train_pos, train_tag)
valid_dataset = dataset.EntityDataset(test_sentences, test_pos, test_tag)

In [20]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=0)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=0)

In [21]:
num_pos = len(list(enc_pos.classes_))
num_tag = len(list(enc_tag.classes_))

In [22]:
model = model.EntityModel(num_pos=num_pos, num_tag=num_tag)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
model = model.to(device)

In [24]:
training_steps = int(len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
print("Number of training steps {}".format(training_steps))

Number of training steps 869


In [25]:
param_optimizer = list(model.named_parameters())

In [26]:
# Parameters EXCLUDED from optimization
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
     {
        "params" : [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        "weight_decay" : 0.001,
     }, 
     {
        "params" : [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay" : 0.0,
     },
]

In [27]:
optimizer = AdamW(optimizer_parameters, lr = 3e-5)

In [28]:
scheduler = get_linear_schedule_with_warmup(optimizer = optimizer, num_warmup_steps=0, num_training_steps=training_steps)

In [30]:
best_loss = np.inf
for epoch in range(config.EPOCHS):
  train_loss = engine.train_fn(train_loader, model, optimizer, device, scheduler)
  valid_loss = engine.eval_fn(valid_loader, model, device)
  print(f"Train Loss = {train_loss} Valid Loss = {valid_loss}")
  if valid_loss < best_loss:
    torch.save(model.state_dict(), config.MODEL_PATH)
    best_loss = valid_loss    

100%|██████████| 87/87 [02:41<00:00,  1.86s/it]
100%|██████████| 39/39 [00:09<00:00,  3.92it/s]


Train Loss = 0.7520667407019385 Valid Loss = 0.24602713072911286


100%|██████████| 87/87 [02:42<00:00,  1.87s/it]
100%|██████████| 39/39 [00:10<00:00,  3.89it/s]


Train Loss = 0.2300097483328019 Valid Loss = 0.18068350985264167


100%|██████████| 87/87 [02:24<00:00,  1.66s/it]
100%|██████████| 39/39 [00:05<00:00,  7.00it/s]


Train Loss = 0.17479681078044848 Valid Loss = 0.16446324705313414


100%|██████████| 87/87 [02:02<00:00,  1.41s/it]
100%|██████████| 39/39 [00:05<00:00,  7.02it/s]


Train Loss = 0.14371045747365074 Valid Loss = 0.15946985590152252


100%|██████████| 87/87 [02:02<00:00,  1.40s/it]
100%|██████████| 39/39 [00:05<00:00,  7.03it/s]


Train Loss = 0.12224887422789102 Valid Loss = 0.16263672633048815


100%|██████████| 87/87 [02:01<00:00,  1.40s/it]
100%|██████████| 39/39 [00:05<00:00,  7.01it/s]


Train Loss = 0.10746322713535407 Valid Loss = 0.15312308445572853


100%|██████████| 87/87 [02:02<00:00,  1.41s/it]
100%|██████████| 39/39 [00:05<00:00,  7.00it/s]


Train Loss = 0.09496827753282142 Valid Loss = 0.15818100307996458


100%|██████████| 87/87 [02:01<00:00,  1.40s/it]
100%|██████████| 39/39 [00:05<00:00,  7.01it/s]


Train Loss = 0.08406302718252971 Valid Loss = 0.15320741901030907


100%|██████████| 87/87 [02:01<00:00,  1.40s/it]
100%|██████████| 39/39 [00:05<00:00,  7.01it/s]


Train Loss = 0.07742918800862356 Valid Loss = 0.15270678288279435


100%|██████████| 87/87 [02:01<00:00,  1.40s/it]
100%|██████████| 39/39 [00:05<00:00,  7.03it/s]

Train Loss = 0.07187090775576131 Valid Loss = 0.15320667662681678





In [31]:
% ls

config.py   engine.py  model.bin  predict.py        [0m[01;34m__pycache__[0m/
dataset.py  meta.bin   model.py   process_input.py  train.py


**Predict**

In [52]:
meta_data = joblib.load("meta.bin")
enc_pos = meta_data["enc_pos"]
enc_tag = meta_data["enc_tag"]

In [54]:
num_pos = len(list(enc_pos.classes_))
num_tag = len(list(enc_tag.classes_))

In [55]:
sentence = """The risk of cancer, especially lymphoid neoplasias is elevated. """
print(sentence)

The risk of cancer, especially lymphoid neoplasias is elevated. 


In [56]:
tokenized_sentence = config.TOKENIZER(sentence)
print(tokenized_sentence)

{'input_ids': [101, 1103, 3187, 1104, 4182, 117, 2108, 181, 25698, 7874, 15242, 1643, 22992, 1116, 1110, 8208, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [57]:
test_dataset = dataset.EntityDataset(
    texts = [sentence], 
    tag = [[0] * len(sentence)],
    pos = [[0] * len(sentence)]
)

In [70]:
import model
model = model.EntityModel(num_tag=num_tag, num_pos=num_pos)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [71]:
model.load_state_dict(torch.load(config.MODEL_PATH))

<All keys matched successfully>

In [72]:
model.to(device)

EntityModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [73]:
with torch.no_grad():
  data = test_dataset[0]
  for k, v in data.items():
    data[k] = v.to(device).unsqueeze(0)
  tag, pos, _ = model(**data)

In [74]:
print(   
    enc_tag.inverse_transform(
    tag.argmax(2).cpu().numpy().reshape(-1)
    )[:len(tokenized_sentence)]
 )
print(
    enc_pos.inverse_transform(
    pos.argmax(2).cpu().numpy().reshape(-1)
  )[:len(tokenized_sentence)]
)

['-' 'O' 'O']
['ADJ' 'PROPN' 'PROPN']
