<a href="https://colab.research.google.com/github/sradical/ClinicalBert-Entity-Extraction/blob/main/ClinicalBertTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import sys
env_dir = '/content/drive/MyDrive/env_nlp'
sys.path.append(env_dir)

### **Change runtime to GPU PRIOR to executing this code**


In [3]:
import os
import sys
from getpass import getpass
import urllib
import joblib
from tqdm import tqdm

In [4]:
# Load Training dependencies
import torch
from torch.optim import AdamW
import numpy as np
from sklearn import model_selection
from transformers import get_linear_schedule_with_warmup

In [5]:
# check if CUDA is available
use_cuda = torch.cuda.is_available()
print(use_cuda)

if use_cuda:
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

print(f"device: {device}")

True
device: cuda


In [None]:
# 
#!pip install --target=$env_dir spacy

In [6]:
user = input('User name: ')
password = getpass('Password: ')
password = urllib.parse.quote(password) # your password is converted into url format
repo_name = input('Repo name: ')

cmd_string = 'git clone https://{0}:{1}@github.com/{0}/{2}.git'.format(user, password, repo_name)

User name: sradical
Password: ··········
Repo name: ClinicalBert-Entity-Extraction


In [7]:
os.system(cmd_string)
cmd_string, password = "", "" # removing the password from the variable

In [8]:
# CHANGE directory to repository 
% cd ClinicalBert-Entity-Extraction

/content/ClinicalBert-Entity-Extraction


In [9]:
sys.path.insert(0, 'ClinicalBert-Entity-Extraction/src/')
sys.path.insert(0, 'ClinicalBert-Entity-Extraction/input/')

In [10]:
% cd src

/content/ClinicalBert-Entity-Extraction/src


In [11]:
import config
import process_input
import dataset
import model
import engine

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

In [12]:
print(config.NCBI_TRAINING_FILE)

../input/NCBITraining_tagged.pkl


In [13]:
sentences, pos, tag, enc_pos, enc_tag = process_input.inputdata(config.NCBI_TRAINING_FILE)

In [14]:
(train_sentences, 
 test_sentences, 
 train_pos, 
 test_pos, 
 train_tag, 
 test_tag ) = model_selection.train_test_split(sentences, pos, tag, random_state=42, test_size=0.1)

In [15]:
train_dataset = dataset.EntityDataset(train_sentences, train_pos, train_tag)
valid_dataset = dataset.EntityDataset(test_sentences, test_pos, test_tag)

In [16]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=0)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=0)

In [None]:
#for (idx, data) in enumerate(train_loader):
#  for k, v in data.items():
#    data[k] = [t.to(device) for t in v]
#    pass

In [18]:
num_pos = len(list(enc_pos.classes_))
num_tag = len(list(enc_tag.classes_))

In [19]:
model = model.EntityModel(num_pos=num_pos, num_tag=num_tag)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
model = model.to(device)

In [21]:
training_steps = int(len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
print("Number of training steps {}".format(training_steps))

Number of training steps 869


In [22]:
param_optimizer = list(model.named_parameters())

In [23]:
# Parameters EXCLUDED from optimization
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
     {
        "params" : [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        "weight_decay" : 0.001,
     }, 
     {
        "params" : [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay" : 0.0,
     },
]

In [24]:
optimizer = AdamW(optimizer_parameters, lr = 3e-5)

In [25]:
scheduler = get_linear_schedule_with_warmup(optimizer = optimizer, num_warmup_steps=0, num_training_steps=training_steps)

In [27]:
best_loss = np.inf
for epoch in range(config.EPOCHS):
  train_loss = engine.train_fn(train_loader, model, optimizer, device, scheduler)
  valid_loss = engine.eval_fn(valid_loader, model, device)
  print(f"Train Loss = {train_loss} Valid Loss = {valid_loss}")
  if valid_loss < best_loss:
    torch.save(model.state_dict, config.MODEL_PATH)
    best_loss = valid_loss    

100%|██████████| 87/87 [00:47<00:00,  1.83it/s]
