In [None]:
# Mount Google Drive into the Colab filesystem; force_remount=True is passed so
# the mount is redone even when a previous mount exists.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
% cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
# Sanity check: display the current working directory after the directory
# change in the previous cell.
%pwd

'/content/drive/MyDrive'

In [None]:
import sys

# Drive-backed directory used as an extra package location (the commented-out
# `pip install --target=$env_dir` cell further down installs into it).
env_dir = '/content/drive/MyDrive/env_nlp'
# Guard the append so re-running this cell does not pile up duplicate entries.
if env_dir not in sys.path:
    sys.path.append(env_dir)

### **Change runtime to GPU PRIOR to executing this code**


In [None]:
import os
import sys
from getpass import getpass
import urllib
import joblib
from tqdm import tqdm

In [None]:
# Load Training dependencies
import torch
from torch.optim import AdamW
import numpy as np
from sklearn import model_selection
from transformers import get_linear_schedule_with_warmup

In [None]:
# Prefer the GPU when CUDA is present; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
map_location = torch.device(_device_name)
print(f"device: {device} map_location: {map_location}")

device: cuda map_location: cuda


In [None]:
# One-time setup, left disabled: install spaCy into the Drive-backed env_dir.
#!pip install --target=$env_dir spacy

In [None]:
# Collect GitHub credentials interactively so nothing secret is stored in the notebook.
import urllib.parse  # `import urllib` alone does not guarantee the `parse` submodule is loaded

user = input('User name: ')
password = getpass('Password: ')
# Percent-encode every reserved character (safe='' also escapes '/', which
# quote() leaves untouched by default) so the password cannot break the
# user:password@host part of the clone URL.
password = urllib.parse.quote(password, safe='')
repo_name = input('Repo name: ')

# NOTE: the credentials are embedded in this string; a later cell blanks it after use.
cmd_string = 'git clone https://{0}:{1}@github.com/{0}/{2}.git'.format(user, password, repo_name)

User name: sradical
Password: ··········
Repo name: ClinicalBert-Entity-Extraction


In [None]:
# Run the clone and blank the credential-bearing variables even if the call is
# interrupted part-way through.
try:
    clone_status = os.system(cmd_string)
finally:
    cmd_string, password = "", ""  # removing the password from the variable
# os.system does not raise on failure, so surface a non-zero status explicitly.
# Only a warning is printed so that a re-run with the repository already
# present on Drive can still continue with the following cells.
if clone_status != 0:
    print(
        f"git clone exited with non-zero status {clone_status} "
        "(bad credentials, wrong repo name, or the target directory already exists)"
    )

In [None]:
# CHANGE directory to repository 
% cd ClinicalBert-Entity-Extraction

/content/drive/MyDrive/ClinicalBert-Entity-Extraction


In [None]:
# Relative sys.path entries are resolved against the working directory at import
# time. The notebook has already changed into the repository, so a relative
# 'ClinicalBert-Entity-Extraction/src/' entry would point at a nested
# ClinicalBert-Entity-Extraction/ClinicalBert-Entity-Extraction/... path.
# Anchor both entries to the repository root instead.
# NOTE: run this cell right after the cd into the repository; it relies on the
# current working directory being the repository root.
repo_root = os.getcwd()
sys.path.insert(0, os.path.join(repo_root, 'src'))
sys.path.insert(0, os.path.join(repo_root, 'input'))

In [None]:
% cd src

/content/drive/MyDrive/ClinicalBert-Entity-Extraction/src


In [None]:
import config
import process_input
import dataset
import model
import engine

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

In [None]:
##
## PATH TO DATAFILE
##

# The training-data location is read from the project's config module; it is
# printed so the run log records which file was used.
datafile = config.NCBI_TRAINING_FILE
print(datafile)

../input/NCBITraining_tagged.pkl


In [None]:
# Load the tagged dataset through the project helper, which returns five values:
# the sentences, their POS labels, their entity tags, and two encoder objects
# (enc_pos, enc_tag) — presumably fitted label encoders, since `classes_` is
# read from them further down.
# NOTE(review): the configured file is a .pkl; if inputdata unpickles it, make
# sure the file comes from a trusted source.
sentences, pos, tag, enc_pos, enc_tag = process_input.inputdata(datafile)

In [None]:
# Bundle both encoders into one dict and write it to meta.bin in the current
# working directory.
meta_data = dict(enc_pos=enc_pos, enc_tag=enc_tag)
joblib.dump(meta_data, "meta.bin")

In [None]:
# Hold out 10% of the examples, splitting sentences, POS labels and tags in
# lockstep; the fixed random_state keeps the split reproducible.
split_parts = model_selection.train_test_split(
    sentences, pos, tag, test_size=0.1, random_state=42
)
train_sentences, test_sentences, train_pos, test_pos, train_tag, test_tag = split_parts

In [None]:
# Wrap each split in the project's EntityDataset. The held-out 10% (the test_*
# variables) is used as the validation set during training.
train_dataset = dataset.EntityDataset(train_sentences, train_pos, train_tag)
valid_dataset = dataset.EntityDataset(test_sentences, test_pos, test_tag)

In [None]:
# Batch sizes come from config; num_workers=0 loads batches in the main process.
# NOTE(review): the training loader is created without shuffle=True, so
# (presumably, for a map-style dataset) batches arrive in the same order every
# epoch — confirm this is intended.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=0)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=0)

In [None]:
# Number of distinct POS labels and entity tags; both counts are passed to the
# model constructor. (Assumes `classes_` is a sized sequence/array.)
num_pos = len(enc_pos.classes_)
num_tag = len(enc_tag.classes_)

In [None]:
# `model` starts out as the imported project module and is rebound to the
# network instance below. Fetch the module under its own name first so this
# cell still works when re-run; otherwise `model.EntityModel` would be looked
# up on the network instance and fail.
import model as model_module  # already imported, so this is a cheap sys.modules lookup

model = model_module.EntityModel(config.BASE_MODEL, num_pos=num_pos, num_tag=num_tag)

In [None]:
# Move the model to the device selected earlier (GPU when CUDA is available).
model = model.to(device)

In [None]:
# Total scheduler steps = batches per epoch * epochs (assumes engine.train_fn
# steps the scheduler once per batch — confirm). len(train_loader) includes the
# final partial batch; dividing the sentence count by the batch size and
# truncating under-counted the total (869 instead of 87 * 10 = 870 for this run).
training_steps = len(train_loader) * config.EPOCHS
print("Number of training steps {}".format(training_steps))

Number of training steps 869


In [None]:
# Materialise the (name, parameter) pairs once so they can be filtered by name
# into optimizer parameter groups.
param_optimizer = list(model.named_parameters())

In [None]:
# Parameters whose name contains one of these markers are exempt from weight
# decay (they are still optimized); every other parameter decays at 0.001.
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

decay_params, no_decay_params = [], []
for param_name, param in param_optimizer:
    if any(marker in param_name for marker in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_parameters = [
    {"params": decay_params, "weight_decay": 0.001},
    {"params": no_decay_params, "weight_decay": 0.0},
]

In [None]:
# AdamW over the two parameter groups defined above, with a base learning rate
# of 3e-5.
optimizer = AdamW(optimizer_parameters, lr = 3e-5)

In [None]:
# Linear learning-rate schedule over `training_steps` steps with no warm-up
# phase (num_warmup_steps=0).
scheduler = get_linear_schedule_with_warmup(optimizer = optimizer, num_warmup_steps=0, num_training_steps=training_steps)

In [None]:
# Train for config.EPOCHS epochs, checkpointing whenever validation loss improves.
best_loss = float("inf")
for epoch in range(config.EPOCHS):
    train_loss = engine.train_fn(train_loader, model, optimizer, device, scheduler)
    valid_loss = engine.eval_fn(valid_loader, model, device)
    print(f"Train Loss = {train_loss} Valid Loss = {valid_loss}")

    improved = valid_loss < best_loss
    if improved:
        # Overwrite the checkpoint so only the best weights so far are kept.
        torch.save(model.state_dict(), config.MODEL_PATH)
        best_loss = valid_loss

100%|██████████| 87/87 [02:03<00:00,  1.41s/it]
100%|██████████| 39/39 [00:05<00:00,  6.85it/s]


Train Loss = 0.7043980281243379 Valid Loss = 0.24125156150414392


100%|██████████| 87/87 [02:02<00:00,  1.41s/it]
100%|██████████| 39/39 [00:05<00:00,  6.81it/s]


Train Loss = 0.2277299821719356 Valid Loss = 0.18209536392719317


100%|██████████| 87/87 [02:02<00:00,  1.41s/it]
100%|██████████| 39/39 [00:05<00:00,  6.81it/s]


Train Loss = 0.17464960546329103 Valid Loss = 0.16429527218525225


100%|██████████| 87/87 [02:02<00:00,  1.41s/it]
100%|██████████| 39/39 [00:05<00:00,  6.75it/s]


Train Loss = 0.14304018722868514 Valid Loss = 0.1602491296063631


100%|██████████| 87/87 [02:02<00:00,  1.41s/it]
100%|██████████| 39/39 [00:05<00:00,  6.82it/s]


Train Loss = 0.12165913251282155 Valid Loss = 0.15625865107927567


100%|██████████| 87/87 [02:02<00:00,  1.41s/it]
100%|██████████| 39/39 [00:05<00:00,  6.81it/s]


Train Loss = 0.10617662101298914 Valid Loss = 0.1602671176004104


100%|██████████| 87/87 [02:02<00:00,  1.41s/it]
100%|██████████| 39/39 [00:05<00:00,  6.84it/s]


Train Loss = 0.09363355594633639 Valid Loss = 0.16331016071713889


100%|██████████| 87/87 [02:02<00:00,  1.41s/it]
100%|██████████| 39/39 [00:05<00:00,  6.80it/s]


Train Loss = 0.08388454115938866 Valid Loss = 0.15757982480602387


100%|██████████| 87/87 [02:02<00:00,  1.40s/it]
100%|██████████| 39/39 [00:05<00:00,  6.84it/s]


Train Loss = 0.0756013823309164 Valid Loss = 0.15442726187981093


100%|██████████| 87/87 [02:02<00:00,  1.40s/it]
100%|██████████| 39/39 [00:05<00:00,  6.89it/s]

Train Loss = 0.07154800858477066 Valid Loss = 0.1544844373009908



