# Biomedical Relation Extraction from Scientific Literature

Baseline BERT model to extract relationships from PubMed articles.

In [1]:
import sys, torch, logging

# fix random seed
torch.manual_seed(0)

# CUDA device if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "mps")

# log level for experiment
logger = logging.getLogger("BioRE")

# code for the baseline model
sys.path.append("./baseline/src")

In [2]:
import wandb

# experiment tracking
wandb.login()

run = wandb.init(
    # Set the project where this run will be logged
    project="biomed-bert-re",
    # Track hyperparameters and run metadata
    config={
        "learning_rate": 1e-05,
        'weight_decay': 0.0001,
        'dropout_rate': 0.1,
        "architecture": "BRAN",
        "dataset": "ChemDisGene",
        "epochs": 100,
    }
)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mramonreszat[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
wandb.finish()



## Batch processing of sequences and relations

In [3]:
from module.data_loader import Dataloader
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', use_fast=True)
chemdisgene = Dataloader('./baseline/data', tokenizer, training=True, logger=logger, lowercase=True)

100%|██████████| 76942/76942 [05:17<00:00, 242.69it/s] 
100%|██████████| 1521/1521 [00:06<00:00, 227.13it/s]
100%|██████████| 1939/1939 [00:14<00:00, 138.32it/s]
100%|██████████| 523/523 [00:03<00:00, 132.50it/s]
100%|██████████| 523/523 [00:03<00:00, 155.41it/s]


In [None]:
chemdisgene.val

In [4]:
chemdisgene.val[5].keys()

dict_keys(['input', 'pad', 'docid', 'input_length', 'label_vectors', 'label_names', 'e1_indicators', 'e2_indicators', 'e1s', 'e2s', 'e1_types', 'e2_types'])

In [11]:
valid_loader.val[5]['label_vectors']

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])]

In [None]:
train_loader.val[5]['label_vectors']

In [8]:
len(valid_loader.val)

1480

## Constructing a baseline BERT model

In [4]:
from torchinfo import summary
from module.model import Model

config = {'data_path': './baseline/data', 'learning_rate': 1e-05, 'mode': 'train', 'encoder_type': 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
          'model': 'biaffine', 'output_path': '', 'load_path': '', 'multi_label': True, 'grad_accumulation_steps': 16, 'max_text_length': 512, 
          'dim': 128, 'weight_decay': 0.0001, 'dropout_rate': 0.1, 'max_grad_norm': 10.0, 'epochs': 10, 'patience': 5, 'log_interval': 0.25, 
          'warmup': -1.0, 'cuda': True}

model = Model(config)

summary(model, input_size=[(2, 512), (2, 512)], dtypes=['torch.IntTensor', 'torch.IntTensor'], device="cpu")

Orthogonal pretrainer loss: 1.68e-10


Layer (type:depth-idx)                                  Output Shape              Param #
Model                                                   [2, 1, 512, 512, 15]      245,760
├─BertModel: 1-1                                        [2, 768]                  --
│    └─BertEmbeddings: 2-1                              [2, 512, 768]             --
│    │    └─Embedding: 3-1                              [2, 512, 768]             23,440,896
│    │    └─Embedding: 3-2                              [2, 512, 768]             1,536
│    │    └─Embedding: 3-3                              [1, 512, 768]             393,216
│    │    └─LayerNorm: 3-4                              [2, 512, 768]             1,536
│    │    └─Dropout: 3-5                                [2, 512, 768]             --
│    └─BertEncoder: 2-2                                 [2, 512, 768]             --
│    │    └─ModuleList: 3-6                             --                        85,054,464
│    └─BertPooler: 2-3      

In [None]:
#
pubmedbert = AutoModelForMaskedLM.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

## Training one epoch on biochemical relations

Preload training data to send them to GPU

In [5]:
# Adam with integrated weight decay regularization
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-05,
                  weight_decay=0.0001, eps=1e-8)

# y is 1 or 0, x is 1-d logit
criterion = torch.nn.BCEWithLogitsLoss() 

In [6]:
model.encoder.to(device)

model.head_layer0.to(device)
model.head_layer1.to(device)
model.tail_layer0.to(device)
model.tail_layer1.to(device)

model.biaffine_mat = torch.nn.Parameter(model.biaffine_mat.to(device))

In [7]:
import psutil

train_dataset = []
for batch_num, return_data in enumerate(chemdisgene):

    # Get the virtual memory status
    memory_info = psutil.virtual_memory()

    # Convert used memory from bytes to GB
    used_memory_gb = memory_info.used / (1024 ** 3)

    train_dataset.append(return_data[1])

    if used_memory_gb>=24: # Break if more than 24 GB is collected
        break

    if batch_num>=10000: # Break if more than 1000 batches are collected
        break

In [8]:
def model_forward(input_ids, attention_mask, ep_masks):
    pairwise_scores = model(input_ids, attention_mask)
    ep_masks = ep_masks.unsqueeze(4)
    pairwise_scores = pairwise_scores + ep_masks
    pairwise_scores = torch.logsumexp(pairwise_scores, dim=[2,3])
    outputs = pairwise_scores[:, :, :-1]
    return outputs

In [9]:
from tqdm import tqdm

In [10]:
for epoch in range(wandb.config.epochs):
    model.train()
    train_loss = 0.0
    sample_loss = {}

    for batch, return_data in tqdm(enumerate(train_dataset)):
        (input_ids, attention_mask, ep_masks, e1_indicators, e2_indicators, label_arrays) = return_data

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        ep_masks = ep_masks.to(device)
        labels = label_arrays.to(device)

        optimizer.zero_grad()

        outputs = model_forward(input_ids, attention_mask, ep_masks)

        loss = criterion(outputs, labels)
        train_loss += loss.item()

        loss.backward() 

        optimizer.step()
        wandb.log({"batch": batch, "batch_loss": loss.item()})
    
    train_loss /= len(chemdisgene.train)
    wandb.log({"epoch": epoch, "loss": train_loss})

1638it [02:27, 11.07it/s]
1638it [02:32, 10.74it/s]
1638it [02:37, 10.43it/s]
1638it [02:37, 10.43it/s]
1638it [02:38, 10.35it/s]
1638it [02:39, 10.25it/s]
1638it [02:41, 10.17it/s]
1638it [02:41, 10.17it/s]
1638it [02:40, 10.20it/s]
1638it [02:40, 10.21it/s]
1638it [02:40, 10.20it/s]
1638it [02:40, 10.22it/s]
1638it [02:40, 10.20it/s]
1638it [02:40, 10.19it/s]
1638it [02:40, 10.20it/s]
1638it [02:41, 10.14it/s]
1638it [02:41, 10.17it/s]
1638it [02:41, 10.14it/s]
1638it [02:41, 10.14it/s]
1638it [02:40, 10.22it/s]
1638it [02:41, 10.16it/s]
1638it [02:41, 10.14it/s]
1638it [02:42, 10.09it/s]
1638it [02:42, 10.10it/s]
1638it [02:41, 10.14it/s]
1638it [02:41, 10.12it/s]
1638it [02:42, 10.09it/s]
1638it [02:42, 10.08it/s]
1638it [02:41, 10.14it/s]
1638it [02:41, 10.14it/s]
1638it [02:41, 10.14it/s]
1638it [02:41, 10.15it/s]
1638it [02:41, 10.13it/s]
1638it [02:41, 10.14it/s]
1638it [02:41, 10.12it/s]
1638it [02:41, 10.11it/s]
1638it [02:40, 10.21it/s]
1638it [02:40, 10.19it/s]
1638it [02:4

In [11]:
wandb.finish()



0,1
batch,▁█▁█▁█▁█▁█▁█▁█▁█▁█▁█▁█▁█▁█▁█▁█▁█▁█▁█▁█▁█
batch_loss,▅▆▂▁▂▁▁▇▆▁█▄▂▆▃▁▇▂█▄▂▁▅▃▅▁▂▃▄▂▂▂▇▂▂▁▂▂▁▂
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
batch,1637.0
batch_loss,0.00664
epoch,99.0
loss,0.00124


In [10]:
import tqdm as tqdm

In [13]:
for batch_num, return_data in tqdm(enumerate(valid_loader)):
        (input_array, attention_mask, ep_masks, e1_indicators, e2_indicators, label_arrays) = return_data[1]

    

0it [00:00, ?it/s]


IndexError: list index out of range

In [28]:
training_loss[1560*N]

2.550358533859253

In [74]:
N=1039
sample = [training_loss[1560*epoch+N] for epoch in range(epochs)]

In [75]:
sample

[0.07935695350170135,
 0.07440420985221863,
 0.07168351113796234,
 0.0771072506904602,
 0.07789056748151779,
 0.06667759269475937,
 0.06892801821231842,
 0.06737707555294037,
 0.06749895960092545,
 0.06451743841171265]