### Relation classifier:
Performs relation classification: given an entity and a role, the model predicts whether the role is related to the entity.



*   The special tokens ['[SPAN1_START]', '[SPAN1_END]', '[SPAN2_START]', '[SPAN2_END]'] are added to the tokenizer and inserted into the text.
*   These special tokens are inserted before and after the span positions of the entity and the role: Span1 marks the entity and Span2 marks the role.
* Labels are generated for the entity–role pairs: pairs listed in the relation labels of the dataset are mapped to "relation" (1) and all other pairs to "no relation" (0), according to the relation mapping.
* Utilized BertForSequenceClassification for model training.
* Adjusted the tokenizer and the model to accommodate the special tokens.
###   **Model**
*   ***Tokenizer:*** BertTokenizerFast
*   ***pre-trained Bert model:*** 'emilyalsentzer/Bio_ClinicalBERT'
### ***Hyperparameters:***
* batch_size=16
* eps=1e-8
* learning_rate=5e-5
* weight_decay=0
* num_train_epochs=10
* patience=3

*   Model stored at project_directory+'models/final_models/Indepent_relation_classifier_v'+str(ver)+'/'
### **Inputs:**
Datasets and their paths:
*   train_data_set path: project_directory+'data/trainset.json'


*   test_data_set path: project_directory+'data/testset.json'
### **Metrics:**

           precision    recall  f1-score   support

           0       0.98      0.99      0.99      1938
           1       0.96      0.94      0.95       539

    accuracy                           0.98      2477
   macro avg       0.97      0.97      0.97      2477
weighted avg       0.98      0.98      0.98      2477


## Data Loading and hyperparameters Initialization

### Parameters Initialization

In [None]:
import pandas as pd
from transformers import BertTokenizerFast, BertModel, AdamW, get_linear_schedule_with_warmup, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm, trange
import json

In [None]:
# Root folder of the project. It already ends with '/', so sub-paths are
# appended without a leading slash (same convention as the dataset paths).
project_directory='/content/drive/MyDrive/PHD_assessment_gmu/'
# Folder for model artefacts; built without a leading slash to avoid '//models/'.
save_model_path=project_directory+'models/'

In [None]:
# Version tag appended to the directory name the trained model is saved under.
ver=6
device= torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Single source of truth for the pre-trained checkpoint: the tokenizer and the
# classification model must be loaded from the same name.
bert_model_name='emilyalsentzer/Bio_ClinicalBERT'
tokenizer = BertTokenizerFast.from_pretrained(bert_model_name)
# Training hyperparameters.
batch_size=16
eps=1e-8
learning_rate=5e-5
weight_decay=0
num_train_epochs=10
# Number of consecutive epochs without validation-loss improvement tolerated
# before training stops early.
patience=3
# Entity / role categories to discard (not referenced in the cells visible here).
discarded_enities=['EnvironmentalExposure','SexualHistory','InfectiousDiseases','PhysicalActivity']
discarded_roles=['LivingStatus','Other','MedicalCondition','Extent','History']
# Length every tokenized example is padded / truncated to.
max_len=512
min_label_size=10

# Dataset locations (project_directory already ends with '/').
raw_dataset_path=project_directory+'data/'+'SocialHistoryMTSamples.json'
train_dataset_path=project_directory+'data/'+'trainset.json'
test_dataset_path=project_directory+'data/'+'testset.json'

In [None]:
def _bio_id_to_label(names):
    """Return {0: 'O', 1: 'B-<n0>', 2: 'I-<n0>', 3: 'B-<n1>', ...} for the given names."""
    mapping = {0: 'O'}
    for name in names:
        mapping[len(mapping)] = 'B-' + name
        mapping[len(mapping)] = 'I-' + name
    return mapping


def _invert(mapping):
    """Return the label -> id dictionary for an id -> label dictionary."""
    return dict(zip(mapping.values(), mapping.keys()))


# id -> label tables: BIO tag sets for status, method, roles and entities,
# plus the binary relation label used by the relation classifier.
id_label_status = _bio_id_to_label(['Status'])
id_label_method = _bio_id_to_label(['Method'])
id_label_role = _bio_id_to_label(
    ['Type', 'Amount', 'Temporal', 'Frequency', 'QuitHistory', 'ExposureHistory', 'Location']
)
id_label_ent = _bio_id_to_label(
    ['Tobacco', 'Alcohol', 'Family', 'Drug', 'Occupation', 'MaritalStatus', 'LivingSituation', 'Residence']
)
id_label_event = dict(enumerate(['Not Present', 'Present']))

# label -> id tables (inverse of the tables above).
label_id_status = _invert(id_label_status)
label_id_method = _invert(id_label_method)
label_id_role = _invert(id_label_role)
label_id_ent = _invert(id_label_ent)
label_id_event = _invert(id_label_event)

In [None]:
# Span-marker strings that are later inserted around the entity (SPAN1) and
# role (SPAN2) spans; they are registered with the tokenizer as additional special tokens.
special_tokens_dict = {'additional_special_tokens': ['[SPAN1_START]', '[SPAN1_END]', '[SPAN2_START]', '[SPAN2_END]']}
# Holds the return value of add_special_tokens (presumably the number of tokens newly added — confirm in the transformers docs).
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)


In [None]:
# Display the tokenizer to check that the span markers appear under additional_special_tokens.
tokenizer

BertTokenizerFast(name_or_path='emilyalsentzer/Bio_ClinicalBERT', vocab_size=28996, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[SPAN1_END]', '[SPAN2_END]', '[SPAN2_START]', '[SPAN1_START]']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special

### Generate Relation labels

In [None]:
def insert_span_markers(text, spans):
    '''
    Return `text` with [SPANk_START] / [SPANk_END] markers inserted around each span.

    Parameters:
        text:  the original string.
        spans: iterable of (start, end, category) triples. `start` and `end` are
               character offsets into `text` (ints or numeric strings); the k-th
               triple (1-based) is wrapped in [SPANk_START] ... [SPANk_END].
               `category` is accepted for the callers' convenience but does not
               influence the marker names.

    Returns:
        The marked string. Markers inserted inside the text are surrounded by a
        single space on both sides; markers at the very end of the text get a
        leading space only.

    Raises:
        ValueError: if an offset lies outside 0..len(text).
    '''
    text_len = len(text)
    # Markers to emit at each offset. END markers of spans that started earlier
    # are kept apart from START markers so that, when one span ends exactly where
    # another begins, the END is written first regardless of the order of `spans`.
    closing = {}
    opening = {}
    for number, (start, end, _category) in enumerate(spans, start=1):
        start = int(start)
        end = int(end)
        if not (0 <= start <= text_len and 0 <= end <= text_len):
            raise ValueError(
                'span ({}, {}) is outside the text (length {})'.format(start, end, text_len)
            )
        start_marker = '[SPAN' + str(number) + '_START]'
        end_marker = '[SPAN' + str(number) + '_END]'
        opening.setdefault(start, []).append(start_marker)
        if end == start:
            # Zero-length span: keep its START immediately followed by its END.
            opening[start].append(end_marker)
        else:
            closing.setdefault(end, []).append(end_marker)

    # Stitch the output from slices of the text between marker offsets.
    pieces = []
    cursor = 0
    for offset in sorted(set(closing) | set(opening)):
        pieces.append(text[cursor:offset])
        markers = ' '.join(closing.get(offset, []) + opening.get(offset, []))
        if offset < text_len:
            pieces.append(' ' + markers + ' ')
        else:
            # Markers after the last character carry no trailing space.
            pieces.append(' ' + markers)
        cursor = offset
    pieces.append(text[cursor:])
    return ''.join(pieces)
# generate marked sentences for entity/role pairs, labelled relation present / not present
def generate_position_pairs(data):
    '''
    Build the relation-classification examples for one record.

    For every event in data['events_list'], the event's entity span is paired with
    every role span of the record. The sentence is rewritten with SPAN1_START /
    SPAN1_END around the entity span and SPAN2_START / SPAN2_END around the role
    span (via insert_span_markers).

    Parameters:
        data: dict with keys
              'text'        - the sentence,
              'entity_list' - items with 'entity_id', 'entity_strt_pos', 'entity_end_pos',
              'role_list'   - items with 'role_id', 'entity_strt_pos', 'entity_end_pos',
              'events_list' - items with 'entity_id' and 'Related_roles' (role ids).

    Returns:
        List of {'Text': marked sentence, 'Relation': label id}, where the label id is
        label_id_event['Present'] for roles listed in the event's 'Related_roles' and
        label_id_event['Not Present'] for every other role of the record.
    '''
    entity_dict = {e['entity_id']: (e['entity_strt_pos'], e['entity_end_pos']) for e in data['entity_list']}
    role_dict = {r['role_id']: (r['entity_strt_pos'], r['entity_end_pos']) for r in data['role_list']}
    sentence = data['text']
    text_special_list = []

    for event in data['events_list']:
        entity_span = entity_dict.get(event['entity_id'])
        if entity_span is None:
            # The event refers to an entity missing from entity_list: there is no
            # span to mark, so skip it (unknown role ids are skipped the same way).
            continue
        entity_pos = [*entity_span, 'entity']
        related_ids = set(event['Related_roles'])

        # Positive examples, in the order the roles are listed on the event.
        for role_id in event['Related_roles']:
            if role_id not in role_dict:
                continue
            role_pos = [*role_dict[role_id], 'role']
            marked_sentence = insert_span_markers(sentence, [entity_pos, role_pos])
            text_special_list.append({'Text': marked_sentence, 'Relation': label_id_event['Present']})

        # Negative examples: walk role_dict in insertion order (rather than a set
        # difference) so the generated dataset is identical from run to run.
        for role_id, role_span in role_dict.items():
            if role_id in related_ids:
                continue
            role_pos = [*role_span, 'role']
            marked_sentence = insert_span_markers(sentence, [entity_pos, role_pos])
            text_special_list.append({'Text': marked_sentence, 'Relation': label_id_event['Not Present']})

    return text_special_list


In [None]:
'''
old method
# generate pairs of entity positions and role positions with label relation present and not present
def generate_position_pairs(data):

    #Reads raw processed data and creates relations pairs, formulates input text and labels.
    #Adds special tokens SPAN1_START,SPAN1_END (before and after the entity span) SPAN2_START,SPAN2_END (before and after the role span)

    entity_dict = {e['entity_id']: (e['entity_strt_pos'], e['entity_end_pos']) for e in data['entity_list']}
    role_dict = {r['role_id']: (r['entity_strt_pos'], r['entity_end_pos']) for r in data['role_list']}
    sentence=data['text']
    text_special_list = []
    pairs = []
    for event in data['events_list']:
        entity_pos = entity_dict.get(event['entity_id'], None)

        related_role_positions = [role_dict[rid] for rid in event['Related_roles'] if rid in role_dict]

        for role_pos in related_role_positions:
            marked_sentence = f"{sentence[:int(entity_pos[0])]} [SPAN1_START] {sentence[int(entity_pos[0]):int(entity_pos[1])]} [SPAN1_END] {sentence[int(entity_pos[1]):int(role_pos[0])]} [SPAN2_START] {sentence[int(role_pos[0]):int(role_pos[1])]} [SPAN2_END] {sentence[int(role_pos[1]):]}"
            text_special_list.append({'Text': marked_sentence, 'Relation': label_id_event['Present']})

            pairs.append({'Entity_Position': entity_pos, 'Role_Position': role_pos, 'Relation': 'Present'})

        not_related_roles = set(role_dict.keys()) - set(event['Related_roles'])
        for role_id in not_related_roles:
            role_pos = role_dict[role_id]
            marked_sentence = f"{sentence[:int(entity_pos[0])]} [SPAN1_START] {sentence[int(entity_pos[0]):int(entity_pos[1])]} [SPAN1_END] {sentence[int(entity_pos[1]):int(role_pos[0])]} [SPAN2_START] {sentence[int(role_pos[0]):int(role_pos[1])]} [SPAN2_END] {sentence[int(role_pos[1]):]}"
            text_special_list.append({'Text': marked_sentence, 'Relation': label_id_event['Not Present'] })
            pairs.append({'Entity_Position': entity_pos, 'Role_Position': role_pos, 'Relation': 'Not Present'})

    return text_special_list
'''


'\nold method\n# generate pairs of entity positions and role positions with label relation present and not present\ndef generate_position_pairs(data):\n    \n    #Reads raw processed data and creates relations pairs, formulates input text and labels.\n    #Adds special tokens SPAN1_START,SPAN1_END (before and after the entity span) SPAN2_START,SPAN2_END (before and after the role span)\n    \n    entity_dict = {e[\'entity_id\']: (e[\'entity_strt_pos\'], e[\'entity_end_pos\']) for e in data[\'entity_list\']}\n    role_dict = {r[\'role_id\']: (r[\'entity_strt_pos\'], r[\'entity_end_pos\']) for r in data[\'role_list\']}\n    sentence=data[\'text\']\n    text_special_list = []\n    pairs = []\n    for event in data[\'events_list\']:\n        entity_pos = entity_dict.get(event[\'entity_id\'], None)\n        \n        related_role_positions = [role_dict[rid] for rid in event[\'Related_roles\'] if rid in role_dict]\n\n        for role_pos in related_role_positions:\n            marked_sentenc

In [None]:
# Flat lists of {'Text', 'Relation'} examples produced by generate_position_pairs.
trainset=[]
testset=[]
# Decode explicitly as UTF-8 (the JSON default) instead of the platform locale,
# and close each file before the pair generation runs.
with open(train_dataset_path,'r',encoding='utf-8') as f:
  train_data=json.load(f)
for data in train_data:
  trainset.extend(generate_position_pairs(data))
with open(test_dataset_path,'r',encoding='utf-8') as f:
  test_data=json.load(f)
for data in test_data:
  testset.extend(generate_position_pairs(data))

### Dataset and data loader

In [None]:
class RelationDataset(Dataset):
    '''
    Dataset of marked sentences and their relation labels.

    Parameters:
        data:      list of dicts with keys 'Text' (sentence with span markers)
                   and 'Relation' (label id).
        tokenizer: callable used to encode a sentence; it is called with
                   add_special_tokens, padding, max_length and truncation keywords.
        max_len:   length every example is padded / truncated to.
    '''

    def __init__(self, data, tokenizer, max_len):
        self.sentences = [item['Text'] for item in data]
        self.labels = [item['Relation'] for item in data]
        # Keep the constructor arguments on the instance so that __getitem__ does
        # not silently depend on module-level globals of the same name.
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        '''Return the encoding of example `idx` with its label stored under 'labels'.'''
        # NOTE: truncation=True cuts the sentence at max_len tokens; span markers
        # located beyond that point would be dropped from the encoding.
        inputs = self.tokenizer(
            self.sentences[idx],
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
        )
        inputs['labels'] = self.labels[idx]
        return inputs

In [None]:
# Wrap the generated train / test examples in RelationDataset objects.
train_dataset = RelationDataset(trainset,tokenizer,max_len)
test_dataset =  RelationDataset(testset,tokenizer,max_len)

In [None]:
from transformers import DataCollatorWithPadding

# Collator used by the DataLoaders to assemble per-example encodings into batches.
# NOTE(review): RelationDataset already pads every example with padding='max_length',
# so the collator presumably has nothing left to pad — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Training batches are shuffled; the test loader does not set shuffle.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size,collate_fn=data_collator)

## Model

### Training args and Model Initialization

In [None]:
# Binary sequence classifier (relation present / not present) on top of the pre-trained encoder.
model = BertForSequenceClassification.from_pretrained(bert_model_name,num_labels=2)
# Grow the embedding matrix to cover the span-marker tokens added to the tokenizer.
# This must happen before any optimizer is built from model.parameters():
# resizing replaces the embedding parameter, and an optimizer created earlier
# would keep updating the old, detached tensor.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Pass the declared eps / weight_decay hyperparameters explicitly; otherwise the
# optimizer silently falls back to its own defaults and the documented settings
# are not the ones actually used.
# NOTE: the optimizer captures the parameters that exist at this point, so the model's
# token embeddings must already have been resized before this cell runs.
optimizer=AdamW(model.parameters(),lr=learning_rate,eps=eps,weight_decay=weight_decay)
# Linear decay of the learning rate over all training steps, without warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*num_train_epochs)



In [None]:
# Bundle of the training settings and objects defined in the cells above.
training_args = dict(
    output_dir=save_model_path,
    num_train_epochs=num_train_epochs,
    optimizer=optimizer,
    scheduler=scheduler,
    patience=patience,
    run_name='Relation_model_v1',
)

In [None]:
# Resize the model's token embeddings to the tokenizer's vocabulary size so the
# added span-marker tokens have embedding rows.
# NOTE(review): resizing presumably replaces the embedding parameter; an optimizer
# built before this call would then not update the new one. Make sure this runs
# before the optimizer is created (or is a no-op because the size already matches).
model.resize_token_embeddings(len(tokenizer))

Embedding(29000, 768)

In [None]:
# Sanity check: pull one training batch, move it to the device and print the
# shapes of its token ids and labels.
for batch in train_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    for field in ('input_ids', 'labels'):
        print(batch[field].shape)
    break

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([16, 512])
torch.Size([16])


### Training and evaluation

In [None]:

from tqdm.auto import tqdm
from sklearn.metrics import classification_report

# Fine-tunes `model` for up to num_train_epochs epochs with early stopping on the
# summed evaluation loss, then restores the weights of the best epoch.
# NOTE(review): the evaluation loss that drives early stopping is computed on
# test_dataloader, so the reported test metrics are not from held-out data;
# consider a separate validation split.
progress_bar = tqdm(range(len(train_dataloader)*num_train_epochs))
early_stopping_cnt=0
best_val_loss=float('inf')
best_state_dict=None
val_loss_list=[]
for epoch in range(num_train_epochs):
  # ---- training pass ----
  loss_epoch=0.0
  model.train()
  for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    # Accumulate a plain float: adding the tensor itself would keep every
    # batch attached to the autograd graph for the whole epoch.
    loss_epoch+=loss.item()
    loss.backward()

    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)
  print("Epoch: {}".format(epoch))
  print("training loss {}".format(loss_epoch))


  # ---- evaluation pass ----
  model.eval()
  loss_epoch_val=0.0
  val_predictions=[]
  true_labels=[]
  for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    val_loss=outputs.loss.item()
    loss_epoch_val+=val_loss
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    val_predictions.extend(predictions.tolist())
    true_labels.extend(batch['labels'].tolist())
  print("Validation loss {}".format(loss_epoch_val))
  val_loss_list.append(loss_epoch_val)
  # Report before the early-stopping check so the last epoch is reported too.
  print(classification_report(true_labels, val_predictions))

  if loss_epoch_val<best_val_loss:
    best_val_loss=loss_epoch_val
    early_stopping_cnt=0
    # Snapshot the best weights on the CPU so they can be restored after training.
    best_state_dict={k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
  else:
    early_stopping_cnt+=1
  if early_stopping_cnt>=patience:
    break

# Without this, the model left in memory (and saved afterwards) would be the one
# from the last epoch run, i.e. `patience` epochs past the best validation loss.
if best_state_dict is not None:
  model.load_state_dict(best_state_dict)

  0%|          | 0/6470 [00:00<?, ?it/s]

Epoch: {} 0
training loss 173.04141235351562
Validation loss 17.390546798706055
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      1938
           1       0.88      0.96      0.92       539

    accuracy                           0.96      2477
   macro avg       0.94      0.96      0.95      2477
weighted avg       0.97      0.96      0.96      2477

Epoch: {} 1
training loss 47.502140045166016
Validation loss 9.817044258117676
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1938
           1       0.96      0.94      0.95       539

    accuracy                           0.98      2477
   macro avg       0.97      0.97      0.97      2477
weighted avg       0.98      0.98      0.98      2477

Epoch: {} 2
training loss 25.79714584350586
Validation loss 15.262432098388672
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1938

In [None]:
# Versioned output directory for the trained relation classifier.
sv_pth = f"{project_directory}models/final_models/Indepent_relation_classifier_v{ver}/"

In [None]:
import os
# Create the output directory (and any missing parents). exist_ok=True replaces
# the separate existence check, which could race with another process.
os.makedirs(sv_pth, exist_ok=True)

In [None]:
# Save the model currently held in memory to the versioned output directory.
model.save_pretrained(sv_pth)

In [None]:
# Save the tokenizer (including the added span-marker tokens) next to the model.
tokenizer.save_pretrained(sv_pth)

('/content/drive/MyDrive/PHD_assessment_gmu/models/final_models/Indepent_relation_classifier_v6/tokenizer_config.json',
 '/content/drive/MyDrive/PHD_assessment_gmu/models/final_models/Indepent_relation_classifier_v6/special_tokens_map.json',
 '/content/drive/MyDrive/PHD_assessment_gmu/models/final_models/Indepent_relation_classifier_v6/vocab.txt',
 '/content/drive/MyDrive/PHD_assessment_gmu/models/final_models/Indepent_relation_classifier_v6/added_tokens.json',
 '/content/drive/MyDrive/PHD_assessment_gmu/models/final_models/Indepent_relation_classifier_v6/tokenizer.json')

In [None]:
'''
from tqdm.auto import tqdm
from sklearn.metrics import classification_report

progress_bar = tqdm(range(len(train_dataloader)*num_train_epochs))
early_stopping_cnt=0
best_val_loss=0
val_loss_list=[]
for epoch in range(num_train_epochs):
  loss_epoch=0
  model.train()
  for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss_epoch+=loss
    loss.backward()

    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)
  print("training loss {}".format(loss_epoch.item()))


  model.eval()
  loss_epoch_val=0
  val_predictions=[]
  true_labels=[]
  for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        val_loss=outputs.loss
        loss_epoch_val+=val_loss
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    val_predictions.extend(predictions.tolist())
    true_labels.extend(batch['labels'].tolist())
  print("Validation loss {}".format(loss_epoch_val.item()))
  val_loss_list.append(loss_epoch_val.item())
  if epoch == 0:
    best_val_loss=loss_epoch_val.item()
    early_stopping_cnt=0
  else:
    if loss_epoch_val.item()<best_val_loss:
      best_val_loss=loss_epoch_val.item()
      early_stopping_cnt=0
    else:
      early_stopping_cnt+=1
  if early_stopping_cnt>=patience:
    break

  print(classification_report(true_labels, val_predictions))
  '''

'\nfrom tqdm.auto import tqdm\nfrom sklearn.metrics import classification_report\n\nprogress_bar = tqdm(range(len(train_dataloader)*num_train_epochs))\nearly_stopping_cnt=0\nbest_val_loss=0\nval_loss_list=[]\nfor epoch in range(num_train_epochs):\n  loss_epoch=0\n  model.train()\n  for batch in train_dataloader:\n    batch = {k: v.to(device) for k, v in batch.items()}\n    outputs = model(**batch)\n    loss = outputs.loss\n    loss_epoch+=loss\n    loss.backward()\n\n    optimizer.step()\n    scheduler.step()\n    optimizer.zero_grad()\n    progress_bar.update(1)\n  print("training loss {}".format(loss_epoch.item()))\n\n\n  model.eval()\n  loss_epoch_val=0\n  val_predictions=[]\n  true_labels=[]\n  for batch in test_dataloader:\n    batch = {k: v.to(device) for k, v in batch.items()}\n    with torch.no_grad():\n        outputs = model(**batch)\n        val_loss=outputs.loss\n        loss_epoch_val+=val_loss\n    logits = outputs.logits\n    predictions = torch.argmax(logits, dim=-1)\n 

In [None]:
'''
class RelationClassifier(nn.Module):
    def __init__(self, model_name, tokenizer,num_labels):
        super(RelationClassifier, self).__init__()
        self.tokenizer = tokenizer

        self.bert = BertModel.from_pretrained(model_name)
        self.bert.resize_token_embeddings(len(self.tokenizer))
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        logits = self.classifier(pooled_output)
        return logits
'''

'\nclass RelationClassifier(nn.Module):\n    def __init__(self, model_name, tokenizer,num_labels):\n        super(RelationClassifier, self).__init__()\n        self.tokenizer = tokenizer\n\n        self.bert = BertModel.from_pretrained(model_name)\n        self.bert.resize_token_embeddings(len(self.tokenizer))\n        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)\n\n    def forward(self, input_ids, attention_mask=None):\n        outputs = self.bert(input_ids, attention_mask=attention_mask)\n        pooled_output = outputs[1]\n        logits = self.classifier(pooled_output)\n        return logits\n'