In [None]:
#import evaluate


In [None]:
import json
import pandas as pd
from transformers import BertTokenizerFast, BertModel, AdamW, get_linear_schedule_with_warmup, DataCollatorForTokenClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm, trange
import itertools

In [None]:
#metric=evaluate.load("seqeval")
device= torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model_name='emilyalsentzer/Bio_ClinicalBERT'
tokenizer = BertTokenizerFast.from_pretrained(bert_model_name)
num_freeze_layers=6

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

In [None]:
eps=1e-8
learning_rate=7e-5
weight_decay=0.01
num_train_epochs=15
patience=5
batch_size=16
max_len=512


In [None]:
discarded_enities=['EnvironmentalExposure','SexualHistory','InfectiousDiseases','PhysicalActivity','Residence','LivingSituation','MaritalStatus','Occupation']
discarded_roles=['LivingStatus','Other','MedicalCondition','Extent','History']
save_model_path='/content/drive/MyDrive/PHD_assessment_gmu/models/'
project_directory='/content/drive/MyDrive/PHD_assessment_gmu/'
raw_dataset_path=project_directory+'data/'+'SocialHistoryMTSamples.json'
train_dataset_path=project_directory+'data/'+'train_dataset.pth'
test_dataset_path=project_directory+'data/'+'test_dataset.pth'


In [None]:
id_label_status={0:'O',1:'B-Status',2:'I-Status'}
id_label_method={0:'O',1:'B-Method',2:'I-Method'}
id_label_role={0:'O',1:'B-Type',2:'I-Type',3:'B-Amount',4:'I-Amount',5:'B-Temporal',6:'I-Temporal',7:'B-Frequency',8:'I-Frequency',9:'B-QuitHistory',10:'I-QuitHistory',11:'B-ExposureHistory',12:'I-ExposureHistory',13:'B-Location',14:'I-Location'}
id_label_event={0:'No Relation',1:'Relation'}

label_id_status = {v: k for k, v in id_label_status.items()}
label_id_method = {v: k for k, v in id_label_method.items()}
label_id_role = {v: k for k, v in id_label_role.items()}
label_id_ent = {'B-Alcohol':1,
 'B-Drug':3,
 'B-Family':5,
 'B-Tobacco':7,
 'I-Alcohol':2,
 'I-Drug':4,
 'I-Family':6,
 'I-Tobacco':8,
 'O':0}
id_label_ent = {v: k for k, v in label_id_ent.items()}
label_id_event = {v: k for k, v in id_label_event.items()}


### Label generation

In [None]:

class GenerateLabel:

  @staticmethod
  def generate_enity_labels(entity_list, token_len, token_offsets):
    # Initialize a list to store the BIO labels for each token

    entity_labels = [label_id_ent['O']] * token_len
    for entity in entity_list:
        category = entity['entity_category']
        if category not in discarded_enities:
          entity_start_pos = int(entity['entity_strt_pos'])
          entity_end_pos = int(entity['entity_end_pos'])-1

          # Find tokens that correspond to the entity's position
          entity_start_token = None
          entity_end_token = None

          for i, (start_offset, end_offset) in enumerate(token_offsets):
              if entity_start_token is None and start_offset >= entity_start_pos:
                  entity_start_token = i
              if end_offset > entity_end_pos:
                  entity_end_token = i
                  break

          # Assign BIO labels to the tokens
          if entity_start_token is not None:
            entity_labels[entity_start_token] = label_id_ent['B-' + category]
            if entity_end_token is not None:
              entity_labels[entity_start_token + 1:entity_end_token + 1] = [label_id_ent['I-' + category]] * (entity_end_token - entity_start_token)

    return entity_labels

  @staticmethod
  def generate_role_labels(role_list, token_len, token_offsets):
    # Initialize a list to store the BIO labels for each token
    role_labels = [label_id_role['O']] * token_len
    status_labels = [label_id_status['O']] * token_len
    method_labels = [label_id_method['O']] * token_len

    for role in role_list:
        category = role['entity_category']
        entity_start_pos = int(role['entity_strt_pos'])
        entity_end_pos = int(role['entity_end_pos'])-1
        if category in discarded_roles:
          continue
        # Find tokens that correspond to the entity's position
        entity_start_token = None
        entity_end_token = None

        for i, (start_offset, end_offset) in enumerate(token_offsets):
            if entity_start_token is None and start_offset >= entity_start_pos:
                entity_start_token = i
            if end_offset > entity_end_pos:
                entity_end_token = i
                break

        # Assign BIO labels to the tokens
        if category == 'Status':
          if entity_start_token is not None:
            status_labels[entity_start_token] = label_id_status['B-' + category]
            if entity_end_token is not None:
                status_labels[entity_start_token + 1:entity_end_token + 1] = [label_id_status['I-' + category]] * (entity_end_token - entity_start_token)

        elif category == 'Method':
          if entity_start_token is not None:
            method_labels[entity_start_token] = label_id_method['B-' + category]
            if entity_end_token is not None:
                method_labels[entity_start_token + 1:entity_end_token + 1] = [label_id_method['I-' + category]] * (entity_end_token - entity_start_token)

        else:
          if entity_start_token is not None:
            role_labels[entity_start_token] = label_id_role['B-' + category]
            if entity_end_token is not None:
                role_labels[entity_start_token + 1:entity_end_token + 1] = [label_id_role['I-' + category]] * (entity_end_token - entity_start_token)


    return role_labels, status_labels, method_labels

  @staticmethod
  def generate_relation_labels( token_offsets, data):
    entity_token_indices = {}
    role_token_indices = {}

    entity_list = data.get('entity_list', [])
    role_list = data.get('role_list', [])
    events_list = data.get('events_list', [])

    # Create a dictionary to map entity IDs to their token indices
    for entity in entity_list:
        entity_id = entity['entity_id']
        entity_start_pos = int(entity['entity_strt_pos'])
        entity_end_pos = int(entity['entity_end_pos'])-1
        if entity['entity_category'] in discarded_enities:
          continue
        # Find tokens that correspond to the entity's position
        entity_start_token = None
        entity_end_token = None

        for i, (start_offset, end_offset) in enumerate(token_offsets):
            if entity_start_token is None and start_offset >= entity_start_pos:
                entity_start_token = i
            if end_offset > entity_end_pos:
                entity_end_token = i
                break

        if entity_start_token is not None:
            entity_token_indices[entity_id] = (entity_start_token, entity_end_token,entity['entity_category'])

    # Create a dictionary to map event-related role IDs to their token indices
    for event in events_list:
        entity_id = event['entity_id']
        related_roles = event['Related_roles']

        entity_indices = entity_token_indices.get(entity_id, None)

        if entity_indices:
            role_indices = []

            for role_id in related_roles:
                role = next((role for role in role_list if role['role_id'] == role_id), None)

                if role:
                    role_start_pos = int(role['entity_strt_pos'])
                    role_end_pos = int(role['entity_end_pos'])-1
                    if role['entity_category'] in discarded_roles:
                      continue

                    role_start_token = None
                    role_end_token = None

                    for i, (start_offset, end_offset) in enumerate(token_offsets):
                        if role_start_token is None and start_offset >= role_start_pos:
                            role_start_token = i
                        if end_offset > role_end_pos:
                            role_end_token = i
                            break

                    if role_start_token is not None:
                        role_indices.append((role_start_token, role_end_token,role['entity_category']))

            if role_indices:
                role_token_indices[(entity_indices[0],entity_indices[1],entity_indices[2])] = role_indices

    return role_token_indices

### Dataset Class

In [None]:
class ERDataset(Dataset):
  def __init__(self, data, tokenizer, max_len):
    self.data = data
    self.tokenizer = tokenizer
    self.max_len = max_len
  def __len__(self):
    return len(self.data)
  def __getitem__(self, idx):
    item=self.data[idx]
    text = item['text']
    inputs = self.tokenizer(text,max_length=max_len,truncation=True,return_offsets_mapping=True)
    tokens_len=len(inputs['input_ids'])
    offset_mapping_list=inputs['offset_mapping']
    entity_labels=GenerateLabel.generate_enity_labels(item['entity_list'],tokens_len,offset_mapping_list)
    role_labels, status_labels, method_labels=GenerateLabel.generate_role_labels(item['role_list'],tokens_len,offset_mapping_list)
    relation_labels=GenerateLabel.generate_relation_labels(offset_mapping_list,item)
    return {
      'input_ids':inputs['input_ids'],
      'attention_mask':inputs['attention_mask'],
      'entity_labels':entity_labels,
      'role_labels':role_labels,
      'status_labels':status_labels,
      'method_labels':method_labels,
      'relation_labels':relation_labels,
      'text':text,
      'file_name':item['file_name'],
      'offset_mapping':offset_mapping_list,
      'tokens':inputs.tokens()
    }

### NER classifier

In [None]:
class EntityBertModel(nn.Module):
  def __init__(self, model_name, num_freeze_layers,num_status_labels,num_method_labels,num_role_labels,num_entity_labels, dropout=0.1):
    super(EntityBertModel, self).__init__()
    self.bertmodel = BertModel.from_pretrained(model_name)
    for layer in self.bertmodel.encoder.layer[:num_freeze_layers]:
      for param in layer.parameters():
          param.requires_grad = False
    self.dropout = nn.Dropout(dropout)
    self.status_classifier = nn.Linear(self.bertmodel.config.hidden_size, num_status_labels)
    self.method_classifier = nn.Linear(self.bertmodel.config.hidden_size, num_method_labels)
    self.role_classifier = nn.Linear(self.bertmodel.config.hidden_size, num_role_labels)
    self.entity_classifier = nn.Linear(self.bertmodel.config.hidden_size, num_entity_labels)
  def forward(self, input_ids, attention_mask):
    bert_output = self.bertmodel(input_ids=input_ids, attention_mask=attention_mask)
    sequence_output = self.dropout(bert_output[0])
    status_logits = self.status_classifier(sequence_output)
    method_logits = self.method_classifier(sequence_output)
    role_logits = self.role_classifier(sequence_output)
    entity_logits = self.entity_classifier(sequence_output)

    return status_logits, method_logits, role_logits, entity_logits

## Data Loading

In [9]:
import json
prediction_data={}
dt_set='tst'
for label_type in ['entity','role_labels','status_labels','method_labels']:
  predpath='/content/drive/MyDrive/PHD_assessment_gmu/data/'+label_type+'_predictions_'+dt_set+'.json'
  with open(predpath, 'r', encoding='utf-8') as file:
    prediction_data[label_type]=json.load( file)


In [11]:
prediction_data['entity']

{'124_Consult-HistoryandPhy.-Consult_ERReport-OB_GYN_8': [['[CLS]', 0, 0],
  ['social', 0, 0],
  ['history', 0, 0],
  [':', 0, 0],
  ['the', 0, 0],
  ['patient', 0, 0],
  ['denies', 0, 0],
  ['tobacco', 7, 7],
  [',', 0, 0],
  ['ethanol', 1, 1],
  [',', 0, 0],
  ['or', 0, 0],
  ['drug', 3, 3],
  ['use', 0, 0],
  ['.', 0, 0],
  ['she', 0, 0],
  ['is', 0, 0],
  ['currently', 0, 0],
  ['separated', 0, 0],
  ['from', 0, 0],
  ['her', 0, 0],
  ['partner', 0, 0],
  ['who', 0, 0],
  ['is', 0, 0],
  ['the', 0, 0],
  ['father', 0, 5],
  ['of', 0, 0],
  ['her', 0, 0],
  ['21', 0, 0],
  ['-', 0, 0],
  ['month', 0, 0],
  ['-', 0, 0],
  ['old', 0, 5],
  ['daughter', 5, 6],
  ['.', 0, 0],
  ['she', 0, 0],
  ['currently', 0, 0],
  ['lives', 0, 0],
  ['with', 0, 0],
  ['her', 0, 0],
  ['parents', 0, 0],
  ['in', 0, 0],
  ['green', 0, 0],
  ['##ville', 0, 0],
  ['.', 0, 0],
  ['however', 0, 0],
  [',', 0, 0],
  ['she', 0, 0],
  ['was', 0, 0],
  ['visiting', 0, 0],
  ['the', 0, 0],
  ['est', 0, 0],
  ['

In [None]:
import json
prediction_file_path='/content/drive/MyDrive/PHD_assessment_gmu/data/tst_Flt_ent_role_model_2.json'

In [None]:
with open(prediction_file_path, 'r', encoding='utf-8') as file:
  prediction_data=json.load( file)

In [None]:
entity_dict = prediction_data['entity']
role_dict=prediction_data['role']
method_dict=prediction_data['method']
status_dict=prediction_data['status']

In [None]:
entity_dict['430_Consult-HistoryandPhy.-PsychConsult-Depression-1_9']

[['[CLS]', 0, 0],
 ['substance', 0, 0],
 ['history', 0, 0],
 [':', 0, 0],
 ['the', 0, 0],
 ['patient', 1, 0],
 ['has', 0, 0],
 ['been', 0, 0],
 ['sober', 0, 0],
 ['for', 0, 0],
 ['five', 0, 0],
 ['years', 0, 0],
 ['.', 0, 0],
 ['she', 0, 0],
 ['drank', 1, 0],
 ['one', 0, 0],
 ['bottle', 0, 0],
 ['of', 0, 0],
 ['wine', 0, 0],
 ['per', 0, 0],
 ['day', 0, 0],
 ['as', 0, 0],
 ['per', 0, 0],
 ['hp', 0, 0],
 ['##i', 0, 0],
 ['.', 0, 0],
 ['history', 0, 0],
 ['of', 0, 0],
 ['drinking', 1, 1],
 ['for', 0, 0],
 ['approximately', 0, 0],
 ['25', 0, 0],
 ['years', 0, 0],
 ['.', 0, 0],
 ['the', 0, 0],
 ['patient', 0, 0],
 ['does', 0, 0],
 ['not', 0, 0],
 ['currently', 0, 0],
 ['have', 0, 0],
 ['a', 0, 0],
 ['sponsor', 0, 0],
 ['.', 0, 0],
 ['the', 0, 0],
 ['patient', 0, 0],
 ['experimented', 0, 0],
 ['with', 0, 0],
 ['am', 3, 3],
 ['##phe', 4, 4],
 ['##tamine', 4, 4],
 ['##s', 4, 4],
 [',', 0, 0],
 ['cocaine', 3, 3],
 [',', 0, 0],
 ['marijuana', 3, 3],
 ['approximately', 0, 0],
 ['16', 0, 0],
 ['ye

In [None]:
train_dataset=torch.load(train_dataset_path)
test_dataset=torch.load(test_dataset_path)

In [None]:
def extract_spans_from_labels(labels):
    spans = []
    current_span = []
    current_label = None
    last_valid_index = -1  # Tracks the last index of a non-'O' label

    for i, label in enumerate(labels):
        # Ignore BIO scheme and consider only the entity type
        simplified_label = label[2:] if label.startswith(('B-', 'I-')) else label

        # Start a new span or continue the current one
        if simplified_label != 'O':
            # If starting a new span or within allowable break from last non-'O'
            if current_label is None or simplified_label != current_label or i - last_valid_index > 3:
                # Save the current span before starting a new one and reset the current span
                if current_span:
                    spans.append((current_label, current_span))
                    current_span = []
                current_label = simplified_label
            current_span.append(i)
            last_valid_index = i
        elif current_span and i - last_valid_index > 3:
            # End the current span if the break is too long
            spans.append((current_label, current_span))
            current_span = []
            current_label = None

    # Add the last span if exists
    if current_span:
        spans.append((current_label, current_span))

    return spans



In [None]:
def correct_span_positions(span_pos):
  updated_span_pos=[]
  for ele in span_pos:
    if len(ele[1])==1:
      updated_span_pos.append((ele[0],(ele[1][0],ele[1][0])))
    elif len(ele[1])>=2:
      strt_pos=ele[1][0]
      end_pos=ele[1][-1]
      updated_span_pos.append((ele[0],(ele[1][0],ele[1][-1])))
  return updated_span_pos
def convert_id_label(prediction_data,id_label):
  prediction_data=[id_label[ele[2]] for ele in prediction_data]
  return prediction_data
def generate_prediction_spans(prediction_data_ent,prediction_data_role,prediction_data_status,prediction_data_method,id_label_ent,id_label_method,id_label_role,id_label_status):
  prediction_data_ent=convert_id_label(prediction_data_ent,id_label_ent)
  prediction_data_role=convert_id_label(prediction_data_role,id_label_role)
  prediction_data_method=convert_id_label(prediction_data_method,id_label_method)
  prediction_data_status=convert_id_label(prediction_data_status,id_label_status)
  pred_spans_ent=extract_spans_from_labels(prediction_data_ent)
  pred_spans_roles=extract_spans_from_labels(prediction_data_role)
  pred_spans_status=extract_spans_from_labels(prediction_data_status)
  pred_spans_method=extract_spans_from_labels(prediction_data_method)
  pred_spans_ent=correct_span_positions(pred_spans_ent)
  pred_spans_roles=correct_span_positions(pred_spans_roles)
  pred_spans_status=correct_span_positions(pred_spans_status)
  pred_spans_method=correct_span_positions(pred_spans_method)
  relation_pairs=list(itertools.product(pred_spans_ent, pred_spans_roles))
  relation_pairs.extend(list(itertools.product(pred_spans_ent, pred_spans_status)))
  relation_pairs.extend(list(itertools.product(pred_spans_ent, pred_spans_method)))
  relation_pair_labels_dict = {key: 'No Relation' for key in relation_pairs}
  return pred_spans_ent,pred_spans_roles,pred_spans_status,pred_spans_method,relation_pair_labels_dict
def insert_span_markers(text, spans):
    # Create a list to store the markers that need to be inserted at each index
    insertions = {i: [] for i in range(len(text) + 1)}

    # Populate the insertions dictionary with the correct markers for each span
    sp_cnt=0
    for span in spans:
        sp_cnt=sp_cnt+1
        start, end, category = span
        start=int(start)
        end=int(end)
        insertions[start].append('[SPAN'+str(sp_cnt)+'_START]')
        insertions[end].append('[SPAN'+str(sp_cnt)+'_END]')
        '''
        if category == 'entity':
            insertions[start].append('[SPAN1_START]')
            insertions[end].append('[SPAN1_END]')
        elif category == 'role':
            insertions[start].append('[SPAN2_START]')
            insertions[end].append('[SPAN2_END]')
        '''
    # Construct the new text with markers
    new_text_pieces = []
    for i, char in enumerate(text):
        # Add markers before the current character
        if insertions[i]:
            new_text_pieces.append(' '+' '.join(insertions[i]) + ' ')
        new_text_pieces.append(char)
    # Add any markers that should be inserted after the last character
    if insertions[len(text)]:
        new_text_pieces.append(' ' + ' '.join(insertions[len(text)]))

    # Join all pieces of the new text
    return ''.join(new_text_pieces)

In [None]:
def generate_relation_prediction_data(sentence,file_name,tokenizer_outputs,relation_pair_labels_dict):
  data_input_list=[]
  offset_mapping_list=tokenizer_outputs['offset_mapping']
  for pair,pair_label in relation_pair_labels_dict.items():
    entity_pos=[]
    entity_pos.append(offset_mapping_list[pair[0][1][0]][0])
    entity_pos.append(offset_mapping_list[pair[0][1][1]][1])
    entity_pos.append('entity')
    role_pos=[]
    role_pos.append(offset_mapping_list[pair[1][1][0]][0])
    role_pos.append(offset_mapping_list[pair[1][1][1]][1])
    role_pos.append('role')
    marked_sentence = insert_span_markers(sentence,[entity_pos,role_pos])
    data_input_list.append({'file_name':file_name,'text':marked_sentence,'Entity':pair[0][0],'Role':pair[1][0],'label':pair_label})
  return data_input_list

In [None]:
def preparing_relation_labels(relation_labels,pred_spans_ent,pred_spans_roles,pred_spans_status,pred_spans_method,relation_pair_labels_dict):
  identified_relations_cnt=0
  print(len(relation_pair_labels_dict))
  print(pred_spans_ent)
  for span in pred_spans_ent:
    start_token_index=span[1][0]
    end_token_index=span[1][1]
    print('pes:',span)
    for keys,values in relation_labels.items():
      rel_start_token_ind=keys[0]
      rel_end_token_ind=keys[1]
      print('--------------------------------------------------')
      print('Entity')
      print('rlb:',rel_start_token_ind)
      print('rlb:',rel_end_token_ind)
      if (start_token_index>=rel_start_token_ind-2 and end_token_index<=rel_end_token_ind+2) or (rel_start_token_ind>=start_token_index-2 and rel_end_token_ind<=end_token_index):
        print('Entity Match')
        for ele in values:
          print('################')
          print('Role')
          print('rle:',ele)
          print('ppppppppp')
          if ele[2]=='Status':
            print('preictionspans_list:',pred_spans_status)
            for status_span in pred_spans_status:
              print('prs:',status_span)
              if (status_span[1][0]>=ele[0]-2 and status_span[1][1]<=ele[1]+2) or ((ele[0]>=status_span[1][0]-2 and ele[1]+2<=status_span[1][1])):
                pair_key=((span[0], (start_token_index, end_token_index)), (status_span[0], (status_span[1][0], status_span[1][1])))

                relation_pair_labels_dict[pair_key]='Relation'

                print('Match')
                identified_relations_cnt += 1
                #break
          elif ele[2]=='Method':
            for method_span in pred_spans_method:
              print('prs:',method_span)
              if (method_span[1][0]>=ele[0]-2 and method_span[1][1]<=ele[1]+2) or ((ele[0]>=method_span[1][0]-2 and ele[1]+2<=method_span[1][1])):
                pair_key=((span[0], (start_token_index, end_token_index)), (method_span[0], (method_span[1][0], method_span[1][1])))

                relation_pair_labels_dict[pair_key]='Relation'
                print('Match')
                identified_relations_cnt += 1
                #break
          else:

            print(pred_spans_roles)
            for role_span in pred_spans_roles:
              print('prs:',role_span)
              if (role_span[1][0]>=ele[0]-2 and role_span[1][1]<=ele[1]+2) or ((ele[0]>=role_span[1][0]-2 and ele[1]+2<=role_span[1][1])):
                pair_key=((span[0], (start_token_index, end_token_index)), (role_span[0], (role_span[1][0], role_span[1][1])))

                relation_pair_labels_dict[pair_key]='Relation'

                print('Match')
                identified_relations_cnt += 1
                #break
        break

  print(relation_pair_labels_dict)
  return relation_pair_labels_dict, identified_relations_cnt



def generate_relation_data(text,file_name,prediction_data_ent,prediction_data_role,prediction_data_status,prediction_data_method,id_label_ent,id_label_method,id_label_role,id_label_status,relation_labels=None):
  inputs = tokenizer(text,max_length=max_len,truncation=True,return_offsets_mapping=True)

  tokens_len=len(inputs['input_ids'])
  if tokens_len==len(prediction_data_ent):
    pred_spans_ent,pred_spans_roles,pred_spans_status,pred_spans_method,relation_pair_labels_dict=generate_prediction_spans(prediction_data_ent,prediction_data_role,prediction_data_status,prediction_data_method,id_label_ent,id_label_method,id_label_role,id_label_status)
    if relation_labels:
      present_relation_label_cnt=0
      for e,r in relation_labels.items():
        present_relation_label_cnt += len(r)
      relation_pair_labels_dict,identified_relations_cnt=preparing_relation_labels(relation_labels,pred_spans_ent,pred_spans_roles,pred_spans_status,pred_spans_method,relation_pair_labels_dict)
      relation_data=generate_relation_prediction_data(text,file_name,inputs,relation_pair_labels_dict)
      return relation_data,present_relation_label_cnt,identified_relations_cnt,present_relation_label_cnt-identified_relations_cnt
    else:
      relation_data=generate_relation_prediction_data(text,file_name,inputs,relation_pair_labels_dict)
      return relation_data
  else:
    return None,None



In [None]:
flnm='259_Consult-HistoryandPhy.-Head&NeckCancerConsult_9'
rlb=''
t=''
for tst_doc in test_dataset:
  if tst_doc['file_name']==flnm:
    rlb=tst_doc['relation_labels']
    t=tst_doc['text']

In [None]:
t

'SOCIAL HISTORY:  The patient is married but has been separated from his wife for many years, they remain close, and they have two adult sons. He is retired from the Air Force, currently works for Lockheed Martin. He was born and raised in New York. He does have a smoking history, about a 20 pack-year history and he reports quitting on July 27. He does drink alcohol socially. No use of illicit drugs.\n\n'

In [None]:
rlb

{(26, 28, 'Family'): [(27, 28, 'Type'), (26, 26, 'Amount')],
 (84, 86, 'Alcohol'): [(84, 85, 'Status'), (87, 87, 'Amount')],
 (94, 94, 'Drug'): [(92, 93, 'Type'), (89, 90, 'Status')],
 (62, 62, 'Tobacco'): [(67, 70, 'Amount'),
  (74, 81, 'Status'),
  (78, 81, 'QuitHistory')]}

In [None]:

rel_dt_lst1,rel_present1,identified_rel1,missed_rel_perc1=generate_relation_data(t,flnm,entity_dict[flnm],role_dict[flnm],status_dict[flnm],method_dict[flnm],id_label_ent,id_label_method,id_label_role,id_label_status,relation_labels=rlb)

120
[('Family', (14, 14)), ('Family', (26, 28)), ('Tobacco', (62, 62)), ('Alcohol', (86, 86)), ('Drug', (94, 94))]
pes: ('Family', (14, 14))
--------------------------------------------------
Entity
rlb: 26
rlb: 28
--------------------------------------------------
Entity
rlb: 84
rlb: 86
--------------------------------------------------
Entity
rlb: 94
rlb: 94
--------------------------------------------------
Entity
rlb: 62
rlb: 62
pes: ('Family', (26, 28))
--------------------------------------------------
Entity
rlb: 26
rlb: 28
Entity Match
################
Role
rle: (27, 28, 'Type')
ppppppppp
[('Type', (7, 7)), ('Type', (14, 14)), ('Temporal', (15, 17)), ('Amount', (26, 26)), ('Type', (27, 28)), ('Location', (35, 36)), ('Location', (41, 46)), ('Location', (54, 56)), ('Amount', (67, 70)), ('Location', (80, 80)), ('Amount', (87, 87)), ('Type', (92, 93))]
prs: ('Type', (7, 7))
prs: ('Type', (14, 14))
prs: ('Temporal', (15, 17))
prs: ('Amount', (26, 26))
Match
prs: ('Type', (27, 28))
M

In [None]:
rlb

{(15, 16, 'Family'): [(15, 16, 'Type'), (15, 15, 'Amount')],
 (19, 20, 'Tobacco'): [(19, 19, 'Status'),
  (21, 23, 'QuitHistory'),
  (26, 31, 'ExposureHistory')],
 (36, 37, 'Alcohol'): [(34, 35, 'Status')]}

In [None]:
relation_df1=pd.DataFrame(rel_dt_lst1)
relation_df1['predict_relation']=relation_df1['text'].apply(predict_relation)
relation_df1[relation_df1['label']!=relation_df1['predict_relation']]

Unnamed: 0,file_name,text,Entity,Role,label,predict_relation
4,285_Consult-HistoryandPhy.-IschemicCecum-Consu...,SOCIAL HISTORY: The patient had a long [SPAN...,Tobacco,Status,No Relation,Relation


In [None]:
relation_df1[relation_df1['label']!=relation_df1['predict_relation']]['text'].iloc[0]

'SOCIAL HISTORY:  The patient had a long  [SPAN2_START] history of [SPAN2_END]   [SPAN1_START] smoking [SPAN1_END]  but quit many years ago. He does have chronic alcohol use.\n\n'

In [None]:
relation_data_list=[]
missed_rel_perc_list=[]
identified_all_rel_doc=[]
total_missed_rel=0
total_relations_present=0
for tst_doc in test_dataset:
  rel_dt_lst,rel_present,identified_rel,missed_rel_perc=generate_relation_data(tst_doc['text'],tst_doc['file_name'],entity_dict[tst_doc['file_name']],role_dict[tst_doc['file_name']],status_dict[tst_doc['file_name']],method_dict[tst_doc['file_name']],id_label_ent,id_label_method,id_label_role,id_label_status,relation_labels=tst_doc['relation_labels'])
  if rel_dt_lst:
    #if missed_rel_perc<0:
    # missed_rel_perc=0
    relation_data_list.extend(rel_dt_lst)
    total_missed_rel += missed_rel_perc
    total_relations_present += rel_present
    if missed_rel_perc>0:
      missed_rel_perc_list.append([tst_doc['file_name'],rel_present,missed_rel_perc])
    elif missed_rel_perc==0:
      identified_all_rel_doc.append([tst_doc['file_name'],rel_present,identified_rel])



27
[('Family', (15, 16)), ('Tobacco', (20, 20)), ('Alcohol', (36, 37))]
pes: ('Family', (15, 16))
--------------------------------------------------
Entity
rlb: 15
rlb: 16
Entity Match
################
Role
rle: (15, 16, 'Type')
ppppppppp
[('Type', (11, 11)), ('Amount', (15, 15)), ('Type', (16, 16)), ('QuitHistory', (21, 23)), ('ExposureHistory', (26, 28))]
prs: ('Type', (11, 11))
prs: ('Amount', (15, 15))
Match
prs: ('Type', (16, 16))
Match
prs: ('QuitHistory', (21, 23))
prs: ('ExposureHistory', (26, 28))
################
Role
rle: (15, 15, 'Amount')
ppppppppp
[('Type', (11, 11)), ('Amount', (15, 15)), ('Type', (16, 16)), ('QuitHistory', (21, 23)), ('ExposureHistory', (26, 28))]
prs: ('Type', (11, 11))
prs: ('Amount', (15, 15))
Match
prs: ('Type', (16, 16))
Match
prs: ('QuitHistory', (21, 23))
prs: ('ExposureHistory', (26, 28))
pes: ('Tobacco', (20, 20))
--------------------------------------------------
Entity
rlb: 15
rlb: 16
--------------------------------------------------
Entity


In [None]:
missed_rel_perc_list

[['116_Consult-HistoryandPhy.-Consult-Sepsis_6', 8, 1],
 ['285_Consult-HistoryandPhy.-IschemicCecum-Consult_8', 5, 3],
 ['162_Consult-HistoryandPhy.-ENTConsult_14', 5, 1],
 ['405_Consult-HistoryandPhy.-PainManagementConsult-2_7', 8, 3],
 ['419_Consult-HistoryandPhy.-PreopCardiacConsult_9', 7, 2],
 ['430_Consult-HistoryandPhy.-PsychConsult-Depression-1_9', 17, 12],
 ["270_Consult-HistoryandPhy.-Huntington'sDisease-Consult_11", 7, 6],
 ['28_Consult-HistoryandPhy.-AttemptedSuicide-Consult_10', 6, 5],
 ['407_Consult-HistoryandPhy.-PediatricRheumatologyConsult_8', 4, 3],
 ["514_Consult-HistoryandPhy.-Worker'sCompensationInjury_9", 3, 1],
 ['233_Consult-HistoryandPhy.-GenMedConsult-52_7', 1, 1],
 ['325_Consult-HistoryandPhy.-Murmur&Bacteremia._9', 4, 1],
 ['132_Consult-HistoryandPhy.-DetoxfromHeroin_7', 7, 4],
 ['399_Consult-HistoryandPhy.-OrthopedicConsult-3_9', 1, 1],
 ['8_Consult-HistoryandPhy.-AcuteInferiorMyocardialInfarction_7', 6, 3],
 ['524_Consult-HistoryandPhy.-AcuteInferiorMyocard

In [None]:
relation_df=pd.DataFrame(relation_data_list)

In [None]:
relation_df['label'].value_counts()

No Relation    1172
Relation        261
Name: label, dtype: int64

In [None]:
rel_classifier_pth='/content/drive/MyDrive/PHD_assessment_gmu/models/final_models/Indepent_relation_classifier_v6/'

In [None]:
from transformers import BertForSequenceClassification
tokenizer2=BertTokenizerFast.from_pretrained(rel_classifier_pth)
rel_model=BertForSequenceClassification.from_pretrained(rel_classifier_pth)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
rel_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
def predict_relation(text):
  rel_model.eval()
  inputs=tokenizer2(text, add_special_tokens=True, padding='max_length',max_length=max_len , truncation=True,return_tensors='pt')
  with torch.no_grad():
    inp={k: v.to(device) for k, v in inputs.items()}
    outputs = rel_model(**inp)
  logits = outputs.logits
  raw_predictions = torch.argmax(logits, dim=-1)
  prediction=id_label_event[raw_predictions.item()]
  return prediction

In [None]:
relation_df['predict_relation']=relation_df['text'].apply(predict_relation)

In [None]:
relation_df.shape

(1433, 5)

In [None]:
relation_df

Unnamed: 0,file_name,text,Entity,Role,label,predict_relation
0,112_Consult-HistoryandPhy.-Consult-RectalBleed...,SOCIAL HISTORY: The patient is retired. He is...,Family,Type,No Relation,No Relation
1,112_Consult-HistoryandPhy.-Consult-RectalBleed...,SOCIAL HISTORY: The patient is retired. He is...,Family,Amount,Relation,Relation
2,112_Consult-HistoryandPhy.-Consult-RectalBleed...,SOCIAL HISTORY: The patient is retired. He is...,Family,Type,Relation,Relation
3,112_Consult-HistoryandPhy.-Consult-RectalBleed...,SOCIAL HISTORY: The patient is retired. He is...,Family,QuitHistory,No Relation,No Relation
4,112_Consult-HistoryandPhy.-Consult-RectalBleed...,SOCIAL HISTORY: The patient is retired. He is...,Family,ExposureHistory,No Relation,No Relation
...,...,...,...,...,...,...
1428,185_Consult-HistoryandPhy.-GastrointestinalBle...,SOCIAL HISTORY: She is a nonsmoker. She occasi...,Alcohol,Status,Relation,Relation
1429,185_Consult-HistoryandPhy.-GastrointestinalBle...,SOCIAL HISTORY: She is a nonsmoker. She occasi...,Alcohol,Status,No Relation,No Relation
1430,185_Consult-HistoryandPhy.-GastrointestinalBle...,SOCIAL HISTORY: She is a [SPAN2_START] non [S...,Family,Status,No Relation,No Relation
1431,185_Consult-HistoryandPhy.-GastrointestinalBle...,SOCIAL HISTORY: She is a nonsmoker. She occasi...,Family,Status,No Relation,No Relation


In [None]:
missed_rel_perc_list

[['116_Consult-HistoryandPhy.-Consult-Sepsis_6', 8, 1],
 ['285_Consult-HistoryandPhy.-IschemicCecum-Consult_8', 5, 3],
 ['162_Consult-HistoryandPhy.-ENTConsult_14', 5, 1],
 ['405_Consult-HistoryandPhy.-PainManagementConsult-2_7', 8, 3],
 ['419_Consult-HistoryandPhy.-PreopCardiacConsult_9', 7, 2],
 ['430_Consult-HistoryandPhy.-PsychConsult-Depression-1_9', 17, 12],
 ["270_Consult-HistoryandPhy.-Huntington'sDisease-Consult_11", 7, 6],
 ['28_Consult-HistoryandPhy.-AttemptedSuicide-Consult_10', 6, 5],
 ['407_Consult-HistoryandPhy.-PediatricRheumatologyConsult_8', 4, 3],
 ["514_Consult-HistoryandPhy.-Worker'sCompensationInjury_9", 3, 1],
 ['233_Consult-HistoryandPhy.-GenMedConsult-52_7', 1, 1],
 ['325_Consult-HistoryandPhy.-Murmur&Bacteremia._9', 4, 1],
 ['132_Consult-HistoryandPhy.-DetoxfromHeroin_7', 7, 4],
 ['399_Consult-HistoryandPhy.-OrthopedicConsult-3_9', 1, 1],
 ['8_Consult-HistoryandPhy.-AcuteInferiorMyocardialInfarction_7', 6, 3],
 ['524_Consult-HistoryandPhy.-AcuteInferiorMyocard

In [None]:
relation_df[relation_df['label']!=relation_df['predict_relation']].iloc[0]

file_name           112_Consult-HistoryandPhy.-Consult-RectalBleed...
text                SOCIAL HISTORY:  The patient is retired. He is...
Entity                                                         Family
Role                                                             Type
label                                                     No Relation
predict_relation                                             Relation
Name: 2, dtype: object

In [None]:
wrng_pred_file_nms=relation_df[relation_df['label']!=relation_df['predict_relation']]['file_name'].tolist()

In [None]:
set(wrng_pred_file_nms)

{'118_Consult-HistoryandPhy.-Consult-StasisUlcer_7',
 '124_Consult-HistoryandPhy.-Consult_ERReport-OB_GYN_8',
 '132_Consult-HistoryandPhy.-DetoxfromHeroin_7',
 '184_Consult-HistoryandPhy.-GastricBypassDiscussion-3_7',
 '190_Consult-HistoryandPhy.-GenMedConsult-13_8',
 '19_Consult-HistoryandPhy.-AnkleSprain-H&P_9',
 '204_Consult-HistoryandPhy.-GenMedConsult-26_9',
 '233_Consult-HistoryandPhy.-GenMedConsult-52_7',
 '259_Consult-HistoryandPhy.-Head&NeckCancerConsult_9',
 '264_Consult-HistoryandPhy.-Hematuria-ERVisit_7',
 '266_Consult-HistoryandPhy.-HipFracture-ERConsult_8',
 '285_Consult-HistoryandPhy.-IschemicCecum-Consult_8',
 '28_Consult-HistoryandPhy.-AttemptedSuicide-Consult_10',
 '290_Consult-HistoryandPhy.-Kyphoplasty-Consult_9',
 '301_Consult-HistoryandPhy.-LowBackPain-Consult_5',
 '303_Consult-HistoryandPhy.-LowerQuadrantPain_10',
 '326_Consult-HistoryandPhy.-Neck&BackPain_12',
 '405_Consult-HistoryandPhy.-PainManagementConsult-2_7',
 '407_Consult-HistoryandPhy.-PediatricRheumato

In [None]:
for e in missed_rel_perc_list:
  if e[0] in wrng_pred_file_nms:
    print(e)

['285_Consult-HistoryandPhy.-IschemicCecum-Consult_8', 5, 3]
['405_Consult-HistoryandPhy.-PainManagementConsult-2_7', 8, 3]
['419_Consult-HistoryandPhy.-PreopCardiacConsult_9', 7, 2]
['28_Consult-HistoryandPhy.-AttemptedSuicide-Consult_10', 6, 5]
['407_Consult-HistoryandPhy.-PediatricRheumatologyConsult_8', 4, 3]
['233_Consult-HistoryandPhy.-GenMedConsult-52_7', 1, 1]
['132_Consult-HistoryandPhy.-DetoxfromHeroin_7', 7, 4]
['8_Consult-HistoryandPhy.-AcuteInferiorMyocardialInfarction_7', 6, 3]
['524_Consult-HistoryandPhy.-AcuteInferiorMyocardialInfarction_7', 6, 3]
['290_Consult-HistoryandPhy.-Kyphoplasty-Consult_9', 4, 1]


In [None]:
relation_df[relation_df['file_name']=='285_Consult-HistoryandPhy.-IschemicCecum-Consult_8']

Unnamed: 0,file_name,text,Entity,Role,label,predict_relation
190,285_Consult-HistoryandPhy.-IschemicCecum-Consu...,SOCIAL HISTORY: The patient had a long histor...,Tobacco,QuitHistory,Relation,Relation
191,285_Consult-HistoryandPhy.-IschemicCecum-Consu...,SOCIAL HISTORY: The patient had a long histor...,Tobacco,Frequency,No Relation,No Relation
192,285_Consult-HistoryandPhy.-IschemicCecum-Consu...,SOCIAL HISTORY: The patient had a long histor...,Alcohol,QuitHistory,No Relation,No Relation
193,285_Consult-HistoryandPhy.-IschemicCecum-Consu...,SOCIAL HISTORY: The patient had a long histor...,Alcohol,Frequency,Relation,Relation
194,285_Consult-HistoryandPhy.-IschemicCecum-Consu...,SOCIAL HISTORY: The patient had a long [SPAN...,Tobacco,Status,No Relation,Relation
195,285_Consult-HistoryandPhy.-IschemicCecum-Consu...,SOCIAL HISTORY: The patient had a long [SPAN...,Alcohol,Status,No Relation,No Relation


In [None]:
relation_df[relation_df['label']!=relation_df['predict_relation']]


Unnamed: 0,file_name,text,Entity,Role,label,predict_relation
56,118_Consult-HistoryandPhy.-Consult-StasisUlcer_7,SOCIAL HISTORY: The patient admits [SPAN2_ST...,Alcohol,Status,No Relation,Relation
67,190_Consult-HistoryandPhy.-GenMedConsult-13_8,SOCIAL HISTORY: She is married. A 76-year-old ...,Tobacco,Status,No Relation,Relation
71,259_Consult-HistoryandPhy.-Head&NeckCancerCons...,SOCIAL HISTORY: The patient is married but ha...,Family,Type,No Relation,Relation
103,259_Consult-HistoryandPhy.-Head&NeckCancerCons...,SOCIAL HISTORY: The patient is married but ha...,Tobacco,Location,Relation,No Relation
150,259_Consult-HistoryandPhy.-Head&NeckCancerCons...,SOCIAL HISTORY: The patient is married but ha...,Tobacco,Status,No Relation,Relation
151,259_Consult-HistoryandPhy.-Head&NeckCancerCons...,SOCIAL HISTORY: The patient is married but ha...,Tobacco,Status,Relation,No Relation
194,285_Consult-HistoryandPhy.-IschemicCecum-Consu...,SOCIAL HISTORY: The patient had a long [SPAN...,Tobacco,Status,No Relation,Relation
196,303_Consult-HistoryandPhy.-LowerQuadrantPain_10,SOCIAL HISTORY: The patient does not [SPAN1_S...,Tobacco,QuitHistory,Relation,No Relation
440,405_Consult-HistoryandPhy.-PainManagementConsu...,PERSONAL AND SOCIAL HISTORY: Marital status: M...,Alcohol,Amount,No Relation,Relation
453,405_Consult-HistoryandPhy.-PainManagementConsu...,PERSONAL AND SOCIAL HISTORY: Marital status: M...,Alcohol,Status,No Relation,Relation


In [None]:
Pos = relation_df[relation_df['label']=='Relation']
Neg = relation_df[relation_df['label']=='No Relation']

In [None]:
TP=Pos[Pos['label']==Pos['predict_relation']]


In [None]:
FN = Pos[Pos['label']!=Pos['predict_relation']]
FN

Unnamed: 0,file_name,text,Entity,Role,label,predict_relation
103,259_Consult-HistoryandPhy.-Head&NeckCancerCons...,SOCIAL HISTORY: The patient is married but ha...,Tobacco,Location,Relation,No Relation
151,259_Consult-HistoryandPhy.-Head&NeckCancerCons...,SOCIAL HISTORY: The patient is married but ha...,Tobacco,Status,Relation,No Relation
196,303_Consult-HistoryandPhy.-LowerQuadrantPain_10,SOCIAL HISTORY: The patient does not [SPAN1_S...,Tobacco,QuitHistory,Relation,No Relation
776,184_Consult-HistoryandPhy.-GastricBypassDiscus...,SOCIAL HISTORY: The patient is a life long no...,Family,Type,Relation,No Relation
791,407_Consult-HistoryandPhy.-PediatricRheumatolo...,SOCIAL HISTORY: He lives with [SPAN1_START] ...,Family,Type,Relation,No Relation
967,266_Consult-HistoryandPhy.-HipFracture-ERConsu...,SOCIAL HISTORY: Denies any tobacco or alcohol...,Family,Type,Relation,No Relation
1056,326_Consult-HistoryandPhy.-Neck&BackPain_12,SOCIAL HISTORY: The patient is retired. She is...,Family,Type,Relation,No Relation
1106,8_Consult-HistoryandPhy.-AcuteInferiorMyocardi...,SOCIAL HISTORY: [SPAN2_START] Denies [SPAN2_...,Tobacco,Status,Relation,No Relation
1177,524_Consult-HistoryandPhy.-AcuteInferiorMyocar...,SOCIAL HISTORY: [SPAN2_START] Denies [SPAN2_...,Tobacco,Status,Relation,No Relation
1340,204_Consult-HistoryandPhy.-GenMedConsult-26_9,"SOCIAL HISTORY: No tobacco, alcohol or illicit...",Family,Type,Relation,No Relation


In [None]:
FP=Neg[Neg['label']!=Neg['predict_relation']]
FP

Unnamed: 0,file_name,text,Entity,Role,label,predict_relation
56,118_Consult-HistoryandPhy.-Consult-StasisUlcer_7,SOCIAL HISTORY: The patient admits [SPAN2_ST...,Alcohol,Status,No Relation,Relation
67,190_Consult-HistoryandPhy.-GenMedConsult-13_8,SOCIAL HISTORY: She is married. A 76-year-old ...,Tobacco,Status,No Relation,Relation
71,259_Consult-HistoryandPhy.-Head&NeckCancerCons...,SOCIAL HISTORY: The patient is married but ha...,Family,Type,No Relation,Relation
150,259_Consult-HistoryandPhy.-Head&NeckCancerCons...,SOCIAL HISTORY: The patient is married but ha...,Tobacco,Status,No Relation,Relation
194,285_Consult-HistoryandPhy.-IschemicCecum-Consu...,SOCIAL HISTORY: The patient had a long [SPAN...,Tobacco,Status,No Relation,Relation
440,405_Consult-HistoryandPhy.-PainManagementConsu...,PERSONAL AND SOCIAL HISTORY: Marital status: M...,Alcohol,Amount,No Relation,Relation
453,405_Consult-HistoryandPhy.-PainManagementConsu...,PERSONAL AND SOCIAL HISTORY: Marital status: M...,Alcohol,Status,No Relation,Relation
484,419_Consult-HistoryandPhy.-PreopCardiacConsult_9,SOCIAL HISTORY: She used to smoke cigarettes a...,Family,Type,No Relation,Relation
485,419_Consult-HistoryandPhy.-PreopCardiacConsult_9,SOCIAL HISTORY: She [SPAN2_START] used to [SP...,Tobacco,Status,No Relation,Relation
486,419_Consult-HistoryandPhy.-PreopCardiacConsult_9,SOCIAL HISTORY: She used to [SPAN1_START] smo...,Tobacco,Status,No Relation,Relation


In [None]:
TN=Neg[Neg['label']==Neg['predict_relation1']]
TN

In [None]:
print(classification_report(relation_df['label'].tolist(), relation_df['predict_relation'].tolist()))

              precision    recall  f1-score   support

 No Relation       0.99      0.96      0.98      1172
    Relation       0.85      0.96      0.90       261

    accuracy                           0.96      1433
   macro avg       0.92      0.96      0.94      1433
weighted avg       0.97      0.96      0.96      1433



In [None]:
print(classification_report(relation_df['label'].tolist(), relation_df['predict_relation'].tolist()))

              precision    recall  f1-score   support

 No Relation       0.99      0.96      0.98      1172
    Relation       0.85      0.96      0.90       261

    accuracy                           0.96      1433
   macro avg       0.92      0.96      0.94      1433
weighted avg       0.97      0.96      0.96      1433

