In [7]:
!pip install seqeval
!pip install transformers



In [8]:
import os
import pandas as pd
import math
import numpy as np
from tqdm import tqdm, trange
from seqeval.metrics import classification_report, accuracy_score, f1_score
import torch
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoModel, AutoConfig, AutoTokenizer
from transformers import AdamW
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification

In [9]:
# no of classifier: present, not-present
num_labels = 3
MODEL_CLASSES = {
  'bert': (AutoConfig, BertForSequenceClassification, AutoTokenizer),
}
MODEL_ADDRESS = 'emilyalsentzer/Bio_ClinicalBERT'
config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']
model_config = config_class.from_pretrained(MODEL_ADDRESS, num_labels=num_labels)
tokenizer = tokenizer_class.from_pretrained(MODEL_ADDRESS, do_lower_case=False)
model = model_class.from_pretrained(MODEL_ADDRESS, config=model_config)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

In [10]:
def modify_label(label):
    if label == 'not-present':
        return 1
    elif label == 'present':
        return 0

In [11]:
import time
import datetime

def format_time(elapsed):
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [13]:
output_dir = './trained_models/Assertion/3_lable_model'
model = model_class.from_pretrained(output_dir)
tokenizer = tokenizer_class.from_pretrained(output_dir)

# Copy the model to the GPU.
# model.to(device)

**Predict with model**

In [14]:
sentence1 = 'Patient has [entity] fever [entity].'
sentence2 = 'Patient denies [entity] fever [entity].'
sentences = [sentence1, sentence2]
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 128,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    input_ids.append(encoded_dict['input_ids'])

    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# labels = torch.tensor(labels)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [15]:
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)

  input_ids = torch.tensor(input_ids)
  attention_masks = torch.tensor(attention_masks)


In [16]:
predictions , true_labels = [], []

In [20]:
model.eval()

with torch.no_grad():
    result = model(input_ids, token_type_ids=None,
                   attention_mask=attention_masks, return_dict=True)

logits = result.logits
logits = logits.detach().cpu().numpy()
predictions.append(logits)

# print('sentences: ', sentences)
pred_labels_i = np.argmax(logits, axis=1).flatten()
# print('Label prediction: ', pred_labels_i)

for index, sentence in enumerate(sentences):
    print(sentence)
    print(pred_labels_i[index])
    if pred_labels_i[index] == 0:
        print('Present')
    elif pred_labels_i[index] == 1:
        print('Possible')
    elif pred_labels_i[index] == 2:
        print('Not-present')


Patient has [entity] fever [entity].
0
Present
Patient denies [entity] fever [entity].
2
Present
