# Load the 'NER Model' from a directory and evaluate it.
# This model takes its input as a string.

In [1]:
import copy
from transformers import BertForTokenClassification, BertTokenizer
from transformers import AutoConfig, AutoTokenizer, BertForSequenceClassification
import stanza
import torch
from scipy.special import softmax
import numpy as np

In [2]:
# Label vocabulary: tag string -> integer class id.
# The number of entries is what load_ner_model passes as num_labels.
tag2idx = {
    'B-problem': 0,
    'B-test': 1,
    'B-treatment': 2,
    'I-problem': 3,
    'I-test': 4,
    'I-treatment': 5,
    'O': 6,
    'X': 7,
    '[CLS]': 8,
    '[SEP]': 9,
}
# Inverse lookup: integer class id -> tag string
# (despite its name, this dict is keyed by id, not by tag).
tag2name = {idx: tag for tag, idx in tag2idx.items()}

In [3]:
def load_ner_model(model_dir='../trained_models/NER/C-Bert-test'):
    """Load the fine-tuned NER model and its tokenizer from a directory.

    Args:
        model_dir: Path handed to ``from_pretrained`` for both the model and
            the tokenizer. Defaults to the checkpoint location this notebook
            has always used, so calling ``load_ner_model()`` is unchanged.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is a
        ``BertForTokenClassification`` configured with one label per entry of
        ``tag2idx`` and ``tokenizer`` is a ``BertTokenizer`` created with
        ``do_lower_case=False``.
    """
    # The classification head size must match the label vocabulary.
    num_labels = len(tag2idx)
    model = BertForTokenClassification.from_pretrained(
        model_dir, num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained(
        model_dir, do_lower_case=False)
    return model, tokenizer

In [4]:
# Load the model and tokenizer once at module level; predict_entities reads
# these two globals instead of reloading them on every call.
model_ner, tokenizer_ner = load_ner_model()

In [5]:
# Build an English Stanza pipeline with only the "tokenize" processor;
# predict_entities uses it to split the input text into sentences and tokens.
try:
    nlp = stanza.Pipeline(lang="en", processors="tokenize")
except Exception:
    # Best-effort fallback: presumably the English resources are not available
    # locally yet (TODO confirm), so download them and retry once.
    stanza.download("en")
    nlp = stanza.Pipeline(lang="en", processors="tokenize")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-08-06 00:33:55 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2022-08-06 00:33:55 INFO: Use device: cpu
2022-08-06 00:33:55 INFO: Loading: tokenize
2022-08-06 00:33:55 INFO: Done loading processors!


In [6]:
# Fixed length (in token ids) that create_query pads each input sequence to.
MAX_LEN = 128

In [7]:
def create_query(sentence, tokenizer, max_len=None):
    """Convert a pre-tokenized sentence into padded model inputs.

    Args:
        sentence: Iterable of word strings; each word is passed to
            ``tokenizer.tokenize`` individually.
        tokenizer: Object providing ``tokenize(str) -> list[str]`` and
            ``convert_tokens_to_ids(list[str]) -> list[int]``.
        max_len: Total sequence length including ``[CLS]`` and ``[SEP]``.
            ``None`` (the default) means use the module-level ``MAX_LEN``.

    Returns:
        tuple: ``(tokens, input_ids, attention_mask)`` where ``tokens`` is the
        unpadded token list (starting with ``[CLS]``, ending with ``[SEP]``)
        and the other two are tensors of shape ``(1, max_len)``.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, i.e. there is no room
            for both special tokens.
    """
    if max_len is None:
        max_len = MAX_LEN
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")

    temp_token = ['[CLS]']
    for word in sentence:
        temp_token.extend(tokenizer.tokenize(word))
    # Truncate relative to the same length we pad to (previously a literal
    # 128), keeping one slot free for the closing [SEP].
    temp_token = temp_token[:max_len - 1]
    temp_token.append('[SEP]')

    input_id = tokenizer.convert_tokens_to_ids(temp_token)
    # Pad with id 0 up to max_len.
    input_id = input_id + [0] * (max_len - len(input_id))
    # Any id greater than 0 counts as a real token; the 0-padding is masked out.
    attention_mask = [int(token_id > 0) for token_id in input_id]

    return temp_token, torch.tensor([input_id]), torch.tensor([attention_mask])

In [8]:
def model_inference(model, input_ids, attention_mask=None):
    """Run the model on one encoded sentence and return a label id per position.

    Args:
        model: Callable model exposing ``eval()``; it is invoked as
            ``model(input_ids, token_type_ids=None, attention_mask=...)`` and
            the first element of its output is taken as the logits.
        input_ids: Tensor of token ids; only the first row of the resulting
            logits is decoded.
        attention_mask: Optional mask forwarded to the model. Defaults to
            ``None``, which reproduces the previous behaviour (no mask is
            passed, so padded positions are not masked out).

    Returns:
        numpy.ndarray: Index of the highest-scoring label for every position
        of the first sequence in the batch (padding positions included).
    """
    #model.to(device)
    #input_ids = input_ids.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None,
                        attention_mask=attention_mask)
        # For eval mode, the first result of outputs is logits
        logits = outputs[0]
    # Get NER predict result
    predict_results = logits.detach().cpu().numpy()
    # Normalise over the label axis so each position gets its own
    # distribution; without an explicit axis scipy normalises over the whole
    # flattened array. The argmax below is the same either way.
    result_arrays_soft = softmax(predict_results[0], axis=-1)

    return np.argmax(result_arrays_soft, axis=-1)

In [9]:
def predict_entities(long_text):
    """Tag every sentence of a text with NER labels.

    The text is split into sentences and tokens by the module-level Stanza
    pipeline ``nlp``; each sentence is encoded with ``create_query`` and
    labelled with ``model_inference`` using the module-level ``model_ner`` and
    ``tokenizer_ner``.

    Args:
        long_text: Input text as a single string; it may contain several
            sentences.

    Returns:
        tuple: ``(all_sentences, all_tags)``, two parallel lists with one
        entry per sentence. Each entry of ``all_sentences`` is the list of
        words with "##" word-pieces merged back into their word (the
        ``[CLS]`` and ``[SEP]`` markers are kept); the matching entry of
        ``all_tags`` holds one tag per word, taken from the word's first
        word-piece.
    """
    all_sentences = []
    all_tags = []
    doc = nlp(long_text)
    for sentence in doc.sentences:
        word_list = [token.text for token in sentence.tokens]
        # temp_token: word-piece tokens; input_ids: their padded ids.
        # The attention mask returned by create_query is not used here.
        temp_token, input_ids, _ = create_query(word_list, tokenizer_ner)
        result_list = model_inference(model_ner, input_ids)
        result = [tag2name[t] for t in result_list]

        words = []
        tags = []
        # zip stops at the unpadded token list, so predictions for padding
        # positions are ignored.
        for tok, tag in zip(temp_token, result):
            if tok.startswith("##") and words:
                # Continuation piece: glue it onto the current word and keep
                # the tag of the word's first piece.
                words[-1] += tok[2:]
            else:
                words.append(tok)
                tags.append(tag)
        all_sentences.append(words)
        all_tags.append(tags)
    return all_sentences, all_tags

# Provide the input text as a plain string.

In [10]:
# Short single-sentence example. NOTE: the next cell reassigns input_text, so
# when all cells are run in order only that later text reaches the model.
input_text = "He had no cardiac murmur ."

In [11]:
# Longer example; this is the text whose tokens and tags are printed below.
input_text = "Significant for chronic atrial fibrillation , managed with an Italian variation of Digoxin , and Amioadrone , hypertension managed with an ACE inhibitor , chronic obstructive pulmonary disease managed with a zanthine preparation and low dose steroids , as well as an occasional inhaler"

In [12]:
# Run the pipeline on input_text; both results hold one list per sentence
# (words in all_sentences, the corresponding tags in all_tags).
all_sentences, all_tags = predict_entities (input_text)

print (all_sentences)
print (all_tags)


[['[CLS]', 'Significant', 'for', 'chronic', 'atrial', 'fibrillation', ',', 'managed', 'with', 'an', 'Italian', 'variation', 'of', 'Digoxin', ',', 'and', 'Amioadrone', ',', 'hypertension', 'managed', 'with', 'an', 'ACE', 'inhibitor', ',', 'chronic', 'obstructive', 'pulmonary', 'disease', 'managed', 'with', 'a', 'zanthine', 'preparation', 'and', 'low', 'dose', 'steroids', ',', 'as', 'well', 'as', 'an', 'occasional', 'inhaler', '[SEP]']]
[['O', 'O', 'O', 'B-problem', 'I-problem', 'I-problem', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-treatment', 'O', 'O', 'B-treatment', 'O', 'B-problem', 'O', 'O', 'B-treatment', 'I-treatment', 'I-treatment', 'O', 'B-problem', 'I-problem', 'I-problem', 'I-problem', 'O', 'O', 'B-treatment', 'I-treatment', 'O', 'O', 'B-treatment', 'I-treatment', 'I-treatment', 'O', 'O', 'O', 'O', 'B-treatment', 'I-problem', 'I-treatment', '[SEP]']]
