# Load modules and model

In [1]:
# Optional one-time setup: uncomment the lines below to install the
# third-party packages that the import cell relies on.
# !pip3 install --upgrade pip
# !pip3 install pytorch-transformers
# !pip3 install spacy

In [2]:
import os
import torch
import numpy as np
import pandas as pd
from spacy import displacy
from keras.preprocessing.sequence import pad_sequences
from pytorch_transformers import BertTokenizer, BertConfig, BertForTokenClassification

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Directory handed to `from_pretrained` for both the model and the tokenizer.
MODEL_DIR = "./ner_models/model_1/"
# Fixed length that every token-id sequence is padded or truncated to in `predict`.
MAX_LEN = 64

In [4]:
# Load the token-classification model and its tokenizer from MODEL_DIR.
model = BertForTokenClassification.from_pretrained(MODEL_DIR)
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR, do_lower_case=True)
# Label strings, looked up by predicted class index in `predict`, so the order
# of this list is significant.
# NOTE(review): presumably this must be the same ordering that was used when the
# model in MODEL_DIR was trained -- confirm before editing or sorting the list.
tags_vals = ['B-brand', 'B-reference', 'I-property_type', 'I-operation', 'B-precursor', 'I-unspecified_material', 
             'I-property_misc', 'B-synthesis_apparatus', 'B-target', 'B-characterization_apparatus', 'I-synthesis_apparatus', 
             'B-number', 'B-meta', 'I-property_unit', 'I-number', 'I-material', 'B-unspecified_material', 'B-condition_unit', 'I-solvent', 
             'B-amount_misc', 'I-amount_misc', 'B-solvent', 'B-nonrecipe_material', 'B-condition_misc', 'I-apparatus_unit', 
             'I-apparatus_descriptor', 'B-operation', 'B-material', 'I-condition_misc', 'I-nonrecipe_material', 'O', 'B-apparatus_unit', 
             'I-meta', 'I-apparatus_property_type', 'I-amount_unit', 'I-brand', 'B-property_misc', 'B-amount_unit', 'B-apparatus_property_type', 
             'I-material_descriptor', 'I-condition_unit', 'B-property_unit', 'B-condition_type', 'B-apparatus_descriptor', 
             'B-material_descriptor', 'I-reference', 'B-property_type', 'I-gas', 'I-precursor', 'I-condition_type', 'I-characterization_apparatus', 
             'B-gas', 'I-target', 'X']

# Make predictions

In [5]:
def predict(untokenized_text, model, tokenizer, tags_vals, max_len=None):
    """Tag each tokenizer token of every input sentence with its most likely label.

    Args:
        untokenized_text: Iterable of raw sentence strings. A single string is
            treated as one sentence rather than being iterated per character.
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose first returned element is the logits tensor. It is moved to
            the selected device and switched to eval mode as a side effect.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        tags_vals: Label strings indexed by the predicted class id.
        max_len: Number of tokens kept per sentence; longer sentences are
            truncated at the end. Defaults to the notebook-level ``MAX_LEN``.

    Returns:
        A dict with two parallel lists (one entry per sentence):
        ``"tags"`` holds the predicted label of each kept token and
        ``"original_sent"`` holds the kept tokens themselves. Both inner
        lists always have the same length.
    """
    if isinstance(untokenized_text, str):
        untokenized_text = [untokenized_text]

    tokenized_text = [tokenizer.tokenize(sentence) for sentence in untokenized_text]
    if not tokenized_text:
        # Nothing to tag; avoid feeding an empty batch to the model.
        return {"tags": [], "original_sent": []}

    if max_len is None:
        max_len = MAX_LEN

    input_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_text],
        maxlen=max_len, dtype="long", truncating="post", padding="post")
    # Padding positions carry id 0, so any positive id is a real token.
    attention_masks = [[float(token_id > 0) for token_id in row] for row in input_ids]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    token_tensor = torch.tensor(input_ids).to(device)
    attention_tensor = torch.tensor(attention_masks).to(device)
    model.to(device)
    model.eval()

    with torch.no_grad():
        logits = model(token_tensor, attention_mask=attention_tensor)[0]
    label_ids = np.argmax(logits.detach().cpu().numpy(), axis=2)

    # Truncate the tokens exactly like the ids were truncated, so that the
    # returned tokens and tags stay aligned for sentences longer than max_len.
    kept_tokens = [tokens[:max_len] for tokens in tokenized_text]
    prediction_tags = [[tags_vals[label_id] for label_id in row[:len(tokens)]]
                       for row, tokens in zip(label_ids, kept_tokens)]

    return {"tags": prediction_tags,
            "original_sent": kept_tokens}

In [6]:
# Two example sentences used to exercise the tagger below.
input_sentences = ["""The addition of a small amount of CuO can effectively lower the sintering 
                   temperature of LLZWO to 1120 °C and reduce sintering time to 6 h.""", 
                   """However, LLZO samples prepared via conventional ambient air sintering reported in the published 
                   literature often contain large grains with lower than desired (<94%) relative density."""]

In [7]:
# Run the tagger on the example sentences; the result holds per-sentence tags and tokens.
pred = predict(input_sentences, model, tokenizer, tags_vals)

In [8]:
# Show the predicted tags and the tokens of the first sentence, one list per line.
print(pred["tags"][0])
print(pred["original_sent"][0])

['O', 'B-operation', 'O', 'O', 'O', 'I-amount_misc', 'O', 'B-target', 'X', 'O', 'O', 'O', 'O', 'B-operation', 'X', 'I-condition_misc', 'O', 'X', 'X', 'X', 'O', 'O', 'X', 'X', 'X', 'O', 'O', 'B-operation', 'X', 'O', 'O', 'B-number', 'B-condition_unit', 'O']
['the', 'addition', 'of', 'a', 'small', 'amount', 'of', 'cu', '##o', 'can', 'effectively', 'lower', 'the', 'sin', '##tering', 'temperature', 'of', 'll', '##zw', '##o', 'to', '112', '##0', '°', '##c', 'and', 'reduce', 'sin', '##tering', 'time', 'to', '6', 'h', '.']


In [9]:
def give_table(pred):
    """Flatten per-sentence predictions into one word/label table.

    Args:
        pred: Dict with ``"tags"`` and ``"original_sent"``, each a list holding
            one list per sentence (labels and tokens respectively).

    Returns:
        A dict with ``"df"`` (a DataFrame with columns ``Word`` and ``Label``),
        ``"words"`` (flat list of tokens) and ``"tags"`` (flat list of labels).
        The two flat lists are always the same length and aligned
        position-by-position.
    """
    words = []
    labels = []
    # Pair tokens with labels sentence by sentence. If one sentence has more
    # tokens than labels (or vice versa) the surplus is dropped for that
    # sentence only, instead of shifting every following word/label pair.
    for sent_words, sent_tags in zip(pred["original_sent"], pred["tags"]):
        for word, label in zip(sent_words, sent_tags):
            words.append(word)
            labels.append(label)

    df = pd.DataFrame({"Word": words, "Label": labels}, columns=["Word", "Label"])
    return {"df": df,
            "words": words,
            "tags": labels}

In [10]:
# Flatten the predictions of all sentences into one table and print it.
results = give_table(pred)
print(results["df"])

            Word                   Label
0            the                       O
1       addition             B-operation
2             of                       O
3              a                       O
4          small                       O
5         amount           I-amount_misc
6             of                       O
7             cu                B-target
8            ##o                       X
9            can                       O
10   effectively                       O
11         lower                       O
12           the                       O
13           sin             B-operation
14      ##tering                       X
15   temperature        I-condition_misc
16            of                       O
17            ll                       X
18          ##zw                       X
19           ##o                       X
20            to                       O
21           112                       O
22           ##0                       X
23             °

# Visualization of tagging

In [11]:
# CSS background values keyed by upper-cased entity label (the tag name without
# its "B-"/"I-" prefix); passed to displaCy as the "colors" option.
# NOTE(review): "GAS" and "REFERENCE" use almost identical hex colours -- confirm
# that this is intended.
COLORS = {"AMOUNT_MISC": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
          "NUMBER": "linear-gradient(90deg, orange, cyan)",
          "AMOUNT_UNIT": "linear-gradient(90deg, red, orange)",
          "PROPERTY_MISC": "linear-gradient(90deg, purple 40%, yellow)",
          "MATERIAL": "#aa9cfc",
          "NONRECIPE_MATERIAL": "red",
          "TARGET": "#a4893d",
          "META": "yellow",
          "UNSPECIFIED_MATERIAL": "#0074D9",
          "APPARATUS_UNIT": "linear-gradient(90deg, #e66465, #9198e5)",
          "MATERIAL_DESCRIPTOR": "#9198e5",
          "SOLVENT": "#e66465",
          "PROPERTY_TYPE": "brown",
          "PRECURSOR": "pink",
          "CONDITION_MISC": "#fc9ce7",
          "APPARATUS_PROPERTY_TYPE": "orange",
          "PROPERTY_UNIT": "linear-gradient(217deg, rgba(255,0,0,.8), rgba(255,0,0,0) 70.71%)",
          # NOTE(review): the red channel 800 is outside the 0-255 range; it was
          # probably meant to be 255 -- confirm before changing the value.
          "CONDITION_UNIT": "linear-gradient(217deg, rgba(800,0,0,.8), yellow 70.71%)",
          "APPARATUS_DESCRIPTOR": "#fea49f",
          "SYNTHESIS_APPARATUS": "#bf4aa8",
          "OPERATION": "#ce9aff",
          "CHARACTERIZATION_APPARATUS": "#4f5f76",
          "BRAND": "#e4decd",
          "CONDITION_TYPE": "#8bf0ba",
          "GAS": "#ffdc6a",
          "REFERENCE": "#feda6a"
          }

In [12]:
def ner_visualize(sentence, tags, colors=COLORS):
    """Render tagged tokens as highlighted text with displaCy.

    The tokens are joined with single spaces and every token whose tag starts
    with ``"B-"`` or ``"I-"`` becomes one highlighted span labelled with the
    upper-cased tag name (prefix removed). Tokens tagged ``"O"``, ``"X"`` or
    anything else without such a prefix are shown as plain text; slicing the
    prefix off those tags would otherwise yield a wrong or empty label.

    Args:
        sentence: List of token strings.
        tags: List of tag strings, parallel to ``sentence``.
        colors: Mapping from upper-cased label to a CSS background value.
    """
    ents = []
    start = 0
    for word, tag in zip(sentence, tags):
        end = start + len(word)  # exclusive character offset in the joined text
        if tag[:2] in ("B-", "I-"):
            ents.append({"start": start, "end": end, "label": tag[2:].upper()})
        start = end + 1  # skip the single space inserted by " ".join

    doc = [{"text": " ".join(sentence),
            "ents": ents,
            "title": None}]

    # Only labels that actually occur need to be enabled for rendering.
    options = {"ents": sorted({ent["label"] for ent in ents}), "colors": colors}
    displacy.render(doc, style="ent", manual=True, options=options)

In [13]:
# Render the tagged tokens of all example sentences as one highlighted text.
ner_visualize(results["words"], results["tags"])