# Install packages

This version achieves a higher F1 score than v1 by assigning tag "X" to the splitted words that are not in the vocabulary of the "bert-base-uncased" model, therefore resolving the mismatch between word and corresponding labels. However, because of the newly introduced tag "X" and the large number of science "jargons" that the tokenizer does not recognizes, there are a lot "X" tags being present in the modified labels and the model is over-predicting them. (more false positives judging from the high recall and low precision score for "X" in the classification report)

In [0]:
!pip3 install pytorch-transformers

In [0]:
!pip3 install seqeval

In [0]:
!pip3 install spacy

In [0]:
!pip3 install sklearn-crfsuite

In [5]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm, trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_transformers import BertTokenizer, BertConfig, BertForTokenClassification, AdamW
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report, flat_f1_score

Using TensorFlow backend.


# Load data

In [0]:
url = "https://raw.githubusercontent.com/rpw199912j/MatBERT/master/mat_ner_IOB_underscore.csv"
data = pd.read_csv(url).fillna("O")

In [7]:
data.head(10)

Unnamed: 0,Sentence #,Word,Tag
0,1,Magnetic,O
1,1,and,O
2,1,thermoelectric,O
3,1,properties,O
4,1,of,O
5,1,the,O
6,1,ternary,O
7,1,pseudo-hollandite,O
8,1,BaxCr5Se8,B-target
9,1,(,O


# Pre-processing
## Get the sentece and labels

In [0]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        self.grouped = self.data.groupby("Sentence #").apply(
            lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                              s["Tag"].values.tolist())])
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [9]:
getter = SentenceGetter(data)
print(getter.grouped)

Sentence #
1       [(Magnetic, O), (and, O), (thermoelectric, O),...
2       [(In, O), (order, O), (to, O), (prevent, O), (...
3       [(The, O), (elements, O), (,, O), (Ba, B-precu...
4       [(The, O), (synthesis, O), (of, O), (the, O), ...
5       [(BaSe, B-material), (was, O), (prepared, B-op...
6       [(The, O), (mechanical, B-operation), (alloyin...
7       [(BaxCr5Se8, B-target), (was, O), (then, O), (...
8       [(In, O), (order, O), (to, O), (obtain, O), (5...
9       [(The, O), (mixture, B-unspecified_material), ...
10      [(The, O), (powder, B-unspecified_material), (...
11      [(About, O), (5, B-number), (g, B-amount_unit)...
12      [(The, O), (temperature, O), (was, O), (raised...
13      [(The, O), (pressure, O), (was, O), (raised, B...
14      [(Insights, O), (into, O), (the, O), (stabilit...
15      [(Au, B-target), (was, O), (deposited, B-opera...
16      [(This, O), (solution, B-unspecified_material)...
17      [(After, O), (stirring, B-operation), (for, O)...
18 

### Take a look at the first sentence in the data

In [10]:
sentences = [" ".join([s[0].lower() for s in sent]) for sent in getter.sentences]
print(sentences[0])

magnetic and thermoelectric properties of the ternary pseudo-hollandite baxcr5se8 ( 0.5 < x < 0.55 ) solid solution .


### Get the word-level label

In [11]:
labels = [[s[1] for s in sent] for sent in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-target', 'O', 'B-number', 'O', 'O', 'O', 'B-number', 'O', 'O', 'O', 'O']


Create a dictionary that maps each word label into a number

In [12]:
tags_vals = list(set(data["Tag"].values))
tags_vals.append("X")
print(tags_vals)
tag2idx = {t: i for i, t in enumerate(tags_vals)}
print(tag2idx)

['I-apparatus_property_type', 'B-apparatus_descriptor', 'B-amount_misc', 'B-number', 'I-meta', 'I-number', 'I-characterization_apparatus', 'B-operation', 'B-brand', 'I-condition_misc', 'B-gas', 'B-target', 'B-condition_type', 'I-material', 'I-condition_type', 'B-reference', 'B-precursor', 'B-amount_unit', 'I-amount_misc', 'I-reference', 'B-nonrecipe_material', 'B-property_type', 'B-synthesis_apparatus', 'I-precursor', 'I-target', 'B-unspecified_material', 'I-gas', 'B-condition_unit', 'I-brand', 'B-property_misc', 'I-apparatus_unit', 'I-synthesis_apparatus', 'B-apparatus_unit', 'B-material_descriptor', 'B-condition_misc', 'I-property_unit', 'B-solvent', 'I-condition_unit', 'B-material', 'I-solvent', 'I-operation', 'B-meta', 'I-property_misc', 'I-apparatus_descriptor', 'B-apparatus_property_type', 'B-property_unit', 'I-unspecified_material', 'I-amount_unit', 'I-property_type', 'B-characterization_apparatus', 'O', 'I-nonrecipe_material', 'I-material_descriptor', 'X']
{'I-apparatus_propert

# Apply BERT model
## Set constants and GPU processor

In [0]:
MAX_LEN = 64
BATCH_SIZE = 32

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("The number of GPU: {}".format(n_gpu))

The number of GPU: 1


In [15]:
torch.cuda.get_device_name(0) 

'Tesla K80'

## Get the pre-trained uncased word embeddings

In [0]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
# tokenizer.add_tokens(["stirrer", "teflon", "autoclave", "degc"])

## Tokenize all the sentences

In [17]:
tokenizer.tokenize("stirrer")

['stir', '##rer']

In [18]:
tokenized_text = []
mylabels = []
for sent, tags in zip(sentences,labels):
  BERT_texts = []
  BERT_labels = np.array([])
  for word, tag in zip(sent.split(),tags):
    sub_words = tokenizer.wordpiece_tokenizer.tokenize(word)
    tags = np.array([tag for x in sub_words])
    tags[1:] = "X"
    BERT_texts += sub_words
    BERT_labels = np.append(BERT_labels,tags)
  tokenized_text.append(BERT_texts)
  mylabels.append(list(BERT_labels))

print(tokenized_text[1431])
print(mylabels[1431])

['after', 'further', 'stirring', 'with', 'a', 'magnetic', 'stir', '##rer', 'for', '6', 'h', 'at', 'room', 'temperature', ',', 'the', 'gel', 'mixture', 'was', 'transferred', 'to', 'a', 'te', '##fl', '##on', '-', 'lined', 'stainless', '-', 'steel', 'auto', '##clave', 'and', 'heated', 'at', '140', 'de', '##gc', 'for', '9', 'days', 'under', 'tumbling', '(', '60', 'rpm', ')', '.']
['O', 'O', 'B-operation', 'O', 'O', 'B-synthesis_apparatus', 'I-synthesis_apparatus', 'X', 'O', 'B-number', 'B-condition_unit', 'O', 'B-condition_misc', 'I-condition_misc', 'O', 'O', 'B-unspecified_material', 'I-unspecified_material', 'O', 'B-operation', 'O', 'O', 'B-apparatus_descriptor', 'X', 'X', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'B-synthesis_apparatus', 'X', 'O', 'B-operation', 'O', 'B-number', 'B-condition_unit', 'X', 'O', 'B-number', 'B-condition_unit', 'O', 'B-operation', 'O', 'B-number', 'B-condition_unit', 'O',

In [0]:
# print(sentences[1431])
# tokenized_text = [tokenizer.tokenize(sent) for sent in sentences]
# print(tokenized_text[0])
# print(labels[0])
# print(len(labels[0]))

### Pad all the tokenized sentences and labels to the same length

In [20]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_text],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in mylabels],
                     maxlen=MAX_LEN, value=tag2idx["O"], dtype="long", truncating="post", padding="post")

print(input_ids[0])
print(tags[0])

[ 8060  1998  1996 10867  8913  2571 22601  5144  1997  1996 28774 24041
 18404 29624 14854  3122  4221  8670  2595 26775  2629  3366  2620  1006
  1014 29625  2629  1026  1060  1026  1014 29625 24087  1007  5024  5576
  1012     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
[50 50 50 53 53 53 53 50 50 50 50 53 50 53 53 53 53 11 53 53 53 53 53 50
  3 53 53 50 50 50  3 53 53 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50
 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50]


### Create attention masks for the attention model

In [21]:
attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]
print(attention_masks)

[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.

## Split the data into 90% training set and 10% validation set

In [0]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

## Convert the data into Torch tensor format for later processing

In [0]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

### Define the training and validation data in the DataLoader for NLP model

In [0]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=BATCH_SIZE)

# Finetuning the BERT model

In [0]:
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))
# model.resize_token_embeddings(len(tokenizer))

## Load the data into GPU

In [26]:
model.cuda()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate):

In [0]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
# USE ADAM for gradient descent
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

## Define metrics for finetuning

In [0]:
from seqeval.metrics import f1_score, classification_report

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [0]:
def flat_accuracy_no_x(preds, labels):
  pred_flat = np.argmax(preds, axis=2).flatten()
  labels_flat = labels.flatten()
  labels_non_x_indices = (labels_flat != tag2idx["X"])
  labels_non_x = labels_flat[labels_non_x_indices]
  pred_non_x = pred_flat[labels_non_x_indices]
  return np.sum(pred_non_x == labels_non_x) / len(labels_non_x)

In [0]:
epochs = 5
max_grad_norm = 1.0

In [35]:
counter = 0
s_labels = None
for batch in train_dataloader:
  if counter > 0:
    break
  _,_, s_labels = batch
  counter += 1
print(s_labels)

tensor([[50, 50, 50,  ..., 50, 50, 50],
        [50, 50, 50,  ..., 50, 50, 50],
        [50, 50,  1,  ..., 50, 50, 50],
        ...,
        [50, 22, 50,  ..., 50, 50, 50],
        [11, 53, 50,  ..., 50, 50, 50],
        [50, 25, 50,  ..., 50, 50, 50]])


In [36]:
for _ in trange(epochs, desc="Epoch"):
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # Use batch training to speed up training speed
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
#         b_labels[b_labels == tag2idx["X"]] = -1
        # Forward-prop pass and loss computing
        loss, _ = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # Back-prop
        loss.backward()
        # track training loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # Gradient clipping to prevent gradient explosion
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), 
                                       max_norm=max_grad_norm)
        # Update parameters
        optimizer.step()
        model.zero_grad()
    print("Avg Training Loss Per Epoch: {}".format(tr_loss/nb_tr_steps))

    # Validation
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
#         b_labels[b_labels == tag2idx["X"]] = -1
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
            tmp_eval_loss, logits = outputs[:2]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to("cpu").numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)

        tmp_eval_accuracy = flat_accuracy_no_x(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss / nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1 score: {}".format(f1_score(pred_tags, valid_tags)))


Epoch:   0%|          | 0/5 [00:00<?, ?it/s][A

Avg Training Loss Per Epoch: 0.7151541446076065



Epoch:  20%|██        | 1/5 [00:52<03:28, 52.16s/it][A

Validation loss: 0.5745105275085994
Validation accuracy: 0.516941509055956
F1 score: 0.6015037593984961
Avg Training Loss Per Epoch: 0.4845190233871585



Epoch:  40%|████      | 2/5 [01:44<02:36, 52.14s/it][A

Validation loss: 0.4408537915774754
Validation accuracy: 0.5121327931880274
F1 score: 0.6411807403822889
Avg Training Loss Per Epoch: 0.35937043829042403



Epoch:  60%|██████    | 3/5 [02:36<01:44, 52.13s/it][A

Validation loss: 0.3946198395320347
Validation accuracy: 0.47884085262399595
F1 score: 0.6675613029157008
Avg Training Loss Per Epoch: 0.27729045807338154



Epoch:  80%|████████  | 4/5 [03:28<00:52, 52.10s/it][A

Validation loss: 0.36052372200148447
Validation accuracy: 0.5205802067928297
F1 score: 0.6927050522867582
Avg Training Loss Per Epoch: 0.21796557453812146



Epoch: 100%|██████████| 5/5 [04:20<00:00, 52.08s/it][A
[A

Validation loss: 0.3758720798151834
Validation accuracy: 0.4986781833235202
F1 score: 0.6710971483808602


## Evaluate the model

In [37]:
model.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
#     print(b_input_ids)
#     print(b_input_ids[1,:].tolist())
    with torch.no_grad():
        tmp_eval_loss, logits = model(b_input_ids, token_type_ids=None,
                                      attention_mask=b_input_mask, labels=b_labels)[:2]
        
    logits = logits.detach().cpu().numpy()
#     print(logits)
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    tmp_eval_accuracy = flat_accuracy_no_x(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

Validation loss: 0.3758720798151834
Validation Accuracy: 0.4986781833235202
Validation F1-Score: 0.6710971483808602


In [0]:
valid_tags[0] == "X"

In [38]:
print("{}".format(classification_report(valid_tags, pred_tags)))

                            precision    recall  f1-score   support

                     brand       0.58      0.81      0.67        52
                    number       0.80      0.98      0.88       363
       material_descriptor       0.25      0.75      0.37       146
                 operation       0.59      0.91      0.72       388
                         X       0.65      1.00      0.79      1002
             property_misc       0.37      0.38      0.38        73
                  material       0.19      0.30      0.24        79
            condition_unit       0.79      0.96      0.87       160
       synthesis_apparatus       0.36      0.67      0.47        48
               amount_unit       0.77      0.97      0.86       135
               amount_misc       0.27      0.68      0.39        22
                      meta       0.40      0.60      0.48        30
                 precursor       0.72      0.76      0.74       180
        nonrecipe_material       0.58      0.51

In [39]:
tags_vals.remove("O")
tags_vals.remove("X")
sorted_labels = sorted(tags_vals)
print("{}".format(flat_classification_report(valid_tags, pred_tags, labels=sorted_labels)))

                              precision    recall  f1-score   support

               B-amount_misc       0.27      0.71      0.39        17
               B-amount_unit       0.84      0.98      0.90       121
      B-apparatus_descriptor       0.42      0.57      0.48        14
   B-apparatus_property_type       0.00      0.00      0.00         2
            B-apparatus_unit       0.78      0.78      0.78         9
                     B-brand       0.73      0.88      0.80        34
B-characterization_apparatus       0.57      0.53      0.55        15
            B-condition_misc       0.59      0.87      0.70        61
            B-condition_type       0.27      0.50      0.35        12
            B-condition_unit       0.79      0.97      0.87       154
                       B-gas       0.66      0.88      0.75        26
                  B-material       0.23      0.29      0.26        69
       B-material_descriptor       0.28      0.83      0.42       138
                   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [46]:
print(pred_tags[1][:43])

['O', 'O', 'B-operation', 'O', 'O', 'B-synthesis_apparatus', 'B-synthesis_apparatus', 'X', 'O', 'B-number', 'B-condition_unit', 'O', 'B-condition_misc', 'I-condition_misc', 'O', 'O', 'B-unspecified_material', 'B-unspecified_material', 'O', 'B-operation', 'O', 'O', 'B-apparatus_descriptor', 'X', 'X', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'B-synthesis_apparatus', 'X', 'O', 'B-operation', 'O', 'B-number', 'B-condition_unit', 'X', 'O', 'B-number', 'B-condition_unit', 'O', 'B-operation']


In [47]:
print(valid_tags[1][:43])

['O', 'O', 'B-operation', 'O', 'O', 'B-synthesis_apparatus', 'I-synthesis_apparatus', 'X', 'O', 'B-number', 'B-condition_unit', 'O', 'B-condition_misc', 'I-condition_misc', 'O', 'O', 'B-unspecified_material', 'I-unspecified_material', 'O', 'B-operation', 'O', 'O', 'B-apparatus_descriptor', 'X', 'X', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'B-synthesis_apparatus', 'X', 'O', 'B-operation', 'O', 'B-number', 'B-condition_unit', 'X', 'O', 'B-number', 'B-condition_unit', 'O', 'B-operation']


In [0]:
ids = [2044, 2582, 18385, 2007, 1037, 8060, 16130, 14544, 2005, 1020, 1044, 2012, 2282, 4860, 1010, 1996, 21500, 8150, 2001, 4015, 2000, 1037, 8915, 10258, 2239, 1011, 7732, 18676, 1011, 3886, 8285, 23650, 1998, 9685, 2012, 8574, 2139, 18195, 2005, 1023, 2420, 2104, 21552, 1006, 3438, 11575, 1007, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [43]:
print(tokenizer.convert_ids_to_tokens(ids[:43]))

['after', 'further', 'stirring', 'with', 'a', 'magnetic', 'stir', '##rer', 'for', '6', 'h', 'at', 'room', 'temperature', ',', 'the', 'gel', 'mixture', 'was', 'transferred', 'to', 'a', 'te', '##fl', '##on', '-', 'lined', 'stainless', '-', 'steel', 'auto', '##clave', 'and', 'heated', 'at', '140', 'de', '##gc', 'for', '9', 'days', 'under', 'tumbling']


# NER Visualization

In [1]:
from spacy import displacy

In [48]:
tags_uppercase = [tag.upper() for tag in tags_vals]
print(tags_uppercase)
print(len(tags_uppercase))

['I-APPARATUS_PROPERTY_TYPE', 'B-APPARATUS_DESCRIPTOR', 'B-AMOUNT_MISC', 'B-NUMBER', 'I-META', 'I-NUMBER', 'I-CHARACTERIZATION_APPARATUS', 'B-OPERATION', 'B-BRAND', 'I-CONDITION_MISC', 'B-GAS', 'B-TARGET', 'B-CONDITION_TYPE', 'I-MATERIAL', 'I-CONDITION_TYPE', 'B-REFERENCE', 'B-PRECURSOR', 'B-AMOUNT_UNIT', 'I-AMOUNT_MISC', 'I-REFERENCE', 'B-NONRECIPE_MATERIAL', 'B-PROPERTY_TYPE', 'B-SYNTHESIS_APPARATUS', 'I-PRECURSOR', 'I-TARGET', 'B-UNSPECIFIED_MATERIAL', 'I-GAS', 'B-CONDITION_UNIT', 'I-BRAND', 'B-PROPERTY_MISC', 'I-APPARATUS_UNIT', 'I-SYNTHESIS_APPARATUS', 'B-APPARATUS_UNIT', 'B-MATERIAL_DESCRIPTOR', 'B-CONDITION_MISC', 'I-PROPERTY_UNIT', 'B-SOLVENT', 'I-CONDITION_UNIT', 'B-MATERIAL', 'I-SOLVENT', 'I-OPERATION', 'B-META', 'I-PROPERTY_MISC', 'I-APPARATUS_DESCRIPTOR', 'B-APPARATUS_PROPERTY_TYPE', 'B-PROPERTY_UNIT', 'I-UNSPECIFIED_MATERIAL', 'I-AMOUNT_UNIT', 'I-PROPERTY_TYPE', 'B-CHARACTERIZATION_APPARATUS', 'I-NONRECIPE_MATERIAL', 'I-MATERIAL_DESCRIPTOR']
52


In [2]:
COLORS = {"AMOUNT_MISC": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
          "NUMBER": "linear-gradient(90deg, orange, cyan)",
          "AMOUNT_UNIT": "linear-gradient(90deg, red, orange)",
          "PROPERTY_MISC": "linear-gradient(90deg, purple 40%, yellow)",
          "MATERIAL": "#aa9cfc",
          "NONRECIPE_MATERIAL": "red",
          "TARGET": "#a4893d",
          "META": "yellow",
          "UNSPECIFIED_MATERIAL": "#0074D9",
          "APPARATUS_UNIT": "linear-gradient(90deg, #e66465, #9198e5)",
          "MATERIAL_DESCRIPTOR": "#9198e5",
          "SOLVENT": "#e66465",
          "PROPERTY_TYPE": "brown",
          "PRECURSOR": "pink",
          "CONDITION_MISC": "#fc9ce7",
          "APPARATUS_PROPERTY_TYPE": "orange",
          "PROPERTY_UNIT": "linear-gradient(217deg, rgba(255,0,0,.8), rgba(255,0,0,0) 70.71%)",
          "CONDITION_UNIT": "linear-gradient(217deg, rgba(800,0,0,.8), yellow 70.71%)",
          "APPARATUS_DESCRIPTOR": "#fea49f",
          "SYNTHESIS_APPARATUS": "#bf4aa8",
          "OPERATION": "#9e363a",
          "CHARACTERIZATION_APPARATUS": "#4f5f76",
          "BRAND": "#e4decd",
          "CONDITION_TYPE": "#8bf0ba",
          "GAS": "#ffdc6a",
          "REFERENCE": "#feda6a"
          }

In [3]:
def ner_visualize(sentence, tags, colors=COLORS):
    sentence_concat = " ".join(sentence)
    ents = []
    start = 0
    end = 0
    for word, tag in zip(sentence, tags):
        end = start + len(word) - 1
        ents.append({"start": start, "end": end+1,
                     "label": tag[2:].upper() if tag != "O" else tag.upper()})
        start = end + 2

    test = [{"text": sentence_concat,
             "ents": ents,
             "title": None}]

    options = {"ents": [tag[2:].upper() for tag in set(tags) if tag not in ["O"]], "colors": colors}
    displacy.render(test, style="ent", manual=True, options=options)

## Example labels

In [4]:
text = ['after', 'further', 'stirring', 'with', 'a', 'magnetic', 'stir', '##rer', 'for', '6', 'h', 'at', 'room', 'temperature', ',', 'the', 'gel', 'mixture', 'was', 'transferred', 'to', 'a', 'te', '##fl', '##on', '-', 'lined', 'stainless', '-', 'steel', 'auto', '##clave', 'and', 'heated', 'at', '140', 'de', '##gc', 'for', '9', 'days', 'under', 'tumbling']
true_labels = ['O', 'O', 'B-operation', 'O', 'O', 'B-synthesis_apparatus', 'I-synthesis_apparatus', 'X', 'O', 'B-number', 'B-condition_unit', 'O', 'B-condition_misc', 'I-condition_misc', 'O', 'O', 'B-unspecified_material', 'I-unspecified_material', 'O', 'B-operation', 'O', 'O', 'B-apparatus_descriptor', 'X', 'X', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'B-synthesis_apparatus', 'X', 'O', 'B-operation', 'O', 'B-number', 'B-condition_unit', 'X', 'O', 'B-number', 'B-condition_unit', 'O', 'B-operation']
pred_labels = ['O', 'O', 'B-operation', 'O', 'O', 'B-synthesis_apparatus', 'B-synthesis_apparatus', 'X', 'O', 'B-number', 'B-condition_unit', 'O', 'B-condition_misc', 'I-condition_misc', 'O', 'O', 'B-unspecified_material', 'B-unspecified_material', 'O', 'B-operation', 'O', 'O', 'B-apparatus_descriptor', 'X', 'X', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'I-apparatus_descriptor', 'B-synthesis_apparatus', 'X', 'O', 'B-operation', 'O', 'B-number', 'B-condition_unit', 'X', 'O', 'B-number', 'B-condition_unit', 'O', 'B-operation']

### True labels

In [5]:
ner_visualize(text, true_labels)

### Predicted labels
The color coding shows that the model makes a perfect prediction for this sentence, but in reality, the model makes a wrong prediction for the word "stir" in the model.

In [6]:
ner_visualize(text, pred_labels)