# Install packages

In [0]:
!pip3 install pytorch-transformers

In [0]:
!pip3 install seqeval

In [0]:
!pip3 install spacy

In [4]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm, trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_transformers import BertTokenizer, BertConfig, BertForTokenClassification, AdamW

Using TensorFlow backend.


# Load data

In [0]:
url = "https://raw.githubusercontent.com/rpw199912j/MatBERT/master/mat_ner.csv"
data = pd.read_csv(url).fillna("O")

In [6]:
data.head(10)

Unnamed: 0,Sentence #,Word,Tag
0,1,Magnetic,O
1,1,and,O
2,1,thermoelectric,O
3,1,properties,O
4,1,of,O
5,1,the,O
6,1,ternary,O
7,1,pseudo-hollandite,O
8,1,BaxCr5Se8,target
9,1,(,O


# Pre-processing
## Get the sentece and labels

In [0]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        self.grouped = self.data.groupby("Sentence #").apply(
            lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                              s["Tag"].values.tolist())])
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [0]:
getter = SentenceGetter(data)

### Take a look at the first sentence in the data

In [9]:
sentences = [" ".join([s[0].lower() for s in sent]) for sent in getter.sentences]
print(sentences[0])

magnetic and thermoelectric properties of the ternary pseudo-hollandite baxcr5se8 ( 0.5 < x < 0.55 ) solid solution .


### Get the word-level label

In [10]:
labels = [[s[1] for s in sent] for sent in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'target', 'O', 'number', 'O', 'O', 'O', 'number', 'O', 'O', 'O', 'O']


Create a dictionary that maps each word label into a number

In [11]:
tags_vals = list(set(data["Tag"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}
print(tag2idx)

{'number': 0, 'O': 1, 'property-misc': 2, 'operation': 3, 'apparatus-property-type': 4, 'material': 5, 'unspecified-material': 6, 'precursor': 7, 'condition-unit': 8, 'property-type': 9, 'meta': 10, 'target': 11, 'amount-misc': 12, 'apparatus-descriptor': 13, 'brand': 14, 'nonrecipe-material': 15, 'solvent': 16, 'condition-misc': 17, 'synthesis-apparatus': 18, 'amount-unit': 19, 'reference': 20, 'apparatus-unit': 21, 'characterization-apparatus': 22, 'material-descriptor': 23, 'condition-type': 24, 'property-unit': 25, 'gas': 26}


# Apply BERT model
## Set constants and GPU processor

In [0]:
MAX_LEN = 64
BATCH_SIZE = 32

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("The number of GPU: {}".format(n_gpu))

The number of GPU: 1


In [14]:
torch.cuda.get_device_name(0) 

'Tesla T4'

## Get the pre-trained uncased word embeddings

In [15]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
tokenizer.add_tokens(["stirrer", "teflon", "autoclave", "degc"])

4

## Tokenize all the sentences

In [16]:
print(sentences[1431])
tokenized_text = [tokenizer.tokenize(sent) for sent in sentences]
print(tokenized_text[1431])

after further stirring with a magnetic stirrer for 6 h at room temperature , the gel mixture was transferred to a teflon - lined stainless - steel autoclave and heated at 140 degc for 9 days under tumbling ( 60 rpm ) .
['after', 'further', 'stirring', 'with', 'a', 'magnetic', 'stirrer', 'for', '6', 'h', 'at', 'room', 'temperature', ',', 'the', 'gel', 'mixture', 'was', 'transferred', 'to', 'a', 'teflon', '-', 'lined', 'stainless', '-', 'steel', 'autoclave', 'and', 'heated', 'at', '140', 'degc', 'for', '9', 'days', 'under', 'tumbling', '(', '60', 'rpm', ')', '.']


### Pad all the tokenized sentences and labels to the same length

In [17]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_text],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], dtype="long", truncating="post", padding="post")

print(input_ids[0])
print(tags[0])

[ 8060  1998  1996 10867  8913  2571 22601  5144  1997  1996 28774 24041
 18404  1011  7935  4221  8670  2595 26775  2629  3366  2620  1006  1014
  1012  1019  1026  1060  1026  1014  1012  4583  1007  5024  5576  1012
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
[ 1  1  1  1  1  1  1  1 11  1  0  1  1  1  0  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1]


### Create attention masks for the attention model

In [0]:
attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]

## Split the data into 90% training set and 10% validation set

In [0]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

## Convert the data into Torch tensor format for later processing

In [0]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

### Define the training and validation data in the DataLoader for NLP model

In [0]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=BATCH_SIZE)

# Finetuning the BERT model

In [22]:
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))
model.resize_token_embeddings(len(tokenizer))

Embedding(30526, 768)

## Load the data into GPU

In [23]:
model.cuda()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30526, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertIntermedia

In [0]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
# USE ADAM for gradient descent
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

## Define metrics for finetuning

In [0]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [0]:
epochs = 5
max_grad_norm = 1.0

In [27]:
for _ in trange(epochs, desc="Epoch"):
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # Use batch training to speed up training speed
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Forward-prop pass and loss computing
        loss, _ = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # Back-prop
        loss.backward()
        # track training loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # Gradient clipping to prevent gradient explosion
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), 
                                       max_norm=max_grad_norm)
        # Update parameters
        optimizer.step()
        model.zero_grad()
    print("Avg Training Loss Per Epoch: {}".format(tr_loss/nb_tr_steps))

    # Validation
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
            tmp_eval_loss, logits = outputs[:2]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to("cpu").numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss / nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1 score: {}".format(f1_score(pred_tags, valid_tags)))

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Avg Training Loss Per Epoch: 1.7151688298241037


Epoch:  20%|██        | 1/5 [00:26<01:46, 26.64s/it]

Validation loss: 1.4318931443350655
Validation accuracy: 0.80447265625
F1 score: 0.004655493482309125
Avg Training Loss Per Epoch: 1.3479581445944113


Epoch:  40%|████      | 2/5 [00:54<01:20, 26.88s/it]

Validation loss: 1.2369588613510132
Validation accuracy: 0.7838337053571429
F1 score: 0.11196105702364394
Avg Training Loss Per Epoch: 1.1667899206036427


Epoch:  60%|██████    | 3/5 [01:21<00:54, 27.15s/it]

Validation loss: 1.1222437449863978
Validation accuracy: 0.7748939732142858
F1 score: 0.15112651646447142
Avg Training Loss Per Epoch: 1.049361505469338


Epoch:  80%|████████  | 4/5 [01:49<00:27, 27.20s/it]

Validation loss: 1.06821186201913
Validation accuracy: 0.7451757812500001
F1 score: 0.1746617466174662
Avg Training Loss Per Epoch: 0.9476961786629724


Epoch: 100%|██████████| 5/5 [02:16<00:00, 27.22s/it]

Validation loss: 1.006590826170785
Validation accuracy: 0.7338002232142857
F1 score: 0.1978655898471301





## Evaluate the model

In [28]:
model.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    print(b_input_ids)
    print(b_input_ids[1,:].tolist())
    with torch.no_grad():
        tmp_eval_loss, logits = model(b_input_ids, token_type_ids=None,
                                      attention_mask=b_input_mask, labels=b_labels)[:2]
        
    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l ]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

tensor([[ 3525,  1010,  1996,  ...,     0,     0,     0],
        [ 2044,  2582, 18385,  ...,     0,     0,     0],
        [ 1996,  9059,  3815,  ...,     0,     0,     0],
        ...,
        [ 2005, 29248,  2487,  ...,     0,     0,     0],
        [ 1061,  2497,  2509,  ...,     0,     0,     0],
        [ 8915,  6494,  8458,  ...,  2656,  1033,  1012]], device='cuda:0')
[2044, 2582, 18385, 2007, 1037, 8060, 30522, 2005, 1020, 1044, 2012, 2282, 4860, 1010, 1996, 21500, 8150, 2001, 4015, 2000, 1037, 30523, 1011, 7732, 18676, 1011, 3886, 30524, 1998, 9685, 2012, 8574, 30525, 2005, 1023, 2420, 2104, 21552, 1006, 3438, 11575, 1007, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
tensor([[ 3479,  9898,  2015,  ...,     0,     0,     0],
        [ 1996, 13749, 12868,  ...,     0,     0,     0],
        [ 1996, 28573,  2665,  ...,     0,     0,     0],
        ...,
        [ 2633,  1010,  1996,  ...,     0,     0,     0],
        [ 1996,  3988,  1998,  ...,     0,   

In [40]:
print(pred_tags[1][:43])

['O', 'O', 'operation', 'O', 'O', 'apparatus-descriptor', 'synthesis-apparatus', 'O', 'number', 'condition-unit', 'O', 'condition-misc', 'condition-misc', 'O', 'O', 'unspecified-material', 'unspecified-material', 'O', 'operation', 'O', 'O', 'apparatus-descriptor', 'apparatus-descriptor', 'apparatus-descriptor', 'apparatus-descriptor', 'apparatus-descriptor', 'apparatus-descriptor', 'synthesis-apparatus', 'O', 'operation', 'O', 'number', 'condition-unit', 'O', 'number', 'condition-unit', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [41]:
print(valid_tags[1][:43])

['O', 'O', 'operation', 'O', 'O', 'synthesis-apparatus', 'synthesis-apparatus', 'O', 'number', 'condition-unit', 'O', 'condition-misc', 'condition-misc', 'O', 'O', 'unspecified-material', 'unspecified-material', 'O', 'operation', 'O', 'O', 'apparatus-descriptor', 'apparatus-descriptor', 'apparatus-descriptor', 'apparatus-descriptor', 'apparatus-descriptor', 'apparatus-descriptor', 'synthesis-apparatus', 'O', 'operation', 'O', 'number', 'condition-unit', 'O', 'number', 'condition-unit', 'O', 'operation', 'O', 'number', 'condition-unit', 'O', 'O']


In [0]:
ids =  [2044, 2582, 18385, 2007, 1037, 8060, 30522, 2005, 1020, 1044, 2012, 2282, 4860, 1010, 1996, 21500, 8150, 2001, 4015, 2000, 1037, 30523, 1011, 7732, 18676, 1011, 3886, 30524, 1998, 9685, 2012, 8574, 30525, 2005, 1023, 2420, 2104, 21552, 1006, 3438, 11575, 1007, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [39]:
print(tokenizer.convert_ids_to_tokens(ids[:43]))

['after', 'further', 'stirring', 'with', 'a', 'magnetic', 'stirrer', 'for', '6', 'h', 'at', 'room', 'temperature', ',', 'the', 'gel', 'mixture', 'was', 'transferred', 'to', 'a', 'teflon', '-', 'lined', 'stainless', '-', 'steel', 'autoclave', 'and', 'heated', 'at', '140', 'degc', 'for', '9', 'days', 'under', 'tumbling', '(', '60', 'rpm', ')', '.']


# NER Visualization

In [0]:
from spacy import displacy

In [33]:
tags_uppercase = [tag.upper() for tag in tags_vals]
print(tags_uppercase)
print(len(tags_uppercase))

['NUMBER', 'O', 'PROPERTY-MISC', 'OPERATION', 'APPARATUS-PROPERTY-TYPE', 'MATERIAL', 'UNSPECIFIED-MATERIAL', 'PRECURSOR', 'CONDITION-UNIT', 'PROPERTY-TYPE', 'META', 'TARGET', 'AMOUNT-MISC', 'APPARATUS-DESCRIPTOR', 'BRAND', 'NONRECIPE-MATERIAL', 'SOLVENT', 'CONDITION-MISC', 'SYNTHESIS-APPARATUS', 'AMOUNT-UNIT', 'REFERENCE', 'APPARATUS-UNIT', 'CHARACTERIZATION-APPARATUS', 'MATERIAL-DESCRIPTOR', 'CONDITION-TYPE', 'PROPERTY-UNIT', 'GAS']
27


In [0]:
COLORS = {"AMOUNT-MISC": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
          "NUMBER": "linear-gradient(90deg, orange, cyan)",
          "AMOUNT-UNIT": "linear-gradient(90deg, red, orange)",
          "PROPERTY-MISC": "linear-gradient(90deg, purple 40%, yellow)",
          "MATERIAL": "#aa9cfc",
          "NONRECIPE-MATERIAL": "red",
          "TARGET": "#a4893d",
          "META": "yellow",
          "UNSPECIFIED-MATERIAL": "blue",
          "APPARATUS-UNIT": "linear-gradient(90deg, #e66465, #9198e5)",
          "MATERIAL-DESCRIPTOR": "#9198e5",
          "SOLVENT": "#e66465",
          "PROPERTY-TYPE": "brown",
          "PRECURSOR": "pink",
          "CONDITION-MISC": "#fc9ce7",
          "APPARATUS-PROPERTY-TYPE": "orange",
          "PROPERTY-UNIT": "linear-gradient(217deg, rgba(255,0,0,.8), rgba(255,0,0,0) 70.71%)",
          "CONDITION-UNIT": "linear-gradient(217deg, rgba(400,0,0,.8), rgba(50,0,0,0) 70.71%)",
          "APPARATUS-DESCRIPTOR": "#fea49f",
          "SYNTHESIS-APPARATUS": "#bf4aa8",
          "OPERATION": "#9e363a",
          "CHARACTERIZATION-APPARATUS": "#4f5f76", 
          "BRAND": "#e4decd",
          "CONDITION-TYPE": "#8bf0ba",
          "GAS": "#ffdc6a",
          "REFERENCE": "#feda6a"
         }

In [0]:
def ner_visualize(sentence, tags, colors=COLORS):
    sentence_concat = " ".join(sentence)
    ents = []
    start = 0
    end = 0
    for word, tag in zip(sentence, tags):
        end = start + len(word) - 1
        ents.append({"start": start, "end": end+1, "label": tag.upper()})
        start = end + 2

    test = [{"text": sentence_concat,
             "ents": ents,
             "title": None}]

    options = {"ents": [tag.upper() for tag in set(tags) if tag not in ["O"]], "colors": colors}
    displacy.render(test, style="ent", manual=True, options=options)

In [0]:
ner_visualize(["Compound A", "was", "made", "by", "compound B", "by", "heating",       "in", "the furncace",         "at", "300", "degree", "celsius", "."], 
              ["TARGET",     "O",   "O",    "O",  "MATERIAL",   "O",  "CONDITION-MISC","O",  "APPARATUS-DESCRIPTOR", "O",  "NUMBER", "O", "CONDITION-UNIT", "O"])