<a href="https://colab.research.google.com/github/stwind/notebooks/blob/master/simple_ner_huggingface_ncbi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [1]:
#@title Dependencies
!pip install -Uq datasets transformers

[K     |████████████████████████████████| 245kB 5.2MB/s 
[K     |████████████████████████████████| 2.5MB 50.6MB/s 
[K     |████████████████████████████████| 122kB 53.3MB/s 
[K     |████████████████████████████████| 245kB 49.3MB/s 
[K     |████████████████████████████████| 901kB 51.6MB/s 
[K     |████████████████████████████████| 3.3MB 49.8MB/s 
[31mERROR: transformers 4.8.2 has requirement huggingface-hub==0.0.12, but you'll have huggingface-hub 0.0.13 which is incompatible.[0m
[?25h

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import io
import os
import re
import math
import time
import random
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import PIL
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as dset
import IPython.display
import datasets
import transformers
from fastprogress.fastprogress import progress_bar
from mpl_toolkits.mplot3d import Axes3D

from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, Trainer, TrainingArguments, AutoModelForTokenClassification, BertForTokenClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

sns.set('notebook', 'darkgrid', rc={
    'font.family': ['DejaVu Sans'],
    'font.sans-serif': ['Open Sans', 'Arial Unicode MS'],
    'font.size': 12,
    'figure.figsize': (8, 5),
    'grid.linewidth': 1,
    'grid.alpha': 0.5,
    'legend.fontsize': 10,
    'legend.frameon': True,
    'legend.framealpha': 0.6,
    'legend.handletextpad': 0.2,
    'lines.linewidth': 1,
    'axes.facecolor': '#fafafa',
    'axes.labelsize': 11,
    'axes.titlesize': 12,
    'axes.linewidth': 0.5,
    'xtick.labelsize': 11,
    'xtick.major.width': 0.5,
    'ytick.labelsize': 11,
    'ytick.major.width': 0.5,
    'figure.titlesize': 13,
})
plt.style.use("dark_background")

print("Infos:")
print("numpy: {}".format(np.__version__))
print("pandas: {}".format(pd.__version__))
print("seaborn: {}".format(sns.__version__))
print("matplotlib: {}".format(mpl.__version__))
print("pytorch: {}".format(torch.__version__))
print("huggingface transformers: {}".format(transformers.__version__))
print("huggingface datasets: {}".format(datasets.__version__))

for i in range(torch.cuda.device_count()):
    print("cuda device #{}: {}".format(i, torch.cuda.get_device_name(i)))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("current device type: {}".format(device.type))

!nvidia-smi

Infos:
numpy: 1.19.5
pandas: 1.1.5
seaborn: 0.11.1
matplotlib: 3.2.2
pytorch: 1.9.0+cu102
huggingface transformers: 4.8.2
huggingface datasets: 1.8.0
cuda device #0: Tesla P4
current device type: cuda
Thu Jul  1 08:05:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     7W /  75W |      2MiB /  7611MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+--------------

## Data

In [3]:
raw_dataset = load_dataset('ncbi_disease')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2279.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1549.0, style=ProgressStyle(description…


Downloading and preparing dataset ncbi_disease/ncbi_disease (download: 1.47 MiB, generated: 3.04 MiB, post-processed: Unknown size, total: 4.52 MiB) to /root/.cache/huggingface/datasets/ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=283883.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=51200.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=52411.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset ncbi_disease downloaded and prepared to /root/.cache/huggingface/datasets/ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03. Subsequent calls will reuse this data.


In [4]:
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

tokenizer = AutoTokenizer.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=337.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=226150.0, style=ProgressStyle(descripti…




In [5]:
def map_entities_to_tokens(items, encodings, ignore_index=-100):
    encoded_labels = []
    for item, encoding in zip(items, encodings):
        offsets = np.array(encoding.offsets)
        doc_enc_labels = np.ones(len(offsets), dtype=int) * ignore_index

        doc_enc_labels[(offsets[:,0] == 0) & (offsets[:,1] != 0)] = item['ner_tags']
        encoded_labels.append(doc_enc_labels)

    return np.stack(encoded_labels)

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, raw_dataset):
        texts = [item["tokens"] for item in raw_dataset]
        encodings = tokenizer(texts, padding=True, truncation=True, max_length=256, is_split_into_words=True, return_tensors='pt')
        self.encodings = encodings
        self.labels = torch.tensor(map_entities_to_tokens(raw_dataset, encodings.encodings))

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NERDataset(raw_dataset['train'])
val_dataset = NERDataset(raw_dataset['validation'])
test_dataset = NERDataset(raw_dataset['test'])

In [6]:
print(tokenizer.decode(train_dataset[0]['input_ids'], skip_special_tokens=False))
list(zip(train_dataset.encodings[0].tokens, train_dataset[0]['labels']))[:20]

[CLS] identification of apc2, a homologue of the adenomatous polyposis coli tumour suppressor. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


[('[CLS]', tensor(-100)),
 ('identification', tensor(0)),
 ('of', tensor(0)),
 ('apc', tensor(0)),
 ('##2', tensor(-100)),
 (',', tensor(0)),
 ('a', tensor(0)),
 ('homologue', tensor(0)),
 ('of', tensor(0)),
 ('the', tensor(0)),
 ('adenomatous', tensor(1)),
 ('polyposis', tensor(2)),
 ('coli', tensor(2)),
 ('tumour', tensor(2)),
 ('suppressor', tensor(0)),
 ('.', tensor(0)),
 ('[SEP]', tensor(-100)),
 ('[PAD]', tensor(-100)),
 ('[PAD]', tensor(-100)),
 ('[PAD]', tensor(-100))]

## Training

In [8]:
def compute_metrics(pred):
    labels, preds = pred.label_ids, pred.predictions.argmax(-1)

    mask = (labels != -100) & ((labels != 0) | (preds != 0))
    precision, recall, f1, _ = precision_recall_fscore_support(labels[mask], preds[mask], average='micro')

    mask = labels != -100
    acc = accuracy_score(labels[mask], preds[mask])
    return {'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall}

In [10]:
all_labels = list(set([label for item in raw_dataset["train"] for label in item["ner_tags"]]))
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(all_labels))

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=len(train_dataset) // 8,
    weight_decay=0.01,
    logging_dir='./logs',
    log_level="warning",
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    no_cuda=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440472042.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForToken

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
200,No log,0.08113,0.974843,0.697744,0.697744,0.697744
400,No log,0.061751,0.982811,0.789366,0.789366,0.789366
600,0.185400,0.051056,0.98448,0.817378,0.817378,0.817378
800,0.185400,0.062748,0.985898,0.8286,0.8286,0.8286
1000,0.042700,0.074588,0.982227,0.778931,0.778931,0.778931
1200,0.042700,0.049116,0.98594,0.82876,0.82876,0.82876
1400,0.042700,0.045392,0.988193,0.857789,0.857789,0.857789
1600,0.027700,0.060122,0.986232,0.833165,0.833165,0.833165
1800,0.027700,0.058129,0.987734,0.850458,0.850458,0.850458
2000,0.012100,0.054028,0.987984,0.854251,0.854251,0.854251


TrainOutput(global_step=2040, training_loss=0.06599473932794496, metrics={'train_runtime': 736.8958, 'train_samples_per_second': 22.118, 'train_steps_per_second': 2.768, 'total_flos': 1469586210067260.0, 'train_loss': 0.06599473932794496, 'epoch': 3.0})

In [11]:
trainer.evaluate(test_dataset)

{'epoch': 3.0,
 'eval_accuracy': 0.985712536228926,
 'eval_f1': 0.8453380468404772,
 'eval_loss': 0.045956481248140335,
 'eval_precision': 0.8453380468404772,
 'eval_recall': 0.8453380468404772,
 'eval_runtime': 8.3893,
 'eval_samples_per_second': 112.167,
 'eval_steps_per_second': 14.066}

## Results

In [12]:
model = AutoModelForTokenClassification.from_pretrained("results/checkpoint-2000")
nlp = pipeline("ner", tokenizer=tokenizer, model=model)

In [68]:
print(raw_dataset["test"][0])

res = nlp(raw_dataset["test"][0]["tokens"])
res[:10]

{'id': '0', 'ner_tags': [0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 2, 2, 2, 2, 0], 'tokens': ['Clustering', 'of', 'missense', 'mutations', 'in', 'the', 'ataxia', '-', 'telangiectasia', 'gene', 'in', 'a', 'sporadic', 'T', '-', 'cell', 'leukaemia', '.']}


[[{'end': 10,
   'entity': 'LABEL_0',
   'index': 1,
   'score': 0.9998749,
   'start': 0,
   'word': 'clustering'}],
 [{'end': 2,
   'entity': 'LABEL_0',
   'index': 1,
   'score': 0.99975467,
   'start': 0,
   'word': 'of'}],
 [{'end': 8,
   'entity': 'LABEL_0',
   'index': 1,
   'score': 0.9999472,
   'start': 0,
   'word': 'missense'}],
 [{'end': 9,
   'entity': 'LABEL_0',
   'index': 1,
   'score': 0.9999426,
   'start': 0,
   'word': 'mutations'}],
 [{'end': 2,
   'entity': 'LABEL_0',
   'index': 1,
   'score': 0.9997902,
   'start': 0,
   'word': 'in'}],
 [{'end': 3,
   'entity': 'LABEL_0',
   'index': 1,
   'score': 0.9997935,
   'start': 0,
   'word': 'the'}],
 [{'end': 6,
   'entity': 'LABEL_1',
   'index': 1,
   'score': 0.9525135,
   'start': 0,
   'word': 'ataxia'}],
 [{'end': 1,
   'entity': 'LABEL_0',
   'index': 1,
   'score': 0.9999193,
   'start': 0,
   'word': '-'}],
 [{'end': 10,
   'entity': 'LABEL_1',
   'index': 1,
   'score': 0.99511755,
   'start': 0,
   'word'

In [69]:
from spacy.displacy import EntityRenderer

def make_spacy_entity(results):
    ents = []
    text = ""
    for spans in results:
        word = spans[0]['word']
        if spans[0]['entity'] == "LABEL_0":
            pass
        elif spans[0]['entity'] == "LABEL_1":
            for span in spans[1:]:
                word += span['word'][2:]
            ent = {"label": "DISEASE", "start": len(text), "end": len(text) + spans[-1]['end'] + 1}
            ents.append(ent)

        if text:
            text += " "
        text += word
    return text, ents

er = EntityRenderer()

In [81]:
page = ''

for i in range(10):
    res = nlp(raw_dataset["test"][i]["tokens"])
    text, ents = make_spacy_entity(res)

    page += er.render([{"text": text, "ents": ents}])

IPython.display.HTML(page)