In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ner = pipeline("ner", aggregation_strategy='simple', device=0)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading (…)lve/main/config.json: 100%|██████████| 998/998 [00:00<00:00, 249kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading pytorch_model.bin: 100%|██████████| 1.33G/1.33G [02:50<00:00, 7.82MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 60.0/60.0 [00:00<00:00, 20.0kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 1.56MB/s]


In [3]:
import pickle

In [4]:
with open('ner_train.pkl', 'rb') as f:
  corpus_train = pickle.load(f)

with open('ner_test.pkl', 'rb') as f:
  corpus_test = pickle.load(f)

In [10]:
corpus_test[0:2]

[[('CRICKET', 'O'),
  ('-', 'O'),
  ('LEICESTERSHIRE', 'B-ORG'),
  ('TAKE', 'O'),
  ('OVER', 'O'),
  ('AT', 'O'),
  ('TOP', 'O'),
  ('AFTER', 'O'),
  ('INNINGS', 'O'),
  ('VICTORY', 'O'),
  ('.', 'O')],
 [('West', 'B-MISC'),
  ('Indian', 'I-MISC'),
  ('all-rounder', 'O'),
  ('Phil', 'B-PER'),
  ('Simmons', 'I-PER'),
  ('took', 'O'),
  ('four', 'O'),
  ('for', 'O'),
  ('38', 'O'),
  ('on', 'O'),
  ('Friday', 'O'),
  ('as', 'O'),
  ('Leicestershire', 'B-ORG'),
  ('beat', 'O'),
  ('Somerset', 'B-ORG'),
  ('by', 'O'),
  ('an', 'O'),
  ('innings', 'O'),
  ('and', 'O'),
  ('39', 'O'),
  ('runs', 'O'),
  ('in', 'O'),
  ('two', 'O'),
  ('days', 'O'),
  ('to', 'O'),
  ('take', 'O'),
  ('over', 'O'),
  ('at', 'O'),
  ('the', 'O'),
  ('head', 'O'),
  ('of', 'O'),
  ('the', 'O'),
  ('county', 'O'),
  ('championship', 'O'),
  ('.', 'O')]]

In [11]:
inputs = []
targets = []

for sentence_tag_pairs in corpus_test:
  tokens = []
  target = []
  for token, tag in sentence_tag_pairs:
    tokens.append(token)
    target.append(tag)
  inputs.append(tokens)
  targets.append(target)

In [15]:
print(inputs[9])

['He', 'was', 'well', 'backed', 'by', 'England', 'hopeful', 'Mark', 'Butcher', 'who', 'made', '70', 'as', 'Surrey', 'closed', 'on', '429', 'for', 'seven', ',', 'a', 'lead', 'of', '234', '.']


In [16]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
detokenizer = TreebankWordDetokenizer()

In [17]:
detokenizer.detokenize(inputs[9])

'He was well backed by England hopeful Mark Butcher who made 70 as Surrey closed on 429 for seven, a lead of 234.'

In [19]:
print(targets[9])

['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [20]:
ner(detokenizer.detokenize(inputs[9]))

[{'entity_group': 'LOC',
  'score': 0.99967515,
  'word': 'England',
  'start': 22,
  'end': 29},
 {'entity_group': 'PER',
  'score': 0.99974275,
  'word': 'Mark Butcher',
  'start': 38,
  'end': 50},
 {'entity_group': 'ORG',
  'score': 0.9996264,
  'word': 'Surrey',
  'start': 66,
  'end': 72}]

In [21]:
def compute_prediction(tokens, input_, ner_result):
  # map hugging face ner result to list of tags for later performance assessment
  # tokens is the original tokenized sentence
  # input_ is the detokenized string

  predicted_tags = []
  state = 'O' # keep track of state, so if O --> B, if B --> I, if I --> I
  current_index = 0
  for token in tokens:
    # find the token in the input_ (should be at or near the start)
    index = input_.find(token)
    assert(index >= 0)
    current_index += index # where we are currently pointing to

    # print(token, current_index) # debug

    # check if this index belongs to an entity and assign label
    tag = 'O'
    for entity in ner_result:
      if current_index >= entity['start'] and current_index < entity['end']:
        # then this token belongs to an entity
        if state == 'O':
          state = 'B'
        else:
          state = 'I'
        tag = f"{state}-{entity['entity_group']}"
        break
    if tag == 'O':
      # reset the state
      state = 'O'
    predicted_tags.append(tag)

    # remove the token from input_
    input_ = input_[index + len(token):]

    # update current_index
    current_index += len(token)

  # sanity check
  # print("len(predicted_tags)", len(predicted_tags))
  # print("len(tokens)", len(tokens))
  assert(len(predicted_tags) == len(tokens))
  return predicted_tags

In [22]:
input_ = detokenizer.detokenize(inputs[9])
ner_result = ner(input_)
ptags = compute_prediction(inputs[9], input_, ner_result)

In [24]:
input_

'He was well backed by England hopeful Mark Butcher who made 70 as Surrey closed on 429 for seven, a lead of 234.'

In [23]:
ner_result

[{'entity_group': 'LOC',
  'score': 0.99967515,
  'word': 'England',
  'start': 22,
  'end': 29},
 {'entity_group': 'PER',
  'score': 0.99974275,
  'word': 'Mark Butcher',
  'start': 38,
  'end': 50},
 {'entity_group': 'ORG',
  'score': 0.9996264,
  'word': 'Surrey',
  'start': 66,
  'end': 72}]

In [26]:
print(ptags)

['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [27]:
from sklearn.metrics import accuracy_score, f1_score

In [28]:
accuracy_score(targets[9], ptags)

1.0

In [29]:
for targ, pred in zip(targets[9], ptags):
  print(targ, pred)

O O
O O
O O
O O
O O
B-LOC B-LOC
O O
B-PER B-PER
I-PER I-PER
O O
O O
O O
O O
B-ORG B-ORG
O O
O O
O O
O O
O O
O O
O O
O O
O O
O O
O O


In [30]:
# get detokenized inputs to pass into ner model
detok_inputs = []
for tokens in inputs:
  text = detokenizer.detokenize(tokens)
  detok_inputs.append(text)

In [31]:
# 17 min on CPU, 3 min on GPU
ner_results = ner(detok_inputs)

In [32]:
predictions = []
for tokens, text, ner_result in zip(inputs, detok_inputs, ner_results):
    pred = compute_prediction(tokens, text, ner_result)
    predictions.append(pred)

In [33]:
# https://stackoverflow.com/questions/11264684/flatten-list-of-lists
def flatten(list_of_lists):
  flattened = [val for sublist in list_of_lists for val in sublist]
  return flattened

In [34]:
# flatten targets and predictions
flat_predictions = flatten(predictions)
flat_targets = flatten(targets)

In [39]:
accuracy_score(flat_targets, flat_predictions)

0.9916563354782848

In [40]:
f1_score(flat_targets, flat_predictions, average='macro')

0.95403328229255