<a href="https://colab.research.google.com/github/smf-9000/Named-Entity-Recognition/blob/main/NER_with_hf_transformer_%5Bwith_pipeline%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
[https://huggingface.co/transformers/custom_datasets.html
https://huggingface.co/transformers/main_classes/pipelines.html#tokenclassificationpipeline]
```



In [2]:
!pip install pandas transformers



In [3]:
! wget http://noisy-text.github.io/2017/files/wnut17train.conll

--2021-07-20 12:58:41--  http://noisy-text.github.io/2017/files/wnut17train.conll
Resolving noisy-text.github.io (noisy-text.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to noisy-text.github.io (noisy-text.github.io)|185.199.108.153|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493781 (482K) [application/octet-stream]
Saving to: ‘wnut17train.conll.1’


2021-07-20 12:58:41 (19.8 MB/s) - ‘wnut17train.conll.1’ saved [493781/493781]



In [4]:
from pathlib import Path
import re


In [5]:
file_path = Path("/content/wnut17train.conll")
raw_text = file_path.read_text().strip()
raw_docs = re.split(r'\n\t?\n', raw_text)

token_docs = []
tag_docs = []
for doc in raw_docs:
  tokens = []
  tags = []
  for line in doc.split('\n'):
    token, tag = line.split('\t')
    tokens.append(token)
    tags.append(tag)
  token_docs.append(tokens)
  tag_docs.append(tags)

print(token_docs[10])
print(tag_docs[10])

['@Suzie55', 'whispering', 'cause', 'I', 'may', 'have', 'had', '1', 'too', 'many', 'vodka', "'s", 'last', 'night', 'and', 'am', 'a', 'lil', 'fragile', ',', 'hold', 'me', '?']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [6]:
len(token_docs)

3394

In [7]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(token_docs, tag_docs, test_size=.2)

## encodings for our tokens and tags

In [8]:
unique_tags = set(tag for doc in tag_docs for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

# print(tag2id)
# print(id2tag)

In [9]:
id2tag

{0: 'B-creative-work',
 1: 'B-product',
 2: 'O',
 3: 'I-product',
 4: 'B-location',
 5: 'B-group',
 6: 'I-corporation',
 7: 'I-person',
 8: 'I-group',
 9: 'I-location',
 10: 'I-creative-work',
 11: 'B-person',
 12: 'B-corporation'}

In [10]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)



```
train_encodings={
  'input_ids': [...],
  'offset_mapping': [...]
}
"the offset mapping gives us a tuple indicating the sub-token’s start position and end position relative to the original token it was split from"
```



In [None]:
train_encodings

In [12]:
# https://huggingface.co/transformers/custom_datasets.html#token-classification-with-w-nut-emerging-entities

import numpy as np

def encode_tags(tags, encodings):
  labels = [[tag2id[tag] for tag in doc] for doc in tags]
  encoded_labels = []
  for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
    # create an empty array of -100
    doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
    arr_offset = np.array(doc_offset)

    # set labels whose first offset position is 0 and the second is not 0
    doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
    encoded_labels.append(doc_enc_labels.tolist())

  return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [None]:
print(train_labels)

In [14]:
import torch

class WNUTDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

In [19]:
from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments

model = DistilBertForTokenClassification.from_pretrained(
                                                  'distilbert-base-cased',
                                                  num_labels=len(unique_tags),
                                                  id2label=id2tag,
                                                  label2id=tag2id)

loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "B-creative-work",
    "1": "B-product",
    "2": "O",
    "3": "I-product",
    "4": "B-location",
    "5": "B-group",
    "6": "I-corporation",
    "7": "I-person",
    "8": "I-group",
    "9": "I-location",
    "10": "I-creative-work",
    "11": "B-person",
    "12": "B-corporation"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-corporation": 12,
    "B-creative-work": 0,
    "B-group": 5,
    "B-location": 4,
    "B-person": 11,
    "B-product": 1,
    "I-corporation": 6,
    "I-creative-work": 10,
    "I-group": 8,
    "I-location": 9,
 

In [20]:
training_args = TrainingArguments(
  output_dir='./results',          # output directory
  num_train_epochs=3,              # total number of training epochs
  per_device_train_batch_size=16,  # batch size per device during training
  per_device_eval_batch_size=64,   # batch size for evaluation
  warmup_steps=500,                # number of warmup steps for learning rate scheduler
  weight_decay=0.0001,               # strength of weight decay
  logging_dir='./logs',            # directory for storing logs
  logging_steps=10,
  evaluation_strategy='epoch'
)

# In case we want to freeze the pretrained part of a model:
# for param in model.base_model.parameters():
#     param.requires_grad = False

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [21]:
trainer = Trainer(
  model=model,                         # the instantiated 🤗 Transformers model to be trained
  args=training_args,                  # training arguments, defined above
  train_dataset=train_dataset,         # training dataset
  eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

***** Running training *****
  Num examples = 2715
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 510


Epoch,Training Loss,Validation Loss
1,0.251,0.209598
2,0.0972,0.125589
3,0.0724,0.144984


***** Running Evaluation *****
  Num examples = 679
  Batch size = 64
***** Running Evaluation *****
  Num examples = 679
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 679
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=510, training_loss=0.3489976295069152, metrics={'train_runtime': 82.7704, 'train_samples_per_second': 98.405, 'train_steps_per_second': 6.162, 'total_flos': 286773158054700.0, 'train_loss': 0.3489976295069152, 'epoch': 3.0})

In [22]:
model.eval()
from torch.nn import functional as F

In [23]:
# example = 'Microsoft moved its headquarters from Bellevue to Redmond, Washington, on February 26, 1986, and went public on March 13.'
# example = 'Microsoft released Microsoft Windows on November 20, 1985, as a graphical extension for MS-DOS'
# example = 'Huggingface is the best company.'
# example = 'Apple Inc. is an American multinational technology company that specializes in consumer electronics, computer software, and online services.'
# example = 'Apple has expanded its campuses in Austin, Texas, concurrently with building Apple Park in Cupertino.'
example = 'Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge which is visible from the window.'

device = "cuda:0"
with torch.no_grad():
  inputs = tokenizer(example, return_tensors="pt").to(device)
  outputs = model(**inputs)
  softmax = F.softmax(outputs[0][0], dim = 1)
  indices = [x.argmax().item() for x in softmax]
  input_list = inputs['input_ids'].tolist()[0]
  # print(outputs[0][0])
  # print(softmax)
  print('tags:', id2tag)
  print('pred_ids:', indices)
  print('input_tokens', input_list)
  # print(inputs['input_ids'].tolist()[0])

  word2tokens = {x : tokenizer.encode(x, add_special_tokens=False) for x in example.split()}

  # print(word2tokens)
  # print(tokenizer.decode(101))

  for i in range(len(indices)):
    print('token: ', input_list[i], '\ttag: ', id2tag[indices[i]], '  ', 'entity: ', tokenizer.decode(input_list[i]))

tags: {0: 'B-creative-work', 1: 'B-product', 2: 'O', 3: 'I-product', 4: 'B-location', 5: 'B-group', 6: 'I-corporation', 7: 'I-person', 8: 'I-group', 9: 'I-location', 10: 'I-creative-work', 11: 'B-person', 12: 'B-corporation'}
pred_ids: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 9, 9, 2, 2, 2, 2, 2, 4, 9, 9, 2, 2, 2, 2, 2, 2, 4, 9, 2, 2, 2, 2, 2, 2, 2, 2]
input_tokens [101, 20164, 10932, 10289, 3561, 119, 1110, 170, 1419, 1359, 1107, 1203, 1365, 1392, 119, 2098, 3834, 1132, 1107, 141, 25810, 23904, 117, 3335, 1304, 1601, 1106, 1103, 6545, 3640, 1134, 1110, 5085, 1121, 1103, 2487, 119, 102]
token:  101 	tag:  O    entity:  [CLS]
token:  20164 	tag:  O    entity:  Hu
token:  10932 	tag:  O    entity:  ##gging
token:  10289 	tag:  O    entity:  Face
token:  3561 	tag:  O    entity:  Inc
token:  119 	tag:  O    entity:  .
token:  1110 	tag:  O    entity:  is
token:  170 	tag:  O    entity:  a
token:  1419 	tag:  O    entity:  company
token:  1359 	tag:  O    entity:  based
token:  1107 	tag:  O  

In [24]:
inputs

{'input_ids': tensor([[  101, 20164, 10932, 10289,  3561,   119,  1110,   170,  1419,  1359,
          1107,  1203,  1365,  1392,   119,  2098,  3834,  1132,  1107,   141,
         25810, 23904,   117,  3335,  1304,  1601,  1106,  1103,  6545,  3640,
          1134,  1110,  5085,  1121,  1103,  2487,   119,   102]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [25]:
example

'Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge which is visible from the window.'

In [47]:
from transformers import pipeline

ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy='max', device=0, ignore_labels=[])

res = ner(example)
print(res)

print(example)
for e in res:
  lbl = 'entity' if ner.aggregation_strategy.value == 'none' else 'entity_group'
  print(e['start'], '\t', e['end'], '\t', e['score'], '\t',  e[lbl], '\t', e['word'])

[{'entity_group': 'O', 'score': 0.73502946, 'word': 'Hugging', 'start': 0, 'end': 7}, {'entity_group': 'O', 'score': 0.31519404, 'word': 'Face', 'start': 8, 'end': 12}, {'entity_group': 'O', 'score': 0.6754972, 'word': 'Inc', 'start': 13, 'end': 16}, {'entity_group': 'O', 'score': 0.9639095, 'word': '.', 'start': 16, 'end': 17}, {'entity_group': 'O', 'score': 0.986017, 'word': 'is', 'start': 18, 'end': 20}, {'entity_group': 'O', 'score': 0.99108684, 'word': 'a', 'start': 21, 'end': 22}, {'entity_group': 'O', 'score': 0.99009395, 'word': 'company', 'start': 23, 'end': 30}, {'entity_group': 'O', 'score': 0.9878843, 'word': 'based', 'start': 31, 'end': 36}, {'entity_group': 'O', 'score': 0.9914327, 'word': 'in', 'start': 37, 'end': 39}, {'entity_group': 'location', 'score': 0.93191975, 'word': 'New York City', 'start': 40, 'end': 53}, {'entity_group': 'O', 'score': 0.9907745, 'word': '.', 'start': 53, 'end': 54}, {'entity_group': 'O', 'score': 0.99356544, 'word': 'Its', 'start': 55, 'end'

In [None]:
# # make sense if aggregation_strategy is 'none'
# for e in res:
#   if 'entity_group' in e:
#     e['entity'] = e['entity_group']
#     del e['entity_group']
# res = ner.group_entities(res)
# for e in res:
#   print(e)

In [48]:
# the list of labels to be ignore added
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy='max', device=0, ignore_labels=['O'])

res = ner(example)

print(example)
for e in res:
  lbl = 'entity' if ner.aggregation_strategy.value == 'none' else 'entity_group'
  print(e['start'], '\t', e['end'], '\t', e['score'], '\t',  e[lbl], '\t', e['word'])

Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge which is visible from the window.
40 	 53 	 0.93191975 	 location 	 New York City
79 	 84 	 0.755898 	 location 	 DUMBO
114 	 130 	 0.8169936 	 location 	 Manhattan Bridge
