<a href="https://colab.research.google.com/github/smf-9000/Named-Entity-Recognition/blob/main/NER_with_hf_transformer_%5Bwith_pipeline%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
[https://huggingface.co/transformers/custom_datasets.html]
```



In [1]:
!pip install pandas transformers

Collecting transformers
  Downloading transformers-4.8.2-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 8.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 57.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 57.4 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.12 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.8.2


In [2]:
! wget http://noisy-text.github.io/2017/files/wnut17train.conll

--2021-07-20 09:30:10--  http://noisy-text.github.io/2017/files/wnut17train.conll
Resolving noisy-text.github.io (noisy-text.github.io)... 185.199.109.153, 185.199.108.153, 185.199.110.153, ...
Connecting to noisy-text.github.io (noisy-text.github.io)|185.199.109.153|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493781 (482K) [application/octet-stream]
Saving to: ‘wnut17train.conll’


2021-07-20 09:30:10 (13.0 MB/s) - ‘wnut17train.conll’ saved [493781/493781]



In [3]:
from pathlib import Path
import re


In [4]:
file_path = Path("/content/wnut17train.conll")
raw_text = file_path.read_text().strip()
raw_docs = re.split(r'\n\t?\n', raw_text)

token_docs = []
tag_docs = []
for doc in raw_docs:
  tokens = []
  tags = []
  for line in doc.split('\n'):
    token, tag = line.split('\t')
    tokens.append(token)
    tags.append(tag)
  token_docs.append(tokens)
  tag_docs.append(tags)

print(token_docs[10])
print(tag_docs[10])

['@Suzie55', 'whispering', 'cause', 'I', 'may', 'have', 'had', '1', 'too', 'many', 'vodka', "'s", 'last', 'night', 'and', 'am', 'a', 'lil', 'fragile', ',', 'hold', 'me', '?']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [5]:
len(token_docs)

3394

In [6]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(token_docs, tag_docs, test_size=.2)

## encodings for our tokens and tags

In [7]:
unique_tags = set(tag for doc in tag_docs for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

# print(tag2id)
# print(id2tag)

In [8]:
id2tag

{0: 'B-location',
 1: 'B-corporation',
 2: 'B-product',
 3: 'I-creative-work',
 4: 'B-person',
 5: 'I-product',
 6: 'I-location',
 7: 'O',
 8: 'I-corporation',
 9: 'I-group',
 10: 'B-creative-work',
 11: 'I-person',
 12: 'B-group'}

In [9]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…






```
train_encodings={
  'input_ids': [...],
  'offset_mapping': [...]
}
"the offset mapping gives us a tuple indicating the sub-token’s start position and end position relative to the original token it was split from"
```



In [None]:
train_encodings

In [11]:
# https://huggingface.co/transformers/custom_datasets.html#token-classification-with-w-nut-emerging-entities

import numpy as np

def encode_tags(tags, encodings):
  labels = [[tag2id[tag] for tag in doc] for doc in tags]
  encoded_labels = []
  for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
    # create an empty array of -100
    doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
    arr_offset = np.array(doc_offset)

    # set labels whose first offset position is 0 and the second is not 0
    doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
    encoded_labels.append(doc_enc_labels.tolist())

  return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [None]:
print(train_labels)

In [13]:
import torch

class WNUTDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

In [14]:
from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments

model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263273408.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

In [15]:
training_args = TrainingArguments(
  output_dir='./results',          # output directory
  num_train_epochs=3,              # total number of training epochs
  per_device_train_batch_size=16,  # batch size per device during training
  per_device_eval_batch_size=64,   # batch size for evaluation
  warmup_steps=500,                # number of warmup steps for learning rate scheduler
  weight_decay=0.0001,               # strength of weight decay
  logging_dir='./logs',            # directory for storing logs
  logging_steps=10,
  evaluation_strategy='epoch'
)

# In case we want to freeze the pretrained part of a model:
# for param in model.base_model.parameters():
#     param.requires_grad = False

In [16]:
trainer = Trainer(
  model=model,                         # the instantiated 🤗 Transformers model to be trained
  args=training_args,                  # training arguments, defined above
  train_dataset=train_dataset,         # training dataset
  eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

***** Running training *****
  Num examples = 2715
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 510


Epoch,Training Loss,Validation Loss
1,0.1815,0.219885
2,0.1861,0.155579
3,0.1036,0.147529


***** Running Evaluation *****
  Num examples = 679
  Batch size = 64
***** Running Evaluation *****
  Num examples = 679
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 679
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=510, training_loss=0.400557611619725, metrics={'train_runtime': 76.4641, 'train_samples_per_second': 106.521, 'train_steps_per_second': 6.67, 'total_flos': 286773158054700.0, 'train_loss': 0.400557611619725, 'epoch': 3.0})

In [17]:
model.eval()
from torch.nn import functional as F

In [80]:
# example = 'Microsoft moved its headquarters from Bellevue to Redmond, Washington, on February 26, 1986, and went public on March 13.'
# example = 'Microsoft released Microsoft Windows on November 20, 1985, as a graphical extension for MS-DOS'
# example = 'Huggingface is the best company.'
# example = 'Apple Inc. is an American multinational technology company that specializes in consumer electronics, computer software, and online services.'
# example = 'Apple has expanded its campuses in Austin, Texas, concurrently with building Apple Park in Cupertino.'
example = 'Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge which is visible from the window.'

device = "cuda:0"
with torch.no_grad():
  inputs = tokenizer(example, return_tensors="pt").to(device)
  outputs = model(**inputs)
  softmax = F.softmax(outputs[0][0], dim = 1)
  indices = [x.argmax().item() for x in softmax]
  input_list = inputs['input_ids'].tolist()[0]
  # print(outputs[0][0])
  # print(softmax)
  print('tags:', id2tag)
  print('pred_ids:', indices)
  print('input_tokens', input_list)
  # print(inputs['input_ids'].tolist()[0])

  word2tokens = {x : tokenizer.encode(x, add_special_tokens=False) for x in example.split()}

  # print(word2tokens)
  # print(tokenizer.decode(101))

  for i in range(len(indices)):
    print('token: ', input_list[i], '\ttag: ', id2tag[indices[i]], '  ', 'entity: ', tokenizer.decode(input_list[i]))

tags: {0: 'B-location', 1: 'B-corporation', 2: 'B-product', 3: 'I-creative-work', 4: 'B-person', 5: 'I-product', 6: 'I-location', 7: 'O', 8: 'I-corporation', 9: 'I-group', 10: 'B-creative-work', 11: 'I-person', 12: 'B-group'}
pred_ids: [7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 6, 6, 7, 7, 7, 7, 7, 0, 6, 6, 7, 7, 7, 7, 7, 7, 0, 6, 7, 7, 7, 7, 7, 7, 7, 7]
input_tokens [101, 20164, 10932, 10289, 3561, 119, 1110, 170, 1419, 1359, 1107, 1203, 1365, 1392, 119, 2098, 3834, 1132, 1107, 141, 25810, 23904, 117, 3335, 1304, 1601, 1106, 1103, 6545, 3640, 1134, 1110, 5085, 1121, 1103, 2487, 119, 102]
token:  101 	tag:  O    entity:  [CLS]
token:  20164 	tag:  O    entity:  Hu
token:  10932 	tag:  O    entity:  ##gging
token:  10289 	tag:  O    entity:  Face
token:  3561 	tag:  O    entity:  Inc
token:  119 	tag:  O    entity:  .
token:  1110 	tag:  O    entity:  is
token:  170 	tag:  O    entity:  a
token:  1419 	tag:  O    entity:  company
token:  1359 	tag:  O    entity:  based
token:  1107 	tag:  O  

In [81]:
inputs

{'input_ids': tensor([[  101, 20164, 10932, 10289,  3561,   119,  1110,   170,  1419,  1359,
          1107,  1203,  1365,  1392,   119,  2098,  3834,  1132,  1107,   141,
         25810, 23904,   117,  3335,  1304,  1601,  1106,  1103,  6545,  3640,
          1134,  1110,  5085,  1121,  1103,  2487,   119,   102]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [82]:
example

'Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge which is visible from the window.'

In [83]:
from transformers import pipeline

ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy='max', device=0)

res = ner(example)
print(res)

for e in res:
  lbl = id2tag[int(e['entity_group'].split('_')[1])]
  print(e['start'], '\t', e['end'], '\t', e['score'], '\t',  lbl, '\t', e['word'])

[{'entity_group': 'LABEL_7', 'score': 0.5328488, 'word': 'Hugging', 'start': 0, 'end': 7}, {'entity_group': 'LABEL_7', 'score': 0.37571302, 'word': 'Face', 'start': 8, 'end': 12}, {'entity_group': 'LABEL_7', 'score': 0.37856364, 'word': 'Inc', 'start': 13, 'end': 16}, {'entity_group': 'LABEL_7', 'score': 0.6330844, 'word': '.', 'start': 16, 'end': 17}, {'entity_group': 'LABEL_7', 'score': 0.9349889, 'word': 'is', 'start': 18, 'end': 20}, {'entity_group': 'LABEL_7', 'score': 0.93747663, 'word': 'a', 'start': 21, 'end': 22}, {'entity_group': 'LABEL_7', 'score': 0.92972744, 'word': 'company', 'start': 23, 'end': 30}, {'entity_group': 'LABEL_7', 'score': 0.94977075, 'word': 'based', 'start': 31, 'end': 36}, {'entity_group': 'LABEL_7', 'score': 0.92869526, 'word': 'in', 'start': 37, 'end': 39}, {'entity_group': 'LABEL_0', 'score': 0.97096074, 'word': 'New', 'start': 40, 'end': 43}, {'entity_group': 'LABEL_6', 'score': 0.94642496, 'word': 'York', 'start': 44, 'end': 48}, {'entity_group': 'LA

In [84]:
for e in res:
  lbl = id2tag[int(e['entity_group'].split('_')[1])]
  del e['entity_group']
  e['entity'] = lbl
res = ner.group_entities(res)
for e in res:
  print(e)

{'entity_group': 'O', 'score': 0.5328488, 'word': 'Hugging', 'start': 0, 'end': 7}
{'entity_group': 'O', 'score': 0.37571302, 'word': 'Face', 'start': 8, 'end': 12}
{'entity_group': 'O', 'score': 0.37856364, 'word': 'Inc', 'start': 13, 'end': 16}
{'entity_group': 'O', 'score': 0.6330844, 'word': '.', 'start': 16, 'end': 17}
{'entity_group': 'O', 'score': 0.9349889, 'word': 'is', 'start': 18, 'end': 20}
{'entity_group': 'O', 'score': 0.93747663, 'word': 'a', 'start': 21, 'end': 22}
{'entity_group': 'O', 'score': 0.92972744, 'word': 'company', 'start': 23, 'end': 30}
{'entity_group': 'O', 'score': 0.94977075, 'word': 'based', 'start': 31, 'end': 36}
{'entity_group': 'O', 'score': 0.92869526, 'word': 'in', 'start': 37, 'end': 39}
{'entity_group': 'location', 'score': 0.9588005, 'word': 'New York City', 'start': 40, 'end': 53}
{'entity_group': 'O', 'score': 0.9895925, 'word': '.', 'start': 53, 'end': 54}
{'entity_group': 'O', 'score': 0.9474311, 'word': 'Its', 'start': 55, 'end': 58}
{'ent