<a href="https://colab.research.google.com/github/smf-9000/Named-Entity-Recognition/blob/main/NER_with_hf_transformer_%5BNER_start%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas transformers

In [2]:
! wget http://noisy-text.github.io/2017/files/wnut17train.conll

--2021-07-05 16:18:12--  http://noisy-text.github.io/2017/files/wnut17train.conll
Resolving noisy-text.github.io (noisy-text.github.io)... 185.199.109.153, 185.199.108.153, 185.199.111.153, ...
Connecting to noisy-text.github.io (noisy-text.github.io)|185.199.109.153|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493781 (482K) [application/octet-stream]
Saving to: â€˜wnut17train.conllâ€™


2021-07-05 16:18:13 (18.5 MB/s) - â€˜wnut17train.conllâ€™ saved [493781/493781]



In [3]:
from pathlib import Path
import re


In [4]:
file_path = Path("/content/wnut17train.conll")
raw_text = file_path.read_text().strip()
raw_docs = re.split(r'\n\t?\n', raw_text)

token_docs = []
tag_docs = []
for doc in raw_docs:
  tokens = []
  tags = []
  for line in doc.split('\n'):
    token, tag = line.split('\t')
    tokens.append(token)
    tags.append(tag)
  token_docs.append(tokens)
  tag_docs.append(tags)

print(token_docs[0])
print(tag_docs[0])

['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location', 'O', 'B-location', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [5]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(token_docs, tag_docs, test_size=.2)

## encodings for our tokens and tags

In [6]:
unique_tags = set(tag for doc in tag_docs for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

# print(tag2id)
# print(id2tag)

In [28]:
id2tag

{0: 'I-location',
 1: 'B-group',
 2: 'B-location',
 3: 'B-creative-work',
 4: 'I-product',
 5: 'I-creative-work',
 6: 'B-product',
 7: 'I-corporation',
 8: 'I-group',
 9: 'B-corporation',
 10: 'O',
 11: 'B-person',
 12: 'I-person'}

In [7]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descriptiâ€¦




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descriptiâ€¦




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_wâ€¦






```
train_encodings={
  'input_ids': [...],
  'offset_mapping': [...]
}
```



In [8]:
import numpy as np

def encode_tags(tags, encodings):
  labels = [[tag2id[tag] for tag in doc] for doc in tags]
  encoded_labels = []
  for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
    # create an empty array of -100
    doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
    arr_offset = np.array(doc_offset)

    # set labels whose first offset position is 0 and the second is not 0
    doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
    encoded_labels.append(doc_enc_labels.tolist())

  return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [None]:
print(train_labels)

In [10]:
import torch

class WNUTDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

In [11]:
from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments

model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_â€¦




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263273408.0, style=ProgressStyle(descriâ€¦




Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this 

In [13]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)


In [None]:
trainer = Trainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

In [17]:
model.eval()
from torch.nn import functional as F

In [93]:
# example = 'Microsoft moved its headquarters from Bellevue to Redmond, Washington, on February 26, 1986, and went public on March 13.'
example = 'Microsoft released Microsoft Windows on November 20, 1985, as a graphical extension for MS-DOS'

device = "cuda:0"
with torch.no_grad():
  inputs = tokenizer(example, return_tensors="pt").to(device)
  outputs = model(**inputs)
  softmax = F.softmax(outputs[0][0], dim = 1)
  indices = [x.argmax().item() for x in softmax]
  input_list = inputs['input_ids'].tolist()[0]
  # print(outputs[0][0])
  print(softmax)
  # print(indices)
  # print(inputs['input_ids'].tolist()[0])

  word2tokens = {x : tokenizer.encode(x, add_special_tokens=False) for x in example.split()}
  first_token2word = {}

  # I dont like this, but I'll fix it later [TODO]
  for k, v_list in word2tokens.items():
    for v in v_list:
      first_token2word[v] = k

  # print(word2tokens)
  # print(first_token2word)

  for i in range(len(indices)):
    print('token: ', input_list[i], 'tag: ', id2tag[indices[i]], 'entity: ', '#' if id2tag[indices[i]] == 'O' else first_token2word[input_list[i]])

tensor([[3.4797e-02, 3.1239e-02, 3.0316e-02, 5.7111e-02, 1.5203e-01, 7.8507e-02,
         9.3396e-02, 4.8696e-02, 2.4718e-02, 9.6761e-02, 3.0115e-01, 3.3284e-02,
         1.8003e-02],
        [1.9969e-03, 6.8352e-03, 5.2004e-03, 9.7534e-03, 1.0044e-01, 2.7507e-03,
         6.2422e-01, 1.6653e-02, 4.5299e-03, 1.2075e-01, 9.6341e-02, 7.6962e-03,
         2.8358e-03],
        [2.3906e-03, 1.0752e-03, 1.2726e-03, 4.5765e-03, 5.8097e-02, 6.6285e-03,
         2.2593e-02, 3.1907e-03, 1.9288e-03, 4.9267e-03, 8.9069e-01, 1.9084e-03,
         7.1777e-04],
        [1.6766e-03, 6.0398e-03, 4.5909e-03, 1.0834e-02, 8.7048e-02, 1.6999e-03,
         6.9985e-01, 9.8661e-03, 2.3916e-03, 9.2466e-02, 7.5089e-02, 6.9726e-03,
         1.4721e-03],
        [6.6775e-03, 2.5535e-03, 1.4692e-03, 4.1085e-03, 6.1036e-01, 6.7099e-03,
         2.0191e-01, 5.3964e-02, 1.3369e-02, 2.6140e-02, 6.1548e-02, 3.5261e-03,
         7.6622e-03],
        [3.9680e-04, 1.6596e-04, 1.9197e-04, 6.4246e-04, 2.9896e-03, 1.0570e-03,

In [None]:
inputs