<a href="https://colab.research.google.com/github/smf-9000/Named-Entity-Recognition/blob/main/NER_with_hf_transformer_%5BNER_start%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas transformers

In [2]:
! wget http://noisy-text.github.io/2017/files/wnut17train.conll

--2021-07-05 16:18:12--  http://noisy-text.github.io/2017/files/wnut17train.conll
Resolving noisy-text.github.io (noisy-text.github.io)... 185.199.109.153, 185.199.108.153, 185.199.111.153, ...
Connecting to noisy-text.github.io (noisy-text.github.io)|185.199.109.153|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493781 (482K) [application/octet-stream]
Saving to: ‘wnut17train.conll’


2021-07-05 16:18:13 (18.5 MB/s) - ‘wnut17train.conll’ saved [493781/493781]



In [3]:
from pathlib import Path
import re


In [4]:
file_path = Path("/content/wnut17train.conll")
raw_text = file_path.read_text().strip()
raw_docs = re.split(r'\n\t?\n', raw_text)

token_docs = []
tag_docs = []
for doc in raw_docs:
  tokens = []
  tags = []
  for line in doc.split('\n'):
    token, tag = line.split('\t')
    tokens.append(token)
    tags.append(tag)
  token_docs.append(tokens)
  tag_docs.append(tags)

print(token_docs[0])
print(tag_docs[0])

['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location', 'O', 'B-location', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [5]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(token_docs, tag_docs, test_size=.2)

## encodings for our tokens and tags

In [6]:
unique_tags = set(tag for doc in tag_docs for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

# print(tag2id)
# print(id2tag)

In [28]:
id2tag

{0: 'I-location',
 1: 'B-group',
 2: 'B-location',
 3: 'B-creative-work',
 4: 'I-product',
 5: 'I-creative-work',
 6: 'B-product',
 7: 'I-corporation',
 8: 'I-group',
 9: 'B-corporation',
 10: 'O',
 11: 'B-person',
 12: 'I-person'}

In [7]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…






```
train_encodings={
  'input_ids': [...],
  'offset_mapping': [...]
}
```



In [8]:
import numpy as np

def encode_tags(tags, encodings):
  labels = [[tag2id[tag] for tag in doc] for doc in tags]
  encoded_labels = []
  for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
    # create an empty array of -100
    doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
    arr_offset = np.array(doc_offset)

    # set labels whose first offset position is 0 and the second is not 0
    doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
    encoded_labels.append(doc_enc_labels.tolist())

  return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [None]:
print(train_labels)

In [10]:
import torch

class WNUTDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

In [None]:
from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments

model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))

In [13]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)


In [None]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

In [17]:
model.eval()
from torch.nn import functional as F

In [94]:
example = 'Microsoft moved its headquarters from Bellevue to Redmond, Washington, on February 26, 1986, and went public on March 13.'
# example = 'Microsoft released Microsoft Windows on November 20, 1985, as a graphical extension for MS-DOS'

device = "cuda:0"
with torch.no_grad():
  inputs = tokenizer(example, return_tensors="pt").to(device)
  outputs = model(**inputs)
  softmax = F.softmax(outputs[0][0], dim = 1)
  indices = [x.argmax().item() for x in softmax]
  input_list = inputs['input_ids'].tolist()[0]
  # print(outputs[0][0])
  print(softmax)
  # print(indices)
  # print(inputs['input_ids'].tolist()[0])

  word2tokens = {x : tokenizer.encode(x, add_special_tokens=False) for x in example.split()}
  first_token2word = {}

  # I dont like this, but I'll fix it later [TODO]
  for k, v_list in word2tokens.items():
    for v in v_list:
      first_token2word[v] = k

  # print(word2tokens)
  # print(first_token2word)

  for i in range(len(indices)):
    print('token: ', input_list[i], 'tag: ', id2tag[indices[i]], 'entity: ', '#' if id2tag[indices[i]] == 'O' else first_token2word[input_list[i]])

tensor([[5.1131e-02, 5.9880e-02, 7.1346e-02, 6.6346e-02, 7.7075e-02, 9.8168e-02,
         5.8292e-02, 5.5508e-02, 3.2692e-02, 1.3349e-01, 2.3262e-01, 4.1489e-02,
         2.1961e-02],
        [1.3074e-03, 2.0459e-02, 2.2361e-02, 7.9322e-03, 9.3218e-03, 1.1592e-03,
         2.0062e-01, 1.4653e-02, 3.2762e-03, 6.7759e-01, 3.0652e-02, 8.3924e-03,
         2.2784e-03],
        [4.3122e-04, 2.3136e-04, 4.3187e-04, 3.3006e-04, 4.0770e-04, 1.0055e-03,
         2.3426e-04, 2.6266e-04, 2.0117e-04, 5.2877e-04, 9.9558e-01, 1.7242e-04,
         1.8773e-04],
        [6.3269e-04, 5.1025e-04, 1.4010e-03, 9.7782e-04, 1.2521e-03, 1.3198e-03,
         9.9840e-04, 5.1457e-04, 5.3469e-04, 2.4918e-03, 9.8886e-01, 2.3907e-04,
         2.6835e-04],
        [8.0266e-04, 4.0940e-04, 1.4331e-03, 6.9817e-04, 8.8852e-04, 1.6783e-03,
         8.6840e-04, 3.0953e-04, 3.8064e-04, 1.3506e-03, 9.9067e-01, 3.3401e-04,
         1.7545e-04],
        [1.9906e-03, 1.2123e-03, 6.9652e-03, 1.4592e-03, 1.4350e-03, 3.0704e-03,

In [None]:
inputs