<a href="https://colab.research.google.com/github/smf-9000/Named-Entity-Recognition/blob/main/NER_with_hf_transformer_%5BNER_start%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
Sentiment analysis, example from huggingface.org
[https://huggingface.co/transformers/custom_datasets.html]
```



In [None]:
!pip install pandas transformers

In [None]:
! wget http://noisy-text.github.io/2017/files/wnut17train.conll

In [2]:
from pathlib import Path
import re


In [None]:
file_path = Path("/content/wnut17train.conll")
raw_text = file_path.read_text().strip()
raw_docs = re.split(r'\n\t?\n', raw_text)

token_docs = []
tag_docs = []
for doc in raw_docs:
  tokens = []
  tags = []
  for line in doc.split('\n'):
    token, tag = line.split('\t')
    tokens.append(token)
    tags.append(tag)
  token_docs.append(tokens)
  tag_docs.append(tags)

print(token_docs[0])
print(tag_docs[0])

In [4]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(token_docs, tag_docs, test_size=.2)

## encodings for our tokens and tags

In [5]:
unique_tags = set(tag for doc in tag_docs for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

# print(tag2id)
# print(id2tag)

In [None]:
id2tag

In [9]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…






```
train_encodings={
  'input_ids': [...],
  'offset_mapping': [...]
}
```



In [10]:
import numpy as np

def encode_tags(tags, encodings):
  labels = [[tag2id[tag] for tag in doc] for doc in tags]
  encoded_labels = []
  for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
    # create an empty array of -100
    doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
    arr_offset = np.array(doc_offset)

    # set labels whose first offset position is 0 and the second is not 0
    doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
    encoded_labels.append(doc_enc_labels.tolist())
    
  return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [None]:
print(train_labels)

In [12]:
import torch

class WNUTDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

In [None]:
from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments

model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))

In [14]:
training_args = TrainingArguments(
  output_dir='./results',          # output directory
  num_train_epochs=5,              # total number of training epochs
  per_device_train_batch_size=16,  # batch size per device during training
  per_device_eval_batch_size=64,   # batch size for evaluation
  warmup_steps=500,                # number of warmup steps for learning rate scheduler
  weight_decay=0.01,               # strength of weight decay
  logging_dir='./logs',            # directory for storing logs
  logging_steps=10,
  evaluation_strategy='epoch'
)


In [15]:
trainer = Trainer(
  model=model,                         # the instantiated 🤗 Transformers model to be trained
  args=training_args,                  # training arguments, defined above
  train_dataset=train_dataset,         # training dataset
  eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

***** Running training *****
  Num examples = 2715
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 850


Epoch,Training Loss,Validation Loss
1,0.1566,0.197719
2,0.1332,0.136935
3,0.0906,0.126668
4,0.0551,0.137188
5,0.0098,0.146733


***** Running Evaluation *****
  Num examples = 679
  Batch size = 64
***** Running Evaluation *****
  Num examples = 679
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 679
  Batch size = 64
***** Running Evaluation *****
  Num examples = 679
  Batch size = 64
***** Running Evaluation *****
  Num examples = 679
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=850, training_loss=0.2432579725717797, metrics={'train_runtime': 254.9069, 'train_samples_per_second': 53.255, 'train_steps_per_second': 3.335, 'total_flos': 477955263424500.0, 'train_loss': 0.2432579725717797, 'epoch': 5.0})

In [16]:
model.eval()
from torch.nn import functional as F

In [30]:
example = 'Microsoft moved its headquarters from Bellevue to Redmond, Washington, on February 26, 1986, and went public on March 13.'
# example = 'Microsoft released Microsoft Windows on November 20, 1985, as a graphical extension for MS-DOS'
# example = 'Huggingface is the best company.'

device = "cuda:0"
with torch.no_grad():
  inputs = tokenizer(example, return_tensors="pt").to(device)
  outputs = model(**inputs)
  softmax = F.softmax(outputs[0][0], dim = 1)
  indices = [x.argmax().item() for x in softmax]
  input_list = inputs['input_ids'].tolist()[0]
  # print(outputs[0][0])
  # print(softmax)
  print('tags:', id2tag)
  print('pred_ids:', indices)
  print('input_tokens', input_list)
  # print(inputs['input_ids'].tolist()[0])

  word2tokens = {x : tokenizer.encode(x, add_special_tokens=False) for x in example.split()}

  # print(word2tokens)
  # print(tokenizer.decode(101))

  for i in range(len(indices)):
    print('token: ', input_list[i], '\ttag: ', id2tag[indices[i]], '  ', 'entity: ', tokenizer.decode(input_list[i]))

tags: {0: 'I-product', 1: 'I-group', 2: 'B-person', 3: 'B-location', 4: 'I-person', 5: 'B-group', 6: 'O', 7: 'B-product', 8: 'I-creative-work', 9: 'I-corporation', 10: 'B-creative-work', 11: 'I-location', 12: 'B-corporation'}
pred_ids: [6, 12, 6, 6, 6, 6, 3, 11, 6, 3, 11, 6, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
input_tokens [101, 6998, 1427, 1157, 3834, 1121, 10199, 19038, 1106, 2156, 10151, 117, 1994, 117, 1113, 1428, 1744, 117, 2177, 117, 1105, 1355, 1470, 1113, 1345, 1492, 119, 102]
token:  101 	tag:  O    entity:  [CLS]
token:  6998 	tag:  B-corporation    entity:  Microsoft
token:  1427 	tag:  O    entity:  moved
token:  1157 	tag:  O    entity:  its
token:  3834 	tag:  O    entity:  headquarters
token:  1121 	tag:  O    entity:  from
token:  10199 	tag:  B-location    entity:  Belle
token:  19038 	tag:  I-location    entity:  ##vue
token:  1106 	tag:  O    entity:  to
token:  2156 	tag:  B-location    entity:  Red
token:  10151 	tag:  I-location    entity:  ##mond
toke

In [None]:
inputs