<a href="https://colab.research.google.com/github/polarboar/NER-on-MultiNERD/blob/main/RISE_Interview_B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install transformers
!pip install transformers[torch]
!pip install evaluate
!pip install seqeval
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.24.1
    Uninstalling accelerate-0.24.1:
      Successfully uninstalled accelerate-0.24.1
Successfully installed accelerate-0.25.0


In [None]:
import inspect

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification

In [None]:
# Load the MultiNERD dataset by its hub identifier.
dataset_name = 'Babelscape/multinerd'
dataset = load_dataset(dataset_name)

# Show the first raw training example (before any language filtering).
first_train_example = dataset['train'][0]
print(first_train_example)
# Label <-> id tables for `ner_tags`: "O" is 0, then every entity type gets a
# consecutive B-/I- pair of ids, in the order the types are listed here.
entity_types = [
    "PER", "ORG", "LOC", "ANIM", "BIO", "CEL", "DIS", "EVE",
    "FOOD", "INST", "MEDIA", "MYTH", "PLANT", "TIME", "VEHI",
]

label_to_id = {"O": 0}
for position, entity_type in enumerate(entity_types):
  label_to_id["B-" + entity_type] = 2 * position + 1
  label_to_id["I-" + entity_type] = 2 * position + 2

# Inverse lookup: integer id -> label string.
id_to_label = dict(zip(label_to_id.values(), label_to_id.keys()))

# Only get English data
def _is_english(example):
  """Return True when the example's 'lang' field equals 'en'."""
  return example['lang'] == 'en'

dataset = dataset.filter(_is_english)

# Keep only the required classes; every other tag id is collapsed to 0 ("O").
# PERSON (PER) 1&2, ORGANIZATION (ORG) 3&4, LOCATION (LOC) 5&6,
# ANIMAL (ANIM) 7&8, DISEASES (DIS) 13&14.
# A frozenset gives constant-time membership tests in the per-token check below.
required_classes = frozenset({1, 2, 3, 4, 5, 6, 7, 8, 13, 14})

def remove_classes(row):
  """Replace every tag id that is not in `required_classes` with 0.

  Args:
    row: mapping with a 'ner_tags' sequence of integer tag ids.

  Returns:
    The same `row` object; its 'ner_tags' entry is replaced in place with a
    new list of the same length.
  """
  row['ner_tags'] = [tag if tag in required_classes else 0 for tag in row['ner_tags']]
  return row
# Rewrite the tags of every example with `remove_classes`.
dataset = dataset.map(function=remove_classes)

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

{'tokens': ['2002', 'ging', 'er', 'ins', 'Ausland', 'und', 'wechselte', 'für', '750.000', 'Pfund', 'Sterling', 'zu', 'Manchester', 'City', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0], 'lang': 'de'}


In [None]:
# Tokenizer for the same checkpoint name that the model is loaded from later.
checkpoint_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
def tokenize_input_and_add_labels(input):
  """Tokenize one pre-split example and align its NER tags with the tokens.

  The first token produced for each word receives that word's tag. Tokens with
  no word (word id ``None``) and the remaining pieces of an already-labelled
  word receive -100, the value that `compute_metrics` filters out.

  Args:
    input: example with 'tokens' (list of words) and 'ner_tags' (one tag id per
      word). The parameter name shadows the builtin; it is kept unchanged so
      existing callers are unaffected.

  Returns:
    The tokenizer output with an added 'labels' list holding one entry per
    token.
  """
  tokens = tokenizer(input['tokens'], truncation=True, is_split_into_words=True)
  labelled_words = set()
  labels = []
  for word_id in tokens.word_ids():
    # Identity check for None (PEP 8) instead of `== None`.
    if word_id is None or word_id in labelled_words:
      labels.append(-100)
    else:
      labels.append(input['ner_tags'][word_id])
      labelled_words.add(word_id)
  tokens['labels'] = labels
  return tokens

# Tokenize every example and attach the token-aligned 'labels' column.
tokenized_dataset = dataset.map(tokenize_input_and_add_labels)

# Sanity check: the same training example before and after tokenization.
print(dataset['train'][0])
print(tokenized_dataset['train'][0])

# DataCollatorForTokenClassification is already imported in the imports cell at
# the top of the notebook, so it is not re-imported here.
# The collator is handed to the Trainer to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def compute_metrics(p):
  """Turn raw model outputs into seqeval precision / recall / F1 / accuracy.

  Args:
    p: a (predictions, labels) pair. The predictions are arg-maxed over axis 2
      to obtain one label id per token (assumes a batch x tokens x classes
      layout — TODO confirm); positions whose label is -100 are skipped.

  Returns:
    Dict with the keys "precision", "recall", "f1" and "accuracy", taken from
    the overall scores returned by `seqeval.compute`.
  """
  scores, gold_ids = p
  predicted_ids = np.argmax(scores, axis=2)

  true_predictions = []
  true_labels = []
  for predicted_row, gold_row in zip(predicted_ids, gold_ids):
    # Keep only the positions that carry a real label.
    kept_pairs = [
        (predicted, gold)
        for predicted, gold in zip(predicted_row, gold_row)
        if gold != -100
    ]
    true_predictions.append([id_to_label[predicted] for predicted, _ in kept_pairs])
    true_labels.append([id_to_label[gold] for _, gold in kept_pairs])

  results = seqeval.compute(predictions=true_predictions, references=true_labels)

  summary = {}
  summary["precision"] = results["overall_precision"]
  summary["recall"] = results["overall_recall"]
  summary["f1"] = results["overall_f1"]
  summary["accuracy"] = results["overall_accuracy"]
  return summary

# Metric object that `compute_metrics` calls. The name is only looked up when
# the function runs, so assigning it after the function definition still works.
seqeval = evaluate.load('seqeval')

# Token-classification model whose label count and label maps come from
# `id_to_label` / `label_to_id`.
# NOTE(review): the head covers every id in `id_to_label`, including the
# classes that `remove_classes` has already collapsed to 0 ("O").
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(id_to_label), id2label=id_to_label, label2id=label_to_id
)

# The install cell does not pin `transformers`, and newer releases renamed the
# `evaluation_strategy` argument to `eval_strategy`. Use whichever name the
# installed version accepts so a fresh "Restart & Run All" keeps working.
_training_arg_names = inspect.signature(TrainingArguments).parameters
_eval_strategy_kwarg = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

# NOTE(review): no `metric_for_best_model` is set; per the TrainingArguments
# documentation the "best" checkpoint is then selected by validation loss, not
# by F1 — confirm that this is the intended criterion.
training_args = TrainingArguments(
    output_dir = 'test_model',
    num_train_epochs = 2,
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    push_to_hub = False,
    **{_eval_strategy_kwarg: 'epoch'},
)

# Same compatibility treatment for the Trainer: newer releases take the
# tokenizer through `processing_class`, older ones through `tokenizer`.
_trainer_arg_names = inspect.signature(Trainer).parameters
_tokenizer_kwarg = (
    'processing_class' if 'processing_class' in _trainer_arg_names else 'tokenizer'
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
    )

Map:   0%|          | 0/32820 [00:00<?, ? examples/s]

{'tokens': ['The', 'type', 'locality', 'is', 'Kīlauea', '.'], 'ner_tags': [0, 0, 0, 0, 5, 0], 'lang': 'en'}
{'tokens': ['The', 'type', 'locality', 'is', 'Kīlauea', '.'], 'ner_tags': [0, 0, 0, 0, 5, 0], 'lang': 'en', 'input_ids': [101, 1996, 2828, 10246, 2003, 11382, 17298, 5243, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 0, 5, -100, -100, 0, -100]}


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the model. The call is the last expression of the cell, so the
# returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0141,0.031641,0.926654,0.941978,0.934253,0.990921


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0141,0.031641,0.926654,0.941978,0.934253,0.990921
2,0.0035,0.040214,0.940534,0.942647,0.941589,0.991702


TrainOutput(global_step=65640, training_loss=0.0167922725788636, metrics={'train_runtime': 3878.4301, 'train_samples_per_second': 135.395, 'train_steps_per_second': 16.924, 'total_flos': 6439087044066768.0, 'train_loss': 0.0167922725788636, 'epoch': 2.0})

In [None]:
# Score the trained model on the test split. The reported metric names still
# carry the `eval_` prefix (see the output below).
trainer.evaluate(eval_dataset=tokenized_dataset['test'])

{'eval_loss': 0.03330770507454872,
 'eval_precision': 0.9319250833832208,
 'eval_recall': 0.94917468751361,
 'eval_f1': 0.9404707963838005,
 'eval_accuracy': 0.9907297745119228,
 'eval_runtime': 89.0708,
 'eval_samples_per_second': 369.459,
 'eval_steps_per_second': 46.188,
 'epoch': 2.0}