# Named Entity Recognition

## Set up Weights and Biases

In [3]:
# WANDB_NOTEBOOK_NAME = "ELECTRA.ipynb"

In [4]:
# # !pip install wandb
# import wandb
# wandb.login()

Set up experiment and hyperparameters

In [5]:
batch_size = 8

In [6]:
# wandb.init(
#     project="electra-small",
#     config={
#         "batch_size": batch_size,
#         "dataset": "COPIOUS",
#     },
# )

## Install and import necessary libraries

In [7]:
# !pip install transformers
# !pip install datasets
# !pip install seqeval
# !pip install accelerate -U
! pip install evaluate

Defaulting to user installation because normal site-packages is not writeable


In [8]:
import os
import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Convert the dataset to CoNLL2003 format

In [9]:
def read_CoNLL2003_format(filename, idx=3):
    """Read file in CoNLL-2003 shared task format"""

    # read file
    lines =  open(filename).read().strip()

    # find sentence-like boundaries
    lines = lines.split("\n\n")

     # split on newlines
    lines = [line.split("\n") for line in lines]

    # get tokens
    tokens = [[l.split()[0] for l in line] for line in lines]

    # get labels/tags
    labels = [[l.split()[idx] for l in line] for line in lines]

    #convert to df
    data= {'tokens': tokens, 'labels': labels}
    df=pd.DataFrame(data=data)

    return df

In [10]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [11]:
DATADIR = "../Datasets/NER/COPIOUS-txt/"

def get_data(trainfile=DATADIR + "train.txt",
             devfile=DATADIR + "dev.txt",
             testfile=DATADIR + "test.txt"):

    train = read_CoNLL2003_format(trainfile, idx=3)
    dev = read_CoNLL2003_format(devfile, idx=3)
    print("Train data: %d sentences, %d tokens"%(len(train),len(flatten(train.tokens))))

    print("Dev data: %d sentences, %d tokens"%(len(dev),len(flatten(dev.tokens))))

    test = read_CoNLL2003_format(testfile, idx=3)
    print("Test data: %d sentences, %d tokens"%(len(test),len(flatten(test.tokens))))

    return train, test, dev

In [12]:
train, test, dev = get_data()

train_dataset = Dataset.from_pandas(train)
dev_dataset = Dataset.from_pandas(dev)
test_dataset = Dataset.from_pandas(test)

Train data: 23695 sentences, 313311 tokens
Dev data: 4101 sentences, 37218 tokens
Test data: 3362 sentences, 37339 tokens


## Tokenize the dataset

In [13]:
label_list = ['B-GeographicalLocation', 'B-Habitat', 'B-Person', 'B-Taxon', 'B-TemporalExpression', 'I-GeographicalLocation', 'I-Habitat', 'I-Person', 'I-Taxon', 'I-TemporalExpression', 'O']
label2id = {k: v for v, k in enumerate(label_list)}
id2label = {v: k for v, k in enumerate(label_list)}
label2id

{'B-GeographicalLocation': 0,
 'B-Habitat': 1,
 'B-Person': 2,
 'B-Taxon': 3,
 'B-TemporalExpression': 4,
 'I-GeographicalLocation': 5,
 'I-Habitat': 6,
 'I-Person': 7,
 'I-Taxon': 8,
 'I-TemporalExpression': 9,
 'O': 10}

In [14]:
task = "ner"
model_checkpoint = "xlnet-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [17]:
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), max_length= 512, truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(label2id[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# def tokenize_and_align_labels(examples):
#     tokenized_inputs = tokenizer(examples["tokens"], max_length= 512, truncation=True, is_split_into_words=True)

#     labels = []
#     for i, label in enumerate(examples[f"labels"]):
#         word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
#         previous_word_idx = None
#         label_ids = []
#         for word_idx in word_ids:  # Set the special tokens to -100.
#             if word_idx is None:
#                 label_ids.append(-100)
#             elif word_idx != previous_word_idx:  # Only label the first token of a given word.
#                 label_ids.append(label[word_idx])
#             else:
#                 label_ids.append(-100)
#             previous_word_idx = word_idx
#         labels.append(label_ids)

#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs


train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
dev_tokenized_datasets = dev_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/23695 [00:00<?, ? examples/s]

Map: 100%|██████████| 23695/23695 [00:05<00:00, 4308.46 examples/s]
Map: 100%|██████████| 4101/4101 [00:00<00:00, 5023.60 examples/s]
Map: 100%|██████████| 3362/3362 [00:00<00:00, 3870.39 examples/s]


## Finetuning the model to the dataset

In [18]:
import evaluate

seqeval = evaluate.load("seqeval")

Downloading builder script: 100%|██████████| 6.34k/6.34k [00:00<00:00, 7.09MB/s]


In [19]:
model =  AutoModelForTokenClassification.from_pretrained(model_checkpoint,id2label=id2label, label2id=label2id)

args = TrainingArguments(
    f"xlnet_test-{task}",
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy = "epoch",
    num_train_epochs=1,
    save_total_limit= 2,
    save_strategy = "epoch",
    load_best_model_at_end = True
)

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    # results = metric.compute(predictions=true_predictions, references=true_labels)
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}

trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=dev_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("seqeval")


In [20]:
trainer.train()
trainer.evaluate()
trainer.save_model('xlnet_model')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshannen[0m ([33mshannen-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a XLNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


RuntimeError: Input tensor at index 1 has invalid shape [60, 2, 768], but expected [60, 3, 768]

## Making an inference with the finetuned model

In [17]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "./xlnet_model"
token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="first")

In [18]:
token_classifier("Birgus latro is widely distributed throughout the Western Pacific and eastern Indian Oceans")

[{'entity_group': 'Taxon',
  'score': 0.97687423,
  'word': 'birgus latro',
  'start': 0,
  'end': 12},
 {'entity_group': 'GeographicalLocation',
  'score': 0.9754473,
  'word': 'western pacific',
  'start': 50,
  'end': 65},
 {'entity_group': 'GeographicalLocation',
  'score': 0.9732645,
  'word': 'eastern indian oceans',
  'start': 70,
  'end': 91}]

## Get precision, f1-score, and recall for each entity group

In [19]:
predictions, labels, _ = trainer.predict(test_tokenized_datasets)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'GeographicalLocation': {'precision': 0.7899328859060403,
  'recall': 0.875092936802974,
  'f1': 0.8303350970017637,
  'number': 1345},
 'Habitat': {'precision': 0.6048780487804878,
  'recall': 0.6595744680851063,
  'f1': 0.6310432569974553,
  'number': 188},
 'Person': {'precision': 0.6296296296296297,
  'recall': 0.5874730021598272,
  'f1': 0.6078212290502794,
  'number': 463},
 'Taxon': {'precision': 0.8411106826070189,
  'recall': 0.8349923430321593,
  'f1': 0.8380403458213257,
  'number': 2612},
 'TemporalExpression': {'precision': 0.722972972972973,
  'recall': 0.8294573643410853,
  'f1': 0.772563176895307,
  'number': 258},
 'overall_precision': 0.7910685805422647,
 'overall_recall': 0.8154541718043568,
 'overall_f1': 0.8030763003440599,
 'overall_accuracy': 0.9515650089083431}