# Named Entity Recognition

In [1]:
# --- Run identity -----------------------------------------------------------
file_name = "COPIOUS_MPNet.ipynb"  # notebook name reported to Weights & Biases

# --- Data ---------------------------------------------------------------------
dataset = "COPIOUS"
data_directory = f"../Datasets/NER/{dataset}-txt/"  # holds train.txt / dev.txt / test.txt

# --- Model --------------------------------------------------------------------
model_name = "all-mpnet-base-v2"  # (wandb project name)
model_src = f"sentence-transformers/{model_name}"  # checkpoint to fine-tune

# --- Training -----------------------------------------------------------------
batch_size = 16  # per-device training batch size

## Install and import necessary libraries

In [18]:
# !pip install transformers
# !pip install datasets
# !pip install seqeval
# !pip install accelerate -U

In [2]:
import os
import shutil
import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Set up Weights and Biases

In [3]:
# Expose the notebook's file name through the WANDB_NOTEBOOK_NAME environment
# variable before wandb is imported in the next cell.
os.environ["WANDB_NOTEBOOK_NAME"] = file_name

In [4]:
# !pip install wandb
# Authenticate with Weights & Biases; the cell output reports the login status.
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mshannen[0m ([33mshannen-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

Set up experiment and hyperparameters

In [5]:
# Start the W&B run: one project per model, with the notebook-level settings
# recorded as the run configuration.
wandb.init(project=model_name, config=dict(batch_size=batch_size, dataset=dataset))

## Load the dataset (CoNLL-2003 format)

In [6]:
def read_CoNLL2003_format(filename, idx=3):
    """Read a file in CoNLL-2003 shared task format into a DataFrame.

    Each non-blank line holds whitespace-separated columns for one token;
    sentences are separated by one or more blank (or whitespace-only) lines.

    Args:
        filename: Path of the UTF-8 text file to read.
        idx: Zero-based column index of the label/tag to extract (the token
            is always taken from column 0).

    Returns:
        A pandas DataFrame with one row per sentence and two columns:
        'tokens' (list of token strings) and 'labels' (list of tag strings).
        An empty file yields an empty DataFrame with those two columns.

    Raises:
        ValueError: If a non-blank line has too few columns to contain `idx`.
    """
    tokens, labels = [], []
    sentence_tokens, sentence_labels = [], []

    # The context manager closes the handle even if parsing fails; the
    # explicit encoding keeps results independent of the platform default.
    with open(filename, encoding="utf-8") as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            fields = raw_line.split()

            if not fields:
                # Blank line: sentence boundary. Runs of blank lines are
                # collapsed instead of producing empty sentences.
                if sentence_tokens:
                    tokens.append(sentence_tokens)
                    labels.append(sentence_labels)
                    sentence_tokens, sentence_labels = [], []
                continue

            try:
                tag = fields[idx]
            except IndexError:
                raise ValueError(
                    f"{filename}:{line_number}: expected a column at index {idx}, "
                    f"found {len(fields)} column(s)"
                ) from None

            sentence_tokens.append(fields[0])
            sentence_labels.append(tag)

    # Flush the last sentence when the file does not end with a blank line.
    if sentence_tokens:
        tokens.append(sentence_tokens)
        labels.append(sentence_labels)

    # convert to df
    return pd.DataFrame(data={'tokens': tokens, 'labels': labels})

In [7]:
def flatten(l):
    """Concatenate the items of every sub-iterable of `l` into one flat list.

    Only one level of nesting is removed; the original order is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [8]:
DATADIR = data_directory

def get_data(trainfile=DATADIR + "train.txt",
             devfile=DATADIR + "dev.txt",
             testfile=DATADIR + "test.txt"):
    """Load the train, dev and test splits and print their sizes.

    Each file is parsed with `read_CoNLL2003_format`, taking the label from
    column index 3.

    Returns:
        A tuple `(train, test, dev)` of DataFrames. Note the order: the test
        split comes second and the dev split last.
    """
    def report(split_name, frame):
        # One summary line per split: sentence count and total token count.
        token_count = len(flatten(frame.tokens))
        print(f"{split_name} data: {len(frame)} sentences, {token_count} tokens")

    train = read_CoNLL2003_format(trainfile, idx=3)
    dev = read_CoNLL2003_format(devfile, idx=3)
    report("Train", train)

    report("Dev", dev)

    test = read_CoNLL2003_format(testfile, idx=3)
    report("Test", test)

    return train, test, dev

In [9]:
# get_data() returns the splits in (train, test, dev) order.
train, test, dev = get_data()

# Wrap each pandas frame in a `datasets.Dataset` for tokenization and training.
train_dataset, dev_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train, dev, test)
)

Train data: 23695 sentences, 313311 tokens
Dev data: 4101 sentences, 37218 tokens
Test data: 3362 sentences, 37339 tokens


## Tokenize the dataset

In [10]:
# Tag inventory; a label's position in this list is its integer class id.
label_list = [
    'B-GeographicalLocation',
    'B-Habitat',
    'B-Person',
    'B-Taxon',
    'B-TemporalExpression',
    'I-GeographicalLocation',
    'I-Habitat',
    'I-Person',
    'I-Taxon',
    'I-TemporalExpression',
    'O',
]
# Tag string -> class id, and the inverse class id -> tag string.
label2id = {label: index for index, label in enumerate(label_list)}
id2label = {index: label for index, label in enumerate(label_list)}
label2id

{'B-GeographicalLocation': 0,
 'B-Habitat': 1,
 'B-Person': 2,
 'B-Taxon': 3,
 'B-TemporalExpression': 4,
 'I-GeographicalLocation': 5,
 'I-Habitat': 6,
 'I-Person': 7,
 'I-Taxon': 8,
 'I-TemporalExpression': 9,
 'O': 10}

In [11]:
# Task tag; used further down to build the training output directory name.
task = "ner"
# Start from the pretrained checkpoint chosen in the configuration cell.
model_checkpoint = model_src

# NOTE(review): `add_prefix_space` is usually an option of byte-level BPE
# tokenizers; confirm it has any effect for this checkpoint's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

In [12]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags with sub-word tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "labels" (one list of tag strings per sentence, parallel to
            "tokens").

    Returns:
        The tokenizer output with an added "labels" entry holding, for every
        sentence, one label id per produced token:
          * -100 for tokens that belong to no word (special tokens);
          * the id of the word's tag for the first sub-token of a word;
          * the same id for the remaining sub-tokens while
            `label_all_tokens` is True, otherwise -100.

    Raises:
        KeyError: If a tag is not present in `label2id`.
    """
    # When True, every sub-token of a word carries the word's tag; when False,
    # only the first sub-token is labelled and the rest are ignored (-100).
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), max_length=512, truncation=True, is_split_into_words=True)

    labels = []
    for i, word_labels in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Token does not map back to any input word.
                label_ids.append(-100)
            else:
                tag = word_labels[word_idx]
                if tag == '0':
                    # A digit zero is treated as a misspelt outside tag. It
                    # must map to the id of 'O', not to class id 0, which is
                    # an entity class in this label inventory.
                    tag = 'O'
                if word_idx != previous_word_idx or label_all_tokens:
                    label_ids.append(label2id[tag])
                else:
                    label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Apply the tokenize-and-align step to every split, in batched mode.
train_tokenized_datasets, dev_tokenized_datasets, test_tokenized_datasets = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, dev_dataset, test_dataset)
)

Map:   0%|          | 0/23695 [00:00<?, ? examples/s]

Map: 100%|██████████| 23695/23695 [00:05<00:00, 4576.63 examples/s]
Map: 100%|██████████| 4101/4101 [00:00<00:00, 6668.31 examples/s]
Map: 100%|██████████| 3362/3362 [00:00<00:00, 4920.56 examples/s]


## Fine-tuning the model on the dataset

In [13]:
# Load the pretrained checkpoint with a token-classification head whose
# classes are described by the id2label / label2id maps built earlier.
model =  AutoModelForTokenClassification.from_pretrained(model_checkpoint,id2label=id2label, label2id=label2id)

# Training configuration. The first positional argument is the output
# directory, which resolves to "test-ner" with the current `task` value.
args = TrainingArguments(
    f"test-{task}",
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy = "epoch",
    num_train_epochs=3
)

# Collator built from the tokenizer; passed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)
# NOTE(review): this cell's output echoes the `load_metric` line, which looks
# like a deprecation warning - confirm against the installed `datasets`
# version and consider migrating to the `evaluate` package.
metric = load_metric("seqeval")


def compute_metrics(p):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        p: A pair `(predictions, labels)`; `predictions` holds per-token class
            scores (arg-maxed over axis 2) and `labels` the gold label ids,
            with -100 marking positions to ignore.

    Returns:
        A dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's corresponding "overall_*" entries.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real gold label.
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        short_name: results[f"overall_{short_name}"]
        for short_name in ("precision", "recall", "f1", "accuracy")
    }

# Wire model, configuration, data and metrics together. Training uses the
# train split; per-epoch evaluation uses the dev split.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets,
    eval_dataset=dev_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of MPNetForTokenClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("seqeval")


Clear model directory if it hasn't been cleared yet.

In [14]:
directory_path = "./ner.model"

# Start from a clean slate so files from an earlier run cannot linger in the
# saved-model directory. Failures are reported, not raised.
if not os.path.exists(directory_path):
    print(f"Directory '{directory_path}' does not exist.")
else:
    try:
        shutil.rmtree(directory_path)  # delete the directory and its contents
        print(f"Directory '{directory_path}' removed successfully.")
    except Exception as error:
        print(f"Error removing directory '{directory_path}': {error}")

Directory './ner.model' removed successfully.


In [15]:
# Fine-tune for the configured number of epochs; the dev split is evaluated
# after each epoch (see the table in this cell's output).
trainer.train()
# Final evaluation on the dev split; the returned metrics are not captured here.
trainer.evaluate()
# Persist the fine-tuned model for the inference and artifact cells below.
trainer.save_model('ner.model')

You're using a MPNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2542,0.295188,0.686287,0.81667,0.745823,0.935425
2,0.1825,0.184522,0.804621,0.851692,0.827488,0.956113
3,0.0976,0.18593,0.804882,0.86441,0.833585,0.956815




## Running inference with the fine-tuned model

In [16]:
from transformers import pipeline

Change model_checkpoint as needed

In [17]:
# Use the model directory saved by the training cell above.
model_checkpoint = "./ner.model"

# Alternative: pull a previously logged model from W&B instead. This notebook
# logs its artifact under `model_name`, so that is the name to request.
# artifact = wandb.use_artifact(f"{model_name}:latest")
# model_checkpoint = artifact.download()

In [18]:
# Build an inference pipeline from the selected checkpoint. With
# aggregation_strategy="first", sub-tokens are grouped back into words and
# each word takes the label of its first sub-token.
token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="first")

In [19]:
# Smoke-test the pipeline on a single sentence; the cell output lists the
# entity groups it returns, with scores and character offsets.
token_classifier("Birgus latro is widely distributed throughout the Western Pacific and eastern Indian Oceans")

[{'entity_group': 'Taxon',
  'score': 0.9597275,
  'word': 'birgus latro',
  'start': 0,
  'end': 12},
 {'entity_group': 'GeographicalLocation',
  'score': 0.92625546,
  'word': 'western pacific',
  'start': 50,
  'end': 65},
 {'entity_group': 'GeographicalLocation',
  'score': 0.92209864,
  'word': 'eastern indian oceans',
  'start': 70,
  'end': 91}]

## Get precision, f1-score, and recall for each entity group

In [20]:
# Run the fine-tuned model over the held-out test split and take the
# highest-scoring class for every token.
predictions, labels, _ = trainer.predict(test_tokenized_datasets)
predictions = np.argmax(predictions, axis=2)

# Drop ignored positions (gold id -100, e.g. special tokens) and map the
# remaining ids back to tag strings.
true_predictions = [
    [
        label_list[predicted_id]
        for predicted_id, gold_id in zip(predicted_row, gold_row)
        if gold_id != -100
    ]
    for predicted_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [
        label_list[gold_id]
        for predicted_id, gold_id in zip(predicted_row, gold_row)
        if gold_id != -100
    ]
    for predicted_row, gold_row in zip(predictions, labels)
]

# Per-entity-group and overall scores; shown as this cell's output.
results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'GeographicalLocation': {'precision': 0.8236933797909408,
  'recall': 0.8788104089219331,
  'f1': 0.8503597122302159,
  'number': 1345},
 'Habitat': {'precision': 0.6666666666666666,
  'recall': 0.6808510638297872,
  'f1': 0.6736842105263158,
  'number': 188},
 'Person': {'precision': 0.7183098591549296,
  'recall': 0.6609071274298056,
  'f1': 0.688413948256468,
  'number': 463},
 'Taxon': {'precision': 0.8419464353074312,
  'recall': 0.8545176110260337,
  'f1': 0.8481854455633671,
  'number': 2612},
 'TemporalExpression': {'precision': 0.7370242214532872,
  'recall': 0.8255813953488372,
  'f1': 0.7787934186471663,
  'number': 258},
 'overall_precision': 0.8133386741438013,
 'overall_recall': 0.8345663789560214,
 'overall_f1': 0.8238158028197586,
 'overall_accuracy': 0.9557662274816884}

In [21]:
# Package the saved model directory as a W&B artifact named after `model_name`.
artifact = wandb.Artifact(name=model_name, type="model")
artifact.add_dir(local_path="./ner.model")  # Add the saved model directory to the artifact
# Record the test-set evaluation results on the active run.
wandb.log(results, commit=True)
wandb.log_artifact(artifact)  # Log the model artifact to the active run

[34m[1mwandb[0m: Adding directory to artifact (./ner.model)... 

Done. 4.7s


<Artifact all-mpnet-base-v2>

In [22]:
# Close the W&B run; the cell output shows the run history and summary tables.
wandb.finish()

0,1
eval/accuracy,▁███
eval/f1,▁███
eval/loss,█▁▁▁
eval/precision,▁███
eval/recall,▁▆██
eval/runtime,█▁▂▃
eval/samples_per_second,▁█▇▆
eval/steps_per_second,▁█▇▆
overall_accuracy,▁
overall_f1,▁

0,1
eval/accuracy,0.95682
eval/f1,0.83358
eval/loss,0.18593
eval/precision,0.80488
eval/recall,0.86441
eval/runtime,12.9855
eval/samples_per_second,315.814
eval/steps_per_second,2.541
overall_accuracy,0.95577
overall_f1,0.82382
