# Named Entity Recognition

## Set variables

In [1]:
# Notebook file name; exported below as WANDB_NOTEBOOK_NAME.
file_name = "BiodivNER_MPNet_ver2.ipynb"
# Per-device training batch size passed to TrainingArguments.
batch_size = 4
# Dataset name recorded in the wandb run config.
dataset = "BiodivNER"
# Directory that holds the train/dev/test CSV files.
data_directory = "../Datasets/NER/BiodivNER/"
# Hugging Face checkpoint the tokenizer and model are loaded from.
model_src = "sentence-transformers/all-mpnet-base-v2"
model_name = "biodivner_all-mpnet-base" # (wandb project name; also used as the model artifact name)

## Install and import necessary libraries

In [2]:
# !pip install transformers
# !pip install datasets
# !pip install seqeval
# !pip install accelerate -U

In [2]:
import os
import shutil
import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Set up Weights and Biases

In [3]:
# wandb reads this environment variable to identify the notebook behind the run.
os.environ["WANDB_NOTEBOOK_NAME"] = file_name

In [4]:
# !pip install wandb
import wandb
# Authenticate with Weights & Biases before any run is created.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mshannen[0m ([33mshannen-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

Set up experiment and hyperparameters

In [5]:
# Start a wandb run in the project named after the model, recording the batch
# size and dataset name as the run configuration.
wandb.init(
    project=model_name,
    config={
        "batch_size": batch_size,
        "dataset": dataset,
    },
)

## Convert the dataset to CoNLL2003 format

In [6]:
# Base directory that loadData() joins each CSV file name onto.
root_data_dir = data_directory

# NOTE(review): `biodivner_dataset` does not appear to be referenced anywhere
# else in the visible notebook — confirm before removing.
biodivner_dataset = "train"
# File names of the three splits, relative to root_data_dir.
train_csv_file_path = "train.csv"
val_csv_file_path = "dev.csv"
test_csv_file_path = "test.csv"

In [7]:
def loadData(csv_file_path, root_dir=None):
  """Read one dataset split from CSV and forward-fill missing cells.

  Args:
    csv_file_path: File name (or relative path) of the CSV, joined onto
      ``root_dir``.
    root_dir: Directory containing the CSV. Defaults to the notebook-level
      ``root_data_dir`` when omitted, which preserves the original behaviour.

  Returns:
    A ``pandas.DataFrame`` in which every missing cell has been replaced by
    the last valid value above it in the same column.
  """
  if root_dir is None:
    root_dir = root_data_dir
  dataset_path = os.path.join(root_dir, csv_file_path)
  data = pd.read_csv(dataset_path, encoding="latin1")
  # NOTE(review): the fill applies to every column, so a blank or NA-like
  # 'Word'/'Tag' cell (pandas parses e.g. "NA" or "null" as missing by
  # default) is silently replaced by the previous row's value — confirm this
  # is acceptable for the corpus.
  # DataFrame.ffill() is the supported spelling; fillna(method="ffill") is
  # deprecated and emits a FutureWarning on recent pandas.
  return data.ffill()

In [8]:
# Load the train, dev and test splits. Each frame is expected to carry the
# 'Sentence #', 'Word' and 'Tag' columns that convert() reads.
data = loadData(train_csv_file_path)
val_data = loadData(val_csv_file_path)
test_data = loadData(test_csv_file_path)

  data = data.fillna(method="ffill")
  data = data.fillna(method="ffill")
  data = data.fillna(method="ffill")


In [9]:
# Tag inventory: B-/I- pairs for the six entity types plus the outside tag 'O'.
# The position of a tag in this list is its integer class id.
label_list = [
    'B-Phenomena', 'I-Phenomena',
    'B-Quality', 'I-Quality',
    'B-Location', 'I-Location',
    'B-Environment', 'I-Environment',
    'B-Matter', 'I-Matter',
    'B-Organism', 'I-Organism',
    'O',
]
# Build the id -> tag lookup first, then invert it for tag -> id.
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}
print(label2id)
print(id2label)

{'B-Phenomena': 0, 'I-Phenomena': 1, 'B-Quality': 2, 'I-Quality': 3, 'B-Location': 4, 'I-Location': 5, 'B-Environment': 6, 'I-Environment': 7, 'B-Matter': 8, 'I-Matter': 9, 'B-Organism': 10, 'I-Organism': 11, 'O': 12}
{0: 'B-Phenomena', 1: 'I-Phenomena', 2: 'B-Quality', 3: 'I-Quality', 4: 'B-Location', 5: 'I-Location', 6: 'B-Environment', 7: 'I-Environment', 8: 'B-Matter', 9: 'I-Matter', 10: 'B-Organism', 11: 'I-Organism', 12: 'O'}


In [10]:
import re

def convert(orig, label_map=None):
    """Group a token-per-row frame into one row per sentence.

    Args:
        orig: DataFrame (or DataFrame-like data) with the columns
            'Sentence #', 'Word' and 'Tag'. It is never modified.
        label_map: Mapping from tag string to integer id. Defaults to the
            notebook-level ``label2id``.

    Returns:
        A DataFrame with a fresh 0..n-1 index and two columns: 'tokens'
        (list of words) and 'labels' (list of integer ids), one row per
        sentence, ordered by each sentence's first appearance in ``orig``.

    Raises:
        ValueError: if a 'Sentence #' value contains no digits.
        KeyError: if a tag is missing from ``label_map``.
    """
    if label_map is None:
        label_map = label2id

    # Work on an explicit copy: depending on the pandas version,
    # pd.DataFrame(frame) can share storage with its input, so overwriting
    # 'Sentence #' below could otherwise leak into the caller's frame and
    # make a second call fail.
    df = pd.DataFrame(orig).copy()

    def _sentence_number(raw):
        # Extract the integer from values such as "Sentence: 12".
        match = re.search(r'\d+', str(raw))
        if match is None:
            raise ValueError(f"No sentence number found in 'Sentence #' value: {raw!r}")
        return int(match.group())

    df['Sentence #'] = df['Sentence #'].apply(_sentence_number)

    # Remember where each row came from so sentences keep their source order
    # (groupby sorts by sentence number).
    df['Original Order'] = range(len(df))

    # Collect the words and tags of each sentence into lists.
    grouped = (
        df.groupby('Sentence #')
        .agg({'Word': list, 'Tag': list, 'Original Order': 'first'})
        .reset_index()
    )

    # Restore source order, drop the bookkeeping columns and reset the index
    # so the result has a plain RangeIndex (a non-default index would be
    # carried into Dataset.from_pandas as an extra column).
    grouped = (
        grouped.sort_values(by='Original Order')
        .drop(columns=['Original Order', 'Sentence #'])
        .rename(columns={'Word': 'tokens', 'Tag': 'labels'})
        .reset_index(drop=True)
    )

    # Encode tag strings as integer class ids.
    grouped['labels'] = grouped['labels'].apply(
        lambda tags: [label_map[tag] for tag in tags]
    )
    return grouped

# print(grouped)


In [11]:
# Turn each token-per-row split into one row per sentence with 'tokens' and
# integer-encoded 'labels' lists.
train_df = convert(data)
val_df = convert(val_data)
test_df = convert(test_data)

In [12]:
# Sanity check: every sentence in every split must have exactly one label per
# token. Iterating positionally avoids relying on the frames' index labels.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    for row, (tokens, labels) in enumerate(zip(split_df['tokens'], split_df['labels'])):
        assert len(tokens) == len(labels), (
            f"{split_name} split, row {row}: {len(tokens)} tokens but {len(labels)} labels"
        )
# print(train_df['tokens'])

In [13]:
# Wrap the per-sentence frames as Hugging Face Dataset objects for .map() and Trainer.
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [14]:
# Show the dataset schema/size and the first example as a quick visual check.
print(train_dataset)
print(train_dataset[0])

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 1918
})
{'tokens': ['Samplenr', 'Seedlingnr', 'Plot', 'Record', 'Date', 'Planted_Species', 'Density', 'Treatment', 'Dead', 'Height_P', 'Height_G', 'Leaves_Liv', 'Leaves_Dam', 'Leaves_Dead', 'Damage_pro', 'Biomass_Above', 'Biomass_Below', 'List', 'of', 'headers', 'of', 'the', 'data', 'columns', 'in', 'this', 'dataset', 'Pilot', 'experiment', '117.8998', '118.1483', '29.2852', '29.10178', '########', '########', 'markus_ger', 'erfmeier', 'Common', 'Garden', 'Experiment', ':', 'Seedling', 'addition', 'experiment', '-', 'growth', 'and', 'biomass', 'data', 'While', 'coexistence', 'in', 'plant', 'communities', 'is', 'frequently', 'explained', 'by', 'effects', 'of', 'resource', 'niche', 'partitioning', ',', 'the', 'Janzen-Connell', '(', 'J-C', ')', 'hypothesis', 'is', 'an', 'alternative', 'approach', 'that', 'has', 'been', 'assumed', 'as', 'a', 'major', 'ecological', 'mechanism', 'explaining', 'high', 'species', 'richness', 'levels',

## Tokenize the dataset

In [15]:
# Tag inventory, repeated from the label-map cell earlier in the notebook.
# compute_metrics and the test evaluation decode class ids by position in this
# list, so its order must stay identical to the one used to build label2id.
label_list = ['B-Phenomena', 'I-Phenomena', 'B-Quality', 'I-Quality', 'B-Location', 'I-Location', 'B-Environment', 'I-Environment', 'B-Matter', 'I-Matter', 'B-Organism', 'I-Organism', 'O']


In [16]:
task = "ner"  # used in the TrainingArguments output directory name (f"test-{task}")
model_checkpoint = model_src  # pretrained checkpoint; reassigned to the fine-tuned model before inference

# Tokenizer belonging to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 805kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 823kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 707kB/s]
special_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 578kB/s]


In [17]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Args:
        examples: A batch with "tokens" (list of word lists) and "labels"
            (list of per-word label-id lists).

    Returns:
        The tokenizer output with an added "labels" entry. In each sequence
        the first sub-token of every word carries that word's label id; every
        other position (tokens with no source word, and later sub-tokens of a
        word) is set to -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_labels in enumerate(examples["labels"]):
        row_labels = []
        last_word = None
        # word_ids() maps each produced token back to the index of its source word.
        for word in encoded.word_ids(batch_index=row):
            starts_new_word = word is not None and word != last_word
            row_labels.append(word_labels[word] if starts_new_word else -100)
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [18]:
# Tokenize every split in batches; each result gains the tokenizer outputs and
# the aligned "labels" column produced by tokenize_and_align_labels.
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
dev_tokenized_datasets = dev_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 1918/1918 [00:00<00:00, 2042.37 examples/s]
Map: 100%|██████████| 240/240 [00:00<00:00, 1281.52 examples/s]
Map: 100%|██████████| 240/240 [00:00<00:00, 1243.42 examples/s]


In [19]:
from transformers import DataCollatorForTokenClassification

# Collator that batches the tokenized examples for token classification.
# NOTE(review): `data_collator` is assigned again in the training-setup cell.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [20]:
import evaluate

# seqeval evaluator used by compute_metrics during training-time evaluation.
seqeval = evaluate.load("seqeval")

In [21]:
import numpy as np

# labels = [label_list[i] for i in example[f"ner_tags"]]




In [22]:
# Token-classification model initialised from the pretrained checkpoint, with
# the tag <-> id maps passed to from_pretrained.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, id2label=id2label, label2id=label2id
)

args = TrainingArguments(
    f"test-{task}",
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy = "epoch",
    num_train_epochs=3
)

data_collator = DataCollatorForTokenClassification(tokenizer)
# `datasets.load_metric` is deprecated (and removed in newer `datasets`
# releases). Reuse the seqeval evaluator already loaded through `evaluate`, so
# `metric` stays available to later cells with the same compute() interface.
metric = seqeval


def compute_metrics(p):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        p: A (predictions, labels) pair; predictions are per-token class
            scores reduced with argmax over axis 2, labels are integer ids
            with -100 at positions to ignore.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label (gold id != -100).
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    reported = ("precision", "recall", "f1", "accuracy")
    return {name: results[f"overall_{name}"] for name in reported}

# Assemble the Trainer: train on the train split, evaluate on the dev split,
# and report seqeval scores through compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets,
    eval_dataset=dev_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

config.json: 100%|██████████| 571/571 [00:00<00:00, 760kB/s]
pytorch_model.bin: 100%|██████████| 438M/438M [00:27<00:00, 16.0MB/s] 
Some weights of MPNetForTokenClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("seqeval")


Remove any existing `./ner.model` directory so that the model saved after training is not mixed with files from a previous run.

In [23]:
# Location the fine-tuned model is saved to after training.
directory_path = "./ner.model"

if not os.path.exists(directory_path):
    print(f"Directory '{directory_path}' does not exist.")
else:
    try:
        # Delete the directory and everything under it.
        shutil.rmtree(directory_path)
    except Exception as e:
        # Report the failure and carry on rather than aborting the notebook.
        print(f"Error removing directory '{directory_path}': {e}")
    else:
        print(f"Directory '{directory_path}' removed successfully.")

Directory './ner.model' does not exist.


In [24]:
# Fine-tune, run one more evaluation on the dev split, then save the model.
trainer.train()
trainer.evaluate()  # NOTE(review): the returned metrics are discarded; not being the last expression, they are not displayed either
trainer.save_model('ner.model')

You're using a MPNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.9195,0.857559,0.0,0.0,0.0,0.87321
2,0.3854,0.359651,0.588448,0.528649,0.556948,0.931654
3,0.26,0.240789,0.675906,0.685405,0.680623,0.949342


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Making an inference with the finetuned model

In [25]:
from transformers import pipeline

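Set `model_checkpoint` to the fine-tuned model to load for inference: either the local `./ner.model` directory saved above or a model artifact downloaded from W&B.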

In [26]:
# Point at the directory written by trainer.save_model('ner.model').
model_checkpoint = "./ner.model"

# artifact = wandb.use_artifact("electra-small-discriminator:latest")
# model_checkpoint = artifact.download()

In [27]:
# Build an inference pipeline from the fine-tuned checkpoint; with an
# aggregation strategy set, the output is grouped into entity spans.
token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="first")

In [28]:
# Smoke-test the pipeline on a single example sentence.
token_classifier("Birgus latro is widely distributed throughout the Western Pacific and eastern Indian Oceans")

[{'entity_group': 'Environment',
  'score': 0.2579761,
  'word': 'oceans',
  'start': 85,
  'end': 91}]

## Get precision, f1-score, and recall for each entity group

In [29]:
# Score the held-out test split with the fine-tuned model.
predictions, labels, _ = trainer.predict(test_tokenized_datasets)
predictions = np.argmax(predictions, axis=2)

# Drop every position labelled -100 (tokens without a source word and
# non-initial sub-tokens), then decode the remaining ids to tag strings.
true_predictions = [
    [label_list[pred] for (pred, gold) in zip(prediction, label) if gold != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[gold] for (pred, gold) in zip(prediction, label) if gold != -100]
    for prediction, label in zip(predictions, labels)
]

# Use the same `evaluate`-based seqeval evaluator as compute_metrics instead of
# the object created with the deprecated `datasets.load_metric`. The full
# result (per-entity and overall scores) is kept for logging to wandb.
results = seqeval.compute(predictions=true_predictions, references=true_labels)
results

{'Environment': {'precision': 0.66,
  'recall': 0.7719298245614035,
  'f1': 0.711590296495957,
  'number': 171},
 'Location': {'precision': 0.5,
  'recall': 0.05263157894736842,
  'f1': 0.09523809523809525,
  'number': 38},
 'Matter': {'precision': 0.6987951807228916,
  'recall': 0.35365853658536583,
  'f1': 0.4696356275303643,
  'number': 164},
 'Organism': {'precision': 0.5752688172043011,
  'recall': 0.7016393442622951,
  'f1': 0.6322008862629247,
  'number': 305},
 'Phenomena': {'precision': 0.6888888888888889,
  'recall': 0.47692307692307695,
  'f1': 0.5636363636363636,
  'number': 65},
 'Quality': {'precision': 0.7017543859649122,
  'recall': 0.7655502392344498,
  'f1': 0.7322654462242564,
  'number': 418},
 'overall_precision': 0.6525862068965518,
 'overall_recall': 0.6520241171403962,
 'overall_f1': 0.6523050409306335,
 'overall_accuracy': 0.929120495158231}

In [30]:
artifact = wandb.Artifact(name=model_name, type="model")
artifact.add_dir(local_path="./ner.model")  # Add the saved model directory to the artifact
wandb.log(results, commit=True)  # Log the test-set seqeval results to the run
wandb.log_artifact(artifact)  # Log the model artifact to the current run

[34m[1mwandb[0m: Adding directory to artifact (./ner.model)... Done. 5.1s


<Artifact all-mpnet-base>

In [31]:
# Close the wandb run started by wandb.init().
wandb.finish()