# Named Entity Recognition

## Set variables

In [2]:
# Run-level configuration shared by the cells below.
file_name = "BiodivNER_Funnel_ver1.ipynb"  # exported to wandb as the notebook name
batch_size = 2  # per-device training batch size passed to TrainingArguments
dataset = "BiodivNER"  # recorded in the wandb run config
# NOTE(review): this points at the COPIOUS corpus although `dataset` is BiodivNER, and it is
# not referenced anywhere else in the visible notebook (the CSVs are read from
# `root_data_dir`) - confirm whether it can be removed.
data_directory = "../Datasets/NER/COPIOUS-txt/"
model_src = "funnel-transformer/small"  # checkpoint id loaded by the tokenizer and the model
model_name = "BiodivNER_funnel-transformer-small" # (wandb project name, also used as the artifact name)

## Install and import necessary libraries

In [3]:
# !pip install transformers
# !pip install datasets
# !pip install seqeval
# !pip install accelerate -U

In [4]:
import os
import shutil
import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Set up Weights and Biases

In [5]:
# Publish the notebook file name through the WANDB_NOTEBOOK_NAME environment variable
# (presumably read by wandb to attribute the run to this notebook - confirm).
os.environ["WANDB_NOTEBOOK_NAME"] = file_name

In [6]:
# !pip install wandb
import wandb
# Authenticate with Weights & Biases; the recorded output shows an already logged-in session.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mshannen[0m ([33mshannen-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

Set up experiment and hyperparameters

In [7]:
# Start a wandb run in the project named by `model_name`, recording the batch size and
# dataset name as the run's config.
wandb.init(
    project=model_name,
    config={
        "batch_size": batch_size,
        "dataset": dataset,
    },
)

## Convert the dataset to CoNLL2003 format

In [8]:
# Directory holding the BiodivNER CSV splits; loadData() joins the file names below onto it.
root_data_dir = "../Datasets/NER/BiodivNER/"

biodivner_dataset = "train"  # NOTE(review): not referenced elsewhere in the visible notebook
train_csv_file_path = "train.csv"  # training split
val_csv_file_path = "dev.csv"  # validation split
test_csv_file_path = "test.csv"  # test split

In [9]:
def loadData(csv_file_path, data_dir=None):
  """Read one dataset split from CSV and forward-fill its missing cells.

  Args:
    csv_file_path: File name (or relative path) of the CSV inside the data directory.
    data_dir: Directory containing the CSV. Defaults to the notebook-level
      ``root_data_dir`` when omitted, which keeps existing one-argument calls working.

  Returns:
    A pandas DataFrame in which every missing cell has been replaced by the
    nearest non-missing value above it in the same column.
  """
  base_dir = root_data_dir if data_dir is None else data_dir
  dataset_path = os.path.join(base_dir, csv_file_path)
  data = pd.read_csv(dataset_path, encoding="latin1")
  # DataFrame.ffill() is the supported spelling of fillna(method="ffill"), which is
  # deprecated in recent pandas releases and emits a warning on every call.
  return data.ffill()

In [10]:
# Load the three splits; each frame is forward-filled by loadData().
data = loadData(train_csv_file_path)  # training split
val_data = loadData(val_csv_file_path)  # validation split
test_data = loadData(test_csv_file_path)  # test split

  data = data.fillna(method="ffill")
  data = data.fillna(method="ffill")
  data = data.fillna(method="ffill")


In [11]:
# Preview the first 105 rows of the training split (one row per word with its tag).
data.head(105)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 0,Samplenr,O
1,Sentence: 0,Seedlingnr,O
2,Sentence: 0,Plot,O
3,Sentence: 0,Record,O
4,Sentence: 0,Date,O
...,...,...,...
100,Sentence: 1,",",O
101,Sentence: 1,frames,O
102,Sentence: 1,",",O
103,Sentence: 1,landing,O


In [12]:
import re

def convert(orig):
    """Group a word-per-row frame into one row per sentence.

    Args:
        orig: DataFrame (or anything ``pd.DataFrame`` accepts) with the columns
            'Sentence #', 'Word' and 'Tag'. 'Sentence #' must contain an integer
            somewhere in each value, e.g. ``"Sentence: 12"``.

    Returns:
        A new DataFrame with the columns 'tokens' and 'labels', each holding one
        list per sentence. Sentences appear in the order in which they first occur
        in ``orig`` and the index is a fresh 0..n-1 range.

    Raises:
        ValueError: If a 'Sentence #' value contains no digits.
    """
    df = pd.DataFrame(orig)

    # Derive the grouping key as a separate Series instead of overwriting the
    # 'Sentence #' column: the input frame is left untouched, so the function can be
    # called again on the same data (e.g. when a notebook cell is re-run).
    sentence_ids = df['Sentence #'].astype(str).str.extract(r'(\d+)', expand=False)
    missing = sentence_ids.isna()
    if missing.any():
        bad_value = df.loc[missing, 'Sentence #'].iloc[0]
        raise ValueError(f"No sentence number found in 'Sentence #' value {bad_value!r}")
    sentence_ids = sentence_ids.astype(int)

    # sort=False keeps groups in first-appearance order, which is what sorting on
    # the first original row position of each group would produce.
    grouped = (
        df.groupby(sentence_ids, sort=False)
        .agg({'Word': list, 'Tag': list})
        .rename(columns={'Word': 'tokens', 'Tag': 'labels'})
        # Drop the sentence-number index so positional access and
        # Dataset.from_pandas see a plain 0..n-1 index.
        .reset_index(drop=True)
    )
    return grouped

# print(grouped)


In [13]:
# Collapse each split from one row per word into one row per sentence
# (columns 'tokens' and 'labels').
train_df = convert(data)
val_df = convert(val_data)
test_df = convert(test_data)

In [14]:
# Sanity check: every sentence must carry exactly one label per token, otherwise
# label alignment during tokenization would index past the end of the label list.
# Rows are paired positionally so the check does not depend on the frame's index labels.
mismatched_rows = [
    row
    for row, (tokens, labels) in enumerate(zip(train_df['tokens'], train_df['labels']))
    if len(tokens) != len(labels)
]
assert not mismatched_rows, (
    f"{len(mismatched_rows)} training sentence(s) have a different number of tokens "
    f"and labels (first offending rows: {mismatched_rows[:5]})"
)
# print(train_df['tokens'])

In [15]:
# Wrap each per-sentence frame in a Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the Dataset: without it, a
# non-default index (which convert() can produce when it re-sorts sentences) would be
# stored as an extra '__index_level_0__' feature next to 'tokens' and 'labels'.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
dev_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [16]:
# Show the feature names and row count of the training Dataset.
print(train_dataset)

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 1918
})


## Tokenize the dataset

In [17]:
# Tag inventory: a B-/I- pair for each of the six entity types, then the outside tag.
# A tag's position in this list is its integer class id.
label_list = [
    'B-Phenomena', 'I-Phenomena',
    'B-Quality', 'I-Quality',
    'B-Location', 'I-Location',
    'B-Environment', 'I-Environment',
    'B-Matter', 'I-Matter',
    'B-Organism', 'I-Organism',
    'O',
]
# Forward (tag -> id) and reverse (id -> tag) lookups derived from the list order.
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))
print(label2id)
print(id2label)

{'B-Phenomena': 0, 'I-Phenomena': 1, 'B-Quality': 2, 'I-Quality': 3, 'B-Location': 4, 'I-Location': 5, 'B-Environment': 6, 'I-Environment': 7, 'B-Matter': 8, 'I-Matter': 9, 'B-Organism': 10, 'I-Organism': 11, 'O': 12}
{0: 'B-Phenomena', 1: 'I-Phenomena', 2: 'B-Quality', 3: 'I-Quality', 4: 'B-Location', 5: 'I-Location', 6: 'B-Environment', 7: 'I-Environment', 8: 'B-Matter', 9: 'I-Matter', 10: 'B-Organism', 11: 'I-Organism', 12: 'O'}


In [18]:
task = "ner"  # used to build the training output directory name ("test-ner")
model_checkpoint = model_src  # checkpoint id shared by the tokenizer and the model

# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json: 100%|██████████| 153/153 [00:00<00:00, 238kB/s]
config.json: 100%|██████████| 700/700 [00:00<00:00, 1.14MB/s]
vocab.txt: 100%|██████████| 231k/231k [00:00<00:00, 584kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 787kB/s]
special_tokens_map.json: 100%|██████████| 153/153 [00:00<00:00, 259kB/s]


In [19]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and expand word-level tags to sub-token label ids.

    Args:
        examples: A batch mapping with 'tokens' (list of word lists) and 'labels'
            (list of tag-string lists, one tag per word).

    Returns:
        The tokenizer output for the batch with an added 'labels' entry holding one
        list of integer ids per sentence. Positions without a source word (special
        tokens) get -100; every sub-token of a word gets that word's id from
        ``label2id`` (controlled by the local ``label_all_tokens`` switch).

    Raises:
        KeyError: If a tag is not present in ``label2id``.
    """
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), max_length=512, truncation=True, is_split_into_words=True)

    labels = []
    for i, word_labels in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # No source word for this position: mark it with -100.
                label_ids.append(-100)
            else:
                tag = word_labels[word_idx]
                # A digit zero is treated as a mistyped outside tag. The previous
                # code mapped it to id 0, which is 'B-Phenomena' rather than 'O'.
                if tag == '0':
                    tag = 'O'
                if word_idx != previous_word_idx or label_all_tokens:
                    label_ids.append(label2id[tag])
                else:
                    # Continuation sub-token, only skipped when label_all_tokens is off.
                    label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Tokenize and label-align the train, dev and test splits, in that order, in batched mode.
train_tokenized_datasets, dev_tokenized_datasets, test_tokenized_datasets = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, dev_dataset, test_dataset)
)

Map:   0%|          | 0/1918 [00:00<?, ? examples/s]

Map: 100%|██████████| 1918/1918 [00:01<00:00, 1573.91 examples/s]
Map: 100%|██████████| 240/240 [00:00<00:00, 1423.26 examples/s]
Map: 100%|██████████| 240/240 [00:00<00:00, 1280.03 examples/s]


In [20]:
# Inspect the tokenized training split.
print(train_tokenized_datasets)
# The two lengths are expected to differ: 'tokens' still holds the original words, while
# 'labels' now holds one id per tokenizer position (sub-tokens plus -100 for special tokens).
print(len(train_tokenized_datasets['tokens'][0]))
print(len(train_tokenized_datasets['labels'][0]))

Dataset({
    features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1918
})
97
167


## Finetuning the model to the dataset

In [21]:
# Load the checkpoint with a token-classification head sized and named from the label maps.
model =  AutoModelForTokenClassification.from_pretrained(model_checkpoint,id2label=id2label, label2id=label2id)

# Training configuration; the first positional argument is the output directory ("test-ner").
# NOTE(review): the recorded run hit CUDA out-of-memory during training; the evaluation
# batch size of 64 is also much larger than the training batch size - confirm it fits.
args = TrainingArguments(
    f"test-{task}",
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log every 10 steps
    evaluation_strategy = "epoch",   # evaluate at the end of every epoch
    num_train_epochs=3               # total number of training epochs
)

# Collator built from the tokenizer; passed to the Trainer to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer)
# seqeval metric used by compute_metrics and by the per-entity evaluation further down.
# NOTE(review): load_metric is deprecated in recent `datasets` releases in favour of the
# separate `evaluate` package - confirm against the installed version.
metric = load_metric("seqeval")


def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-position class
            scores (arg-maxed over axis 2); ``labels`` holds the gold ids, with -100
            marking positions to ignore.

    Returns:
        A dict with the keys 'precision', 'recall', 'f1' and 'accuracy', taken from
        the metric's ``overall_*`` entries.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions whose gold id is a real label (not -100).
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

# Assemble the Trainer: training runs on the tokenized train split, per-epoch evaluation
# uses the dev split, and compute_metrics supplies the reported scores.
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=dev_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

pytorch_model.bin: 100%|██████████| 524M/524M [00:35<00:00, 14.8MB/s] 
Some weights of FunnelForTokenClassification were not initialized from the model checkpoint at funnel-transformer/small and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("seqeval")


Remove any existing `./ner.model` directory so that the newly trained model is saved into a clean location.

In [22]:
# Location the trained model is saved to; any earlier copy is deleted first.
directory_path = "./ner.model"

if not os.path.exists(directory_path):
    print(f"Directory '{directory_path}' does not exist.")
else:
    try:
        # Delete the directory together with everything inside it.
        shutil.rmtree(directory_path)
        print(f"Directory '{directory_path}' removed successfully.")
    except Exception as err:
        # Report the failure and carry on instead of stopping the notebook.
        print(f"Error removing directory '{directory_path}': {err}")

Directory './ner.model' does not exist.


In [23]:
# Fine-tune, evaluate on the dev split, then save the model to 'ner.model'.
# NOTE(review): in the recorded run this cell stopped inside train() with a CUDA
# OutOfMemoryError, so evaluate() and save_model() did not execute in that session.
trainer.train()
# The returned metrics are not displayed because this is not the cell's last expression.
trainer.evaluate()
trainer.save_model('ner.model')

You're using a FunnelTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in _worker
    output = module(*input, **kwargs)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/transformers/models/funnel/modeling_funnel.py", line 1485, in forward
    outputs = self.funnel(
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/transformers/models/funnel/modeling_funnel.py", line 1063, in forward
    encoder_outputs = self.encoder(
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/transformers/models/funnel/modeling_funnel.py", line 681, in forward
    layer_output = layer(query, key, value, attention_inputs, output_attentions=output_attentions)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/transformers/models/funnel/modeling_funnel.py", line 628, in forward
    attn = self.attention(query, key, value, attention_inputs, output_attentions=output_attentions)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/transformers/models/funnel/modeling_funnel.py", line 569, in forward
    token_type_attn = self.relative_token_type_attention(token_type_mat, q_head, cls_mask)
  File "/home/nlpbiodiv2023/.local/lib/python3.10/site-packages/transformers/models/funnel/modeling_funnel.py", line 533, in relative_token_type_attention
    token_type_attn = torch.where(
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 672.00 MiB. GPU 0 has a total capacty of 11.76 GiB of which 208.81 MiB is free. Process 1573421 has 6.39 GiB memory in use. Including non-PyTorch memory, this process has 5.13 GiB memory in use. Of the allocated memory 3.81 GiB is allocated by PyTorch, and 399.41 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


## Making an inference with the finetuned model

In [None]:
from transformers import pipeline

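Set `model_checkpoint` to the model that should be loaded for inference: either the local `./ner.model` directory saved above, or a model artifact downloaded from wandb.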

In [None]:
# Point inference at the locally saved fine-tuned model.
model_checkpoint = "./ner.model"

# Alternative: download a model artifact from wandb and load it from there.
# artifact = wandb.use_artifact("electra-small-discriminator:latest")
# model_checkpoint = artifact.download()

In [None]:
# Build a token-classification pipeline from the chosen checkpoint; aggregation_strategy="first"
# merges sub-token predictions into entity groups (see the 'entity_group' keys in the output below).
token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="first")

In [None]:
# Smoke test: run the pipeline on one example sentence.
token_classifier("Birgus latro is widely distributed throughout the Western Pacific and eastern Indian Oceans")

[{'entity_group': 'Organism',
  'score': 0.56286,
  'word': 'birgus latro',
  'start': 0,
  'end': 12}]

## Get precision, f1-score, and recall for each entity group

In [None]:
# Score the test split and reduce the per-position class scores to predicted class ids.
predictions, labels, _ = trainer.predict(test_tokenized_datasets)
predictions = np.argmax(predictions, axis=2)

# Pair each prediction with its gold id and drop the positions marked -100
# (ignored index, i.e. special tokens).
_kept_pairs = [
    [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]
true_predictions = [[label_list[pred_id] for pred_id, _gold in pairs] for pairs in _kept_pairs]
true_labels = [[label_list[gold_id] for _pred, gold_id in pairs] for pairs in _kept_pairs]

# Full seqeval report: one entry per entity type plus the overall scores.
results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'Environment': {'precision': 0.6153846153846154,
  'recall': 0.7120418848167539,
  'f1': 0.6601941747572816,
  'number': 191},
 'Location': {'precision': 0.3137254901960784,
  'recall': 0.5161290322580645,
  'f1': 0.3902439024390244,
  'number': 62},
 'Matter': {'precision': 0.6901408450704225,
  'recall': 0.4666666666666667,
  'f1': 0.5568181818181818,
  'number': 210},
 'Organism': {'precision': 0.7325038880248833,
  'recall': 0.8658088235294118,
  'f1': 0.7935973041280541,
  'number': 544},
 'Phenomena': {'precision': 0.704225352112676,
  'recall': 0.625,
  'f1': 0.662251655629139,
  'number': 80},
 'Quality': {'precision': 0.7223230490018149,
  'recall': 0.804040404040404,
  'f1': 0.7609942638623328,
  'number': 495},
 'overall_precision': 0.684971098265896,
 'overall_recall': 0.7490518331226296,
 'overall_f1': 0.7155797101449275,
 'overall_accuracy': 0.9359037849310223}

In [None]:
# Package the saved model as a wandb artifact named after the project, and log the test results.
artifact = wandb.Artifact(name=model_name, type="model")
artifact.add_dir(local_path="./ner.model")  # Add the saved model directory to the artifact
wandb.log(results, commit=True)  # Log the test-set seqeval results to the active run
wandb.log_artifact(artifact)  # Log the model artifact with the active run

[34m[1mwandb[0m: Adding directory to artifact (./ner.model)... 

Done. 4.6s


<Artifact BiodivNER_electra-base-discriminator>

In [None]:
# Close the wandb run; the recorded output below is the run history and summary it printed.
wandb.finish()

0,1
eval/accuracy,▁▅██
eval/f1,▁▅██
eval/loss,█▃▁▁
eval/precision,▁▄██
eval/recall,▁▆██
eval/runtime,▁█▃▇
eval/samples_per_second,█▁▆▂
eval/steps_per_second,█▁▆▂
overall_accuracy,▁
overall_f1,▁

0,1
eval/accuracy,0.95469
eval/f1,0.70715
eval/loss,0.13362
eval/precision,0.67628
eval/recall,0.74097
eval/runtime,2.8588
eval/samples_per_second,83.95
eval/steps_per_second,0.7
overall_accuracy,0.9359
overall_f1,0.71558
