# Named Entity Recognition

## Set variables

In [51]:
# Notebook file name; exported to Weights & Biases through WANDB_NOTEBOOK_NAME.
file_name = "BiodivNER_DistilBERT_ver1.ipynb"
# Per-device training batch size; also recorded in the wandb run config.
batch_size = 4
# Dataset label recorded in the wandb run config.
dataset = "BiodivNER"
# NOTE(review): this points at the COPIOUS corpus and is only referenced by the
# commented-out CoNLL loader; the BiodivNER CSV files are read from root_data_dir.
data_directory = "../Datasets/NER/COPIOUS-txt/"
# Pretrained checkpoint used for both the tokenizer and the model.
model_src = "distilbert-base-cased"
model_name = "biodivner_distilbert-base-cased" # (wandb project name, reused as the artifact name)

## Install and import necessary libraries

In [52]:
# !pip install transformers
# !pip install datasets
# !pip install seqeval
# !pip install accelerate -U

In [53]:
import os
import shutil
import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

## Set up Weights and Biases

In [54]:
# Expose this notebook's file name to Weights & Biases via its environment variable.
os.environ["WANDB_NOTEBOOK_NAME"] = file_name

In [55]:
# !pip install wandb
import wandb
# Authenticate with Weights & Biases before a run is created.
wandb.login()

True

Set up experiment and hyperparameters

In [56]:
# Start a wandb run in the project named after the model.
# Only the batch size and dataset label are recorded as run config here.
wandb.init(
    project=model_name,
    config={
        "batch_size": batch_size,
        "dataset": dataset,
    },
)

## Convert the dataset to CoNLL2003 format

In [57]:
# def read_CoNLL2003_format(filename, idx=3):
#     """Read file in CoNLL-2003 shared task format"""

#     # read file
#     lines =  open(filename, encoding="utf-8").read().strip()

#     # find sentence-like boundaries
#     lines = lines.split("\n\n")

#      # split on newlines
#     lines = [line.split("\n") for line in lines]

#     # get tokens
#     tokens = [[l.split()[0] for l in line] for line in lines]

#     # get labels/tags
#     labels = [[l.split()[idx] for l in line] for line in lines]

#     #convert to df
#     data= {'tokens': tokens, 'labels': labels}
#     df=pd.DataFrame(data=data)

#     return df

In [58]:
# def flatten(l):
#     return [item for sublist in l for item in sublist]

In [59]:
# DATADIR = data_directory

# def get_data(trainfile=DATADIR + "train.txt",
#              devfile=DATADIR + "dev.txt",
#              testfile=DATADIR + "test.txt"):

#     train = read_CoNLL2003_format(trainfile, idx=3)
#     dev = read_CoNLL2003_format(devfile, idx=3)
#     print("Train data: %d sentences, %d tokens"%(len(train),len(flatten(train.tokens))))

#     print("Dev data: %d sentences, %d tokens"%(len(dev),len(flatten(dev.tokens))))

#     test = read_CoNLL2003_format(testfile, idx=3)
#     print("Test data: %d sentences, %d tokens"%(len(test),len(flatten(test.tokens))))

#     return train, test, dev

In [60]:
# train, test, dev = get_data()

# train_dataset = Dataset.from_pandas(train)
# dev_dataset = Dataset.from_pandas(dev)
# test_dataset = Dataset.from_pandas(test)

In [61]:
# print(train)

In [62]:
# print(train_dataset)

In [63]:
# Directory holding the BiodivNER CSV splits (relative to the notebook location).
root_data_dir = "../Datasets/NER/BiodivNER/"

# NOTE(review): biodivner_dataset is not referenced by any visible cell.
biodivner_dataset = "train"
# File names of the three splits, resolved against root_data_dir by loadData.
train_csv_file_path = "train.csv"
val_csv_file_path = "dev.csv"
test_csv_file_path = "test.csv"

In [64]:
def loadData(csv_file_path, data_dir=None):
  """Read one dataset split from CSV and forward-fill missing cells.

  Args:
    csv_file_path: File name (or relative path) of the CSV split.
    data_dir: Directory the path is resolved against. Defaults to the
      notebook-level ``root_data_dir`` when omitted.

  Returns:
    A DataFrame in which every missing cell has been replaced by the last
    valid value above it in the same column.
  """
  base_dir = root_data_dir if data_dir is None else data_dir
  dataset_path = os.path.join(base_dir, csv_file_path)
  data = pd.read_csv(dataset_path, encoding="latin1")
  # NOTE(review): read_csv's default NA parsing also treats tokens such as
  # "NA" or "null" as missing, so they would be forward-filled too - confirm
  # the corpus contains no such words.
  # DataFrame.ffill() replaces the deprecated fillna(method="ffill").
  return data.ffill()

In [65]:
# Load the train / dev / test splits in one pass over their file names.
data, val_data, test_data = (
    loadData(split_path)
    for split_path in (train_csv_file_path, val_csv_file_path, test_csv_file_path)
)

  data = data.fillna(method="ffill")
  data = data.fillna(method="ffill")
  data = data.fillna(method="ffill")


In [66]:
# Preview the first 105 token rows of the training split (one row per word).
data.head(105)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 0,Samplenr,O
1,Sentence: 0,Seedlingnr,O
2,Sentence: 0,Plot,O
3,Sentence: 0,Record,O
4,Sentence: 0,Date,O
...,...,...,...
100,Sentence: 1,",",O
101,Sentence: 1,frames,O
102,Sentence: 1,",",O
103,Sentence: 1,landing,O


In [67]:
import re

def convert(orig):
    """Collapse a one-token-per-row frame into one row per sentence.

    Args:
        orig: DataFrame (or DataFrame-compatible data) with the columns
            'Sentence #', 'Word' and 'Tag'. 'Sentence #' holds strings that
            contain the sentence number, e.g. "Sentence: 12".

    Returns:
        A DataFrame with the columns 'tokens' and 'labels'. Each row holds the
        word list and tag list of one sentence, sentences appear in the order
        they first occur in ``orig``, and the index is a fresh 0..n-1 range.
        ``orig`` itself is left unmodified.
    """
    # pd.DataFrame(frame) is not guaranteed to be an independent copy, so copy
    # explicitly before overwriting a column; this keeps the cell re-runnable.
    df = pd.DataFrame(orig).copy()

    # Extract the integer from 'Sentence #'
    df['Sentence #'] = df['Sentence #'].apply(lambda x: int(re.search(r'\d+', x).group()))

    # sort=False keeps groups in order of first appearance, which is the order
    # the original sort on a helper "Original Order" column restored.
    grouped = df.groupby('Sentence #', sort=False).agg({'Word': list, 'Tag': list})

    # Drop the sentence-number index so downstream consumers see a plain
    # RangeIndex instead of a permuted one.
    grouped = grouped.reset_index(drop=True)

    return grouped.rename(columns={'Word': 'tokens', 'Tag': 'labels'})

# print(grouped)


In [68]:
# Regroup every split from token rows into sentence rows.
train_df, val_df, test_df = map(convert, (data, val_data, test_data))

In [69]:
# Sanity check: every sentence must carry exactly one label per token.
# Iterating the columns directly avoids depending on the frame's index labels.
mismatched_rows = [
    row
    for row, (tokens, tags) in enumerate(zip(train_df['tokens'], train_df['labels']))
    if len(tokens) != len(tags)
]
assert not mismatched_rows, (
    f"tokens/labels length mismatch in train_df rows: {mismatched_rows}"
)

In [70]:
# Wrap each sentence-level frame in a Hugging Face Dataset.
# preserve_index=False keeps a non-default pandas index from being stored as
# an extra "__index_level_0__" feature next to 'tokens' and 'labels'.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
dev_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [71]:
# Show the feature names and row count of the training Dataset.
print(train_dataset)

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 1918
})


## Tokenize the dataset

In [72]:
# BIO tag inventory; a tag's position in this list is its integer class id.
label_list = [
    'B-Phenomena', 'I-Phenomena',
    'B-Quality', 'I-Quality',
    'B-Location', 'I-Location',
    'B-Environment', 'I-Environment',
    'B-Matter', 'I-Matter',
    'B-Organism', 'I-Organism',
    'O',
]
# Tag name -> class id, and the inverse class id -> tag name.
label2id = {tag: idx for idx, tag in enumerate(label_list)}
id2label = dict(enumerate(label_list))
print(label2id)
print(id2label)

{'B-Phenomena': 0, 'I-Phenomena': 1, 'B-Quality': 2, 'I-Quality': 3, 'B-Location': 4, 'I-Location': 5, 'B-Environment': 6, 'I-Environment': 7, 'B-Matter': 8, 'I-Matter': 9, 'B-Organism': 10, 'I-Organism': 11, 'O': 12}
{0: 'B-Phenomena', 1: 'I-Phenomena', 2: 'B-Quality', 3: 'I-Quality', 4: 'B-Location', 5: 'I-Location', 6: 'B-Environment', 7: 'I-Environment', 8: 'B-Matter', 9: 'I-Matter', 10: 'B-Organism', 11: 'I-Organism', 12: 'O'}


In [73]:
# Task tag; used to name the training output directory ("test-<task>").
task = "ner"
# Start from the pretrained checkpoint selected in the configuration cell.
model_checkpoint = model_src

# Tokenizer matching the checkpoint; used for encoding and by the data collator.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [74]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Args:
        examples: A batch with "tokens" (one list of words per sentence) and
            "labels" (one list of tag strings per sentence, same lengths).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list of class ids per sentence, aligned to the tokenizer's sub-tokens.
        Positions without a source word (special tokens) get -100.

    Raises:
        KeyError: If a tag is neither in ``label2id`` nor the literal '0'.
    """
    # When True, every sub-token of a word receives the word's tag; when False
    # only the first sub-token is labelled and the rest are set to -100.
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), max_length=512, truncation=True, is_split_into_words=True)

    # A tag written as the digit '0' is treated as the outside tag 'O'. The
    # previous code mapped it to class id 0, which is 'B-Phenomena' here.
    outside_id = label2id['O']

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # No source word for this position: exclude it with -100.
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(outside_id)
            elif word_idx != previous_word_idx:
                # First sub-token of a word always carries the word's tag.
                label_ids.append(label2id[label[word_idx]])
            else:
                # Continuation sub-token of the same word.
                label_ids.append(label2id[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Tokenize and label-align all three splits, in train / dev / test order.
train_tokenized_datasets, dev_tokenized_datasets, test_tokenized_datasets = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, dev_dataset, test_dataset)
)

Map:   0%|          | 0/1918 [00:00<?, ? examples/s]

Map: 100%|██████████| 1918/1918 [00:01<00:00, 1686.37 examples/s]
Map: 100%|██████████| 240/240 [00:00<00:00, 1275.45 examples/s]
Map: 100%|██████████| 240/240 [00:00<00:00, 1103.98 examples/s]


In [75]:
# Inspect the tokenized training set. The two lengths printed below differ
# because 'tokens' still holds the original words while 'labels' now holds one
# id per tokenizer position (sub-tokens plus special tokens).
print(train_tokenized_datasets)
print(len(train_tokenized_datasets['tokens'][0]))
print(len(train_tokenized_datasets['labels'][0]))

Dataset({
    features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 1918
})
97
180


## Finetuning the model to the dataset

In [76]:
# Pretrained encoder with a token-classification head sized by the label maps.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Training hyperparameters; checkpoints go to "test-<task>".
args = TrainingArguments(
    output_dir=f"test-{task}",
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=64,           # batch size for evaluation
    warmup_steps=500,                        # warmup steps for the learning rate scheduler
    weight_decay=0.01,                       # strength of weight decay
    evaluation_strategy="epoch",
    logging_dir='./logs',                    # directory for storing logs
    logging_steps=10,
)

# seqeval metric, used by compute_metrics and the test-set evaluation.
metric = load_metric("seqeval")
# Collator built from the tokenizer; handed to the Trainer to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer)


def compute_metrics(p):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        p: A (predictions, labels) pair. Predictions are per-position class
            scores (argmax is taken over axis 2); labels are class ids with
            -100 marking positions to ignore.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy",
        taken from seqeval's overall_* results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label (gold id != -100).
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Trainer wiring: train on the training split, evaluate on the dev split.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_tokenized_datasets,
    eval_dataset=dev_tokenized_datasets,
    compute_metrics=compute_metrics,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Clear the model output directory if it still exists from a previous run.

In [77]:
# Output directory of the fine-tuned model; removed up front so a fresh run
# does not mix with files from an earlier one.
directory_path = "./ner.model"

if not os.path.exists(directory_path):
    print(f"Directory '{directory_path}' does not exist.")
else:
    try:
        # Delete the directory together with everything inside it.
        shutil.rmtree(directory_path)
        print(f"Directory '{directory_path}' removed successfully.")
    except Exception as err:
        # Best effort: report the failure and let the notebook continue.
        print(f"Error removing directory '{directory_path}': {err}")

Directory './ner.model' does not exist.


In [78]:
# Fine-tune, run a final evaluation on the dev split, then save the model.
# The save path is the same relative directory that the cleanup cell clears
# and that the inference pipeline loads from.
trainer.train()
trainer.evaluate()
trainer.save_model('ner.model')

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3505,0.236896,0.49551,0.503734,0.499588,0.928923
2,0.164,0.171418,0.614844,0.653112,0.6334,0.94241
3,0.0963,0.136327,0.689015,0.707884,0.698322,0.954279




## Making an inference with the finetuned model

In [79]:
from transformers import pipeline

Change model_checkpoint as needed

In [80]:
# Point inference at the locally saved fine-tuned model. This rebinds
# model_checkpoint, which earlier held the pretrained checkpoint name.
model_checkpoint = "./ner.model"

# Alternative: download a model artifact from wandb instead.
# artifact = wandb.use_artifact("electra-small-discriminator:latest")
# model_checkpoint = artifact.download()

In [81]:
# Build a token-classification pipeline from the saved checkpoint directory.
token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="first")

In [82]:
# Smoke-test the pipeline on one sentence.
# NOTE(review): the recorded run returned an empty list for this input, i.e.
# no entities were reported - worth confirming that is expected.
token_classifier("Birgus latro is widely distributed throughout the Western Pacific and eastern Indian Oceans")

[]

## Get precision, f1-score, and recall for each entity group

In [83]:
# Score the held-out test split and report per-entity seqeval metrics.
predictions, labels, _ = trainer.predict(test_tokenized_datasets)
predictions = np.argmax(predictions, axis=2)

# Pair predicted and gold tags, skipping ignored positions (label id -100).
kept_pairs = [
    [(label_list[p], label_list[l]) for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_predictions = [[predicted for predicted, _ in row] for row in kept_pairs]
true_labels = [[gold for _, gold in row] for row in kept_pairs]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'Environment': {'precision': 0.6133333333333333,
  'recall': 0.6540284360189573,
  'f1': 0.6330275229357798,
  'number': 211},
 'Location': {'precision': 0.3958333333333333,
  'recall': 0.3275862068965517,
  'f1': 0.3584905660377358,
  'number': 58},
 'Matter': {'precision': 0.6787878787878788,
  'recall': 0.4444444444444444,
  'f1': 0.5371702637889687,
  'number': 252},
 'Organism': {'precision': 0.828665568369028,
  'recall': 0.8793706293706294,
  'f1': 0.8532654792196777,
  'number': 572},
 'Phenomena': {'precision': 0.8072289156626506,
  'recall': 0.6504854368932039,
  'f1': 0.7204301075268817,
  'number': 103},
 'Quality': {'precision': 0.7173295454545454,
  'recall': 0.7902973395931142,
  'f1': 0.7520476545048398,
  'number': 639},
 'overall_precision': 0.7336244541484717,
 'overall_recall': 0.732425068119891,
 'overall_f1': 0.7330242705208617,
 'overall_accuracy': 0.9379174049338966}

In [84]:
# Package the saved model directory as a wandb artifact and log it together
# with the test-set metrics computed in the previous cell.
artifact = wandb.Artifact(name=model_name, type="model")
artifact.add_dir(local_path="./ner.model")  # Add the saved model directory to the artifact
wandb.log(results, commit=True)
wandb.log_artifact(artifact)  # Logs the artifact to the current run

[34m[1mwandb[0m: Adding directory to artifact (./ner.model)... 

Done. 3.0s


<Artifact biodivner_distilbert-base-cased>

In [85]:
# Close the wandb run; the recorded output below is its run history and summary.
wandb.finish()

0,1
eval/accuracy,▁▅██
eval/f1,▁▆██
eval/loss,█▃▁▁
eval/precision,▁▅██
eval/recall,▁▆██
eval/runtime,█▇▄▁
eval/samples_per_second,▁▂▅█
eval/steps_per_second,▁▂▅█
overall_accuracy,▁
overall_f1,▁

0,1
eval/accuracy,0.95428
eval/f1,0.69832
eval/loss,0.13633
eval/precision,0.68901
eval/recall,0.70788
eval/runtime,1.7691
eval/samples_per_second,135.665
eval/steps_per_second,1.131
overall_accuracy,0.93792
overall_f1,0.73302
