# Fine-Tuning a BERT Encoder for Named Entity Recognition on CoNLL2003

Implemented for the NLP Cryptonite Research AI Taskphase by Pratham Shah - 240905614.

## Installing seqeval

In [16]:
!pip install seqeval



## Adding Imports

In [17]:
import os
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from seqeval.metrics import classification_report

## Defining Labels

In [18]:
NER_labels = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"]
label_id_map = {label: i for i, label in enumerate(NER_labels)}
id_label_map = {i: label for label, i in label_id_map.items()}

## Load CoNLL-2003 Data

In [19]:
def read_conll_file(file_path):
    tokens, tags = [], []
    temp_tokens, temp_tags = [], []
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if temp_tokens:
                    tokens.append(temp_tokens)
                    tags.append(temp_tags)
                    temp_tokens, temp_tags = [], []
                continue
            word, pos, chunk, ner = line.split()
            temp_tokens.append(word)
            temp_tags.append(label_id_map[ner])
    return tokens, tags

train_tokens, train_tags = read_conll_file("train.txt")
val_tokens, val_tags = read_conll_file("valid.txt")

## Tokenizer, Alignment and Model

In [None]:
model = "bert-large-uncased"

In [20]:
tokenizer = AutoTokenizer.from_pretrained(model)

def tokenize_and_align_labels(tokens_list, tags_list):
    encodings = {"input_ids": [], "attention_mask": [], "labels": []}

    for tokens, labels in zip(tokens_list, tags_list):
        tokenized = tokenizer(tokens, is_split_into_words=True, truncation=True, padding='max_length', max_length=128)
        word_ids = tokenized.word_ids()

        aligned_labels = []
        prev_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(-100)
            elif word_idx != prev_word_idx:
                aligned_labels.append(labels[word_idx])
            else:
                aligned_labels.append(-100)
            prev_word_idx = word_idx

        encodings["input_ids"].append(tokenized["input_ids"])
        encodings["attention_mask"].append(tokenized["attention_mask"])
        encodings["labels"].append(aligned_labels)

    return encodings

train_enc = tokenize_and_align_labels(train_tokens, train_tags)
val_enc = tokenize_and_align_labels(val_tokens, val_tags)

## NER Dataset creation

In [21]:
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return len(self.encodings["input_ids"])
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

train_dataset = NERDataset(train_enc)
val_dataset = NERDataset(val_enc)

## Loading the Model

In [22]:
model = AutoModelForTokenClassification.from_pretrained(model, num_labels=len(NER_labels))
data_collator = DataCollatorForTokenClassification(tokenizer)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Function for Metrics Calculation

In [23]:
def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=-1)
    true_preds, true_labels = [], []
    for pred_seq, label_seq in zip(preds, labels):
        temp_preds, temp_labels = [], []
        for p, l in zip(pred_seq, label_seq):
            if l != -100:
                temp_preds.append(id_label_map[p])
                temp_labels.append(id_label_map[l])
        true_preds.append(temp_preds)
        true_labels.append(temp_labels)
    report = classification_report(true_labels, true_preds, output_dict=True)
    return {
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"]
    }

## Creating Training Arguments

In [24]:
training_args = TrainingArguments(
    output_dir="./output",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    report_to=[],
    num_train_epochs=4,
    weight_decay=0.01,
    bert_dropout_rate = 0.2,
    logging_dir="./logs",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    eval_strategy="steps",
    save_strategy="steps",
)

## Defining The Trainer

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


## Train and Evaluation

In [26]:
trainer.train()
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

Step,Training Loss,Validation Loss,Precision,Recall,F1
500,0.1685,0.058752,0.911016,0.92002,0.915205
1000,0.0547,0.048024,0.919737,0.941404,0.930241
1500,0.0291,0.043323,0.938512,0.952349,0.945324
2000,0.0227,0.049622,0.944864,0.952181,0.948446
2500,0.0128,0.043856,0.947412,0.953696,0.95049
3000,0.0113,0.051002,0.943119,0.953864,0.948436


Step,Training Loss,Validation Loss,Precision,Recall,F1
500,0.1685,0.058752,0.911016,0.92002,0.915205
1000,0.0547,0.048024,0.919737,0.941404,0.930241
1500,0.0291,0.043323,0.938512,0.952349,0.945324
2000,0.0227,0.049622,0.944864,0.952181,0.948446
2500,0.0128,0.043856,0.947412,0.953696,0.95049
3000,0.0113,0.051002,0.943119,0.953864,0.948436
3500,0.0068,0.047513,0.940695,0.953022,0.946803
4000,0.0053,0.050465,0.948189,0.95538,0.951769
4500,0.003,0.05066,0.946417,0.955548,0.950949


Evaluation Results: {'eval_loss': 0.050464510917663574, 'eval_precision': 0.9481892932699343, 'eval_recall': 0.9553796935511029, 'eval_f1': 0.9517690819625958, 'eval_runtime': 82.9272, 'eval_samples_per_second': 41.796, 'eval_steps_per_second': 2.617, 'epoch': 5.0}


In [30]:
# 12. Save Final Model
trainer.save_model("./bert-ner2")
tokenizer.save_pretrained("./bert-ner-model2")

('./bert-ner-model2/tokenizer_config.json',
 './bert-ner-model2/special_tokens_map.json',
 './bert-ner-model2/vocab.txt',
 './bert-ner-model2/added_tokens.json',
 './bert-ner-model2/tokenizer.json')

In [31]:
!zip -r bert-ner2.zip bert-ner2
!zip -r bert-ner-model2.zip bert-ner-model2


  adding: bert-ner2/ (stored 0%)
  adding: bert-ner2/training_args.bin (deflated 52%)
  adding: bert-ner2/model.safetensors (deflated 7%)
  adding: bert-ner2/vocab.txt (deflated 53%)
  adding: bert-ner2/tokenizer.json (deflated 71%)
  adding: bert-ner2/config.json (deflated 56%)
  adding: bert-ner2/special_tokens_map.json (deflated 42%)
  adding: bert-ner2/tokenizer_config.json (deflated 75%)
  adding: bert-ner-model2/ (stored 0%)
  adding: bert-ner-model2/vocab.txt (deflated 53%)
  adding: bert-ner-model2/tokenizer.json (deflated 71%)
  adding: bert-ner-model2/special_tokens_map.json (deflated 42%)
  adding: bert-ner-model2/tokenizer_config.json (deflated 75%)


In [None]:
from google.colab import files
files.download('bert-ner2.zip')