In [None]:
! pip install transformers datasets torch seqeval

# Load and Prepare the Dataset

In [None]:
from datasets import load_dataset

# Load the CoNLL-2003 dataset from the "eriktks/conll2003" Hub repository.
# NOTE(review): trust_remote_code=True allows code shipped with the dataset
# repository to run locally — confirm the source is trusted.
dataset = load_dataset("eriktks/conll2003", trust_remote_code=True)

# Show the loaded dataset object (splits and their columns).
print(dataset)


# Load a Pre-trained Tokenizer and Align Labels


In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the "bert-base-uncased" checkpoint used for the model below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tag names are read from the training split's "ner_tags" feature; the
# position of a name in this list is the integer id stored in the dataset.
label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and project word-level tags onto tokens.

    Args:
        examples: A batch with a "tokens" entry (one list of words per
            sentence) and an "ner_tags" entry (one list of tag ids per
            sentence, parallel to the words).

    Returns:
        The tokenizer output for the batch with an added "labels" entry.
        For each sentence, the first token of every word carries that word's
        tag id; special tokens and the remaining tokens of a word carry -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=128
    )

    aligned = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        word_ids = encoded.word_ids(batch_index=batch_idx)
        # Pair every position with the word id of the position before it
        # (None for the very first position) to detect word starts.
        preceding = [None] + word_ids[:-1]
        aligned.append([
            word_tags[current]
            if current is not None and current != before
            else -100
            for current, before in zip(word_ids, preceding)
        ])

    encoded["labels"] = aligned
    return encoded

# Apply the tokenization/label alignment to the dataset in batches
# (batched=True passes a batch of examples per call).
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)


# Load the Pre-trained BERT Model for Token Classification

In [None]:
from transformers import AutoModelForTokenClassification

# Load BERT with a token-classification head sized to the number of NER tags.
# The id<->label mappings are stored in the model config so that saved
# checkpoints report real tag names instead of generic LABEL_<n> placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    id2label=dict(enumerate(label_list)),
    label2id={name: idx for idx, name in enumerate(label_list)},
)


# Create DataLoaders

In [7]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForTokenClassification

# Pads inputs and labels of each batch to a common length.
data_collator = DataCollatorForTokenClassification(tokenizer)

# A plain DataLoader (unlike Trainer) does not drop the raw dataset columns
# (word strings, POS/chunk tags, ...), and the collator cannot turn those into
# tensors. Remove every column of the original dataset so that only the
# tokenizer outputs and the aligned "labels" remain.
train_dataloader = DataLoader(
    tokenized_datasets["train"].remove_columns(dataset["train"].column_names),
    collate_fn=data_collator,
    batch_size=16,
    shuffle=True,
)
val_dataloader = DataLoader(
    tokenized_datasets["validation"].remove_columns(dataset["validation"].column_names),
    collate_fn=data_collator,
    batch_size=16,
)


In [None]:
# small_train_dataset = tokenized_datasets["train"].select(range(100))
# small_val_dataset = tokenized_datasets["validation"].select(range(100))

# Define Training Arguments and Trainer

In [None]:
from transformers import TrainingArguments, Trainer
from seqeval.metrics import classification_report

import inspect

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old name is no
# longer accepted there. Pick whichever name the installed version supports
# so this cell runs on both old and new releases.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Evaluate and checkpoint once per epoch; 3 epochs at lr 5e-5, batch size 16.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    **{_eval_strategy_arg: "epoch"},
)

from seqeval.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support


def compute_metrics(p):
    """Compute seqeval entity-level metrics plus token accuracy.

    Args:
        p: Evaluation output handed over by the Trainer. `p.predictions`
            holds the per-token class scores and `p.label_ids` the gold tag
            ids, with -100 marking positions that must be ignored.

    Returns:
        A flat dict with "<name>_precision", "<name>_recall" and "<name>_f1"
        for every row of the seqeval classification report, and "accuracy"
        (fraction of non-ignored tokens whose predicted tag id matches).
    """
    predictions = p.predictions.argmax(axis=-1)
    true_labels = p.label_ids

    # Positions labelled -100 (special tokens, word continuations, padding)
    # carry no gold tag and are excluded from every metric.
    mask = true_labels != -100

    # Keep one tag sequence per sentence: seqeval extracts entity spans per
    # sequence, so flattening everything into a single list could merge
    # entities across sentence boundaries.
    true_sequences = []
    pred_sequences = []
    for pred_row, label_row, keep in zip(predictions, true_labels, mask):
        true_sequences.append([label_list[idx] for idx in label_row[keep]])
        pred_sequences.append([label_list[idx] for idx in pred_row[keep]])

    report = classification_report(true_sequences, pred_sequences, output_dict=True)

    # Only dict-valued rows carry precision/recall/f1; skip scalar entries.
    rows = {k: v for k, v in report.items() if isinstance(v, dict)}
    metrics = {f"{k}_precision": v["precision"] for k, v in rows.items()}
    metrics.update({f"{k}_recall": v["recall"] for k, v in rows.items()})
    metrics.update({f"{k}_f1": v["f1-score"] for k, v in rows.items()})

    if "accuracy" in report:
        metrics["accuracy"] = report["accuracy"]
    elif mask.any():
        # The report has no accuracy row here, so compute it directly.
        metrics["accuracy"] = float((predictions[mask] == true_labels[mask]).mean())

    return metrics

import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Use whichever keyword the installed
# Trainer accepts so the cell works across versions.
_tokenizer_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_arg: tokenizer},
)

# Fine-tune the model; evaluation on the validation split follows the
# schedule configured in `training_args`.
trainer.train()



# Evaluate the Model

In [None]:
# Evaluate the trained model on the test split instead of the validation
# split the Trainer was constructed with, and print the resulting metrics.
results = trainer.evaluate(tokenized_datasets["test"])
print(results)


# Make Predictions on New Text

In [None]:
import torch

def predict(text, model, tokenizer):
    """Tag a raw sentence and return one (token, label) pair per model token.

    Args:
        text: Input sentence; it is split on whitespace into words.
        model: Token-classification model used for inference.
        tokenizer: Tokenizer matching `model`.

    Returns:
        A list of (token, label) tuples covering every token the tokenizer
        produced (including special tokens), with labels looked up in
        `label_list`.
    """
    tokens = tokenizer(text.split(), is_split_into_words=True, return_tensors="pt", truncation=True)
    # After training the model may live on an accelerator while the encoded
    # inputs are created on CPU; move them to the model's device.
    tokens = tokens.to(model.device)

    # Make sure dropout is disabled regardless of the model's current mode.
    model.eval()
    with torch.no_grad():
        outputs = model(**tokens)

    # Index the single batch element explicitly instead of squeeze(), which
    # would also drop any other dimension that happens to have size 1.
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
    tokenized_words = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0].tolist())
    labels = [label_list[label] for label in predictions]
    return list(zip(tokenized_words, labels))

# Example sentence to tag with the fine-tuned model.
text = "Barack Obama was born in Hawaii."
print(predict(text, model, tokenizer))
