In [6]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from datasets import load_dataset
from seqeval.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

In [7]:
# Load the CoNLL-2003 dataset
dataset = load_dataset("conll2003")
label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)

Downloading data: 100%|██████████| 983k/983k [00:00<00:00, 6.38MB/s]
Generating train split: 100%|██████████| 14041/14041 [00:00<00:00, 17835.96 examples/s]
Generating validation split: 100%|██████████| 3250/3250 [00:00<00:00, 15533.96 examples/s]
Generating test split: 100%|██████████| 3453/3453 [00:00<00:00, 18662.63 examples/s]


In [10]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

In [11]:
def tokenize_and_align_labels(batch):
    tokenized_inputs = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(batch["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map: 100%|██████████| 14041/14041 [00:00<00:00, 23245.80 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 22190.50 examples/s]
Map: 100%|██████████| 3453/3453 [00:00<00:00, 21251.43 examples/s]


In [12]:
def collate_fn(batch):
    input_ids = [torch.tensor(x["input_ids"]) for x in batch]
    attention_mask = [torch.tensor(x["attention_mask"]) for x in batch]
    labels = [torch.tensor(x["labels"]) for x in batch]

    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask_padded = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)

    return {
        "input_ids": input_ids_padded,
        "attention_mask": attention_mask_padded,
        "labels": labels_padded,
    }

train_loader = DataLoader(tokenized_datasets["train"], batch_size=64, collate_fn=collate_fn)
val_loader = DataLoader(tokenized_datasets["validation"], batch_size=64, collate_fn=collate_fn)

In [13]:
class RoBERTaForNER(nn.Module):
    def __init__(self, pretrained_model_name, num_labels, dropout_rate=0.3):
        super(RoBERTaForNER, self).__init__()
        self.roberta = AutoModel.from_pretrained(pretrained_model_name, output_hidden_states=True)
        self.num_hidden_layers = self.roberta.config.num_hidden_layers + 1
        self.layer_weights = nn.Parameter(torch.ones(self.num_hidden_layers))
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = torch.stack(outputs.hidden_states, dim=0)
        weighted_hidden_states = torch.sum(self.layer_weights[:, None, None, None] * hidden_states, dim=0)
        sequence_output = self.dropout(weighted_hidden_states)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            active_loss = attention_mask.view(-1) == 1
            active_logits = logits.view(-1, logits.size(-1))[active_loss]
            active_labels = labels.view(-1)[active_loss]
            loss = loss_fn(active_logits, active_labels)

        return {"loss": loss, "logits": logits}

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RoBERTaForNER(pretrained_model_name="roberta-base", num_labels=num_labels, dropout_rate=0.3).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop
epochs = 2
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs["loss"]
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1} Loss: {total_loss / len(train_loader)}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 220/220 [12:19<00:00,  3.36s/it]


Epoch 1 Loss: 0.6306283933533864


Epoch 2: 100%|██████████| 220/220 [12:04<00:00,  3.29s/it]

Epoch 2 Loss: 0.1495687378078318





In [16]:
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in tqdm(val_loader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs["logits"]
        preds = torch.argmax(logits, dim=-1)

        for pred, label in zip(preds.cpu().numpy(), labels.cpu().numpy()):
            predictions.append([label_list[p] for p, l in zip(pred, label) if l != -100])
            true_labels.append([label_list[l] for p, l in zip(pred, label) if l != -100])

# Compute classification report
print(classification_report(true_labels, predictions))

Evaluating: 100%|██████████| 51/51 [00:41<00:00,  1.23it/s]


              precision    recall  f1-score   support

         LOC       0.96      0.95      0.95      3323
        MISC       0.79      0.84      0.81      1355
         ORG       0.91      0.90      0.91      2672
         PER       0.94      0.97      0.96      3238

   micro avg       0.92      0.93      0.92     10588
   macro avg       0.90      0.92      0.91     10588
weighted avg       0.92      0.93      0.93     10588

