In [1]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from datasets import load_dataset
from seqeval.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the CoNLL-2003 dataset
dataset = load_dataset("conll2003")
label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)

In [3]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

In [4]:
def tokenize_and_align_labels(batch):
    tokenized_inputs = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(batch["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 3250/3250 [00:00<00:00, 11655.47 examples/s]


In [5]:
def collate_fn(batch):
    input_ids = [torch.tensor(x["input_ids"]) for x in batch]
    attention_mask = [torch.tensor(x["attention_mask"]) for x in batch]
    labels = [torch.tensor(x["labels"]) for x in batch]

    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask_padded = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)

    return {
        "input_ids": input_ids_padded,
        "attention_mask": attention_mask_padded,
        "labels": labels_padded,
    }

train_loader = DataLoader(tokenized_datasets["train"], batch_size=64, collate_fn=collate_fn)
val_loader = DataLoader(tokenized_datasets["validation"], batch_size=64, collate_fn=collate_fn)

In [6]:
class RoBERTaForNER(nn.Module):
    def __init__(self, pretrained_model_name, num_labels, dropout_rate=0.3):
        super(RoBERTaForNER, self).__init__()
        self.roberta = AutoModel.from_pretrained(pretrained_model_name, output_hidden_states=True)
        self.num_hidden_layers = self.roberta.config.num_hidden_layers + 1
        self.layer_weights = nn.Parameter(torch.ones(self.num_hidden_layers))
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = torch.stack(outputs.hidden_states, dim=0)
        weighted_hidden_states = torch.sum(self.layer_weights[:, None, None, None] * hidden_states, dim=0)
        sequence_output = self.dropout(weighted_hidden_states)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            active_loss = attention_mask.view(-1) == 1
            active_logits = logits.view(-1, logits.size(-1))[active_loss]
            active_labels = labels.view(-1)[active_loss]
            loss = loss_fn(active_logits, active_labels)

        return {"loss": loss, "logits": logits}

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RoBERTaForNER(pretrained_model_name="roberta-base", num_labels=num_labels, dropout_rate=0.3).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop
epochs = 1
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs["loss"]
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1} Loss: {total_loss / len(train_loader)}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 220/220 [12:09<00:00,  3.32s/it]

Epoch 1 Loss: 0.4667081406501397





In [9]:
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in tqdm(val_loader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs["logits"]
        preds = torch.argmax(logits, dim=-1)

        for pred, label in zip(preds.cpu().numpy(), labels.cpu().numpy()):
            predictions.append([label_list[p] for p, l in zip(pred, label) if l != -100])
            true_labels.append([label_list[l] for p, l in zip(pred, label) if l != -100])

# Compute classification report
print(classification_report(true_labels, predictions))

Evaluating: 100%|██████████| 51/51 [00:33<00:00,  1.51it/s]


              precision    recall  f1-score   support

         LOC       0.88      0.95      0.91      3323
        MISC       0.71      0.74      0.72      1355
         ORG       0.86      0.88      0.87      2672
         PER       0.96      0.95      0.96      3238

   micro avg       0.88      0.91      0.89     10588
   macro avg       0.85      0.88      0.87     10588
weighted avg       0.88      0.91      0.89     10588



In [11]:
def predict_ner_from_dataset(dataset, index, model, tokenizer, label_list):
    """
    Predict NER tags for a sentence from the dataset using the trained model.

    Args:
        dataset (Dataset): The tokenized dataset to pick an example from.
        index (int): Index of the sentence to use from the dataset.
        model (torch.nn.Module): Trained NER model.
        tokenizer (AutoTokenizer): Tokenizer used for tokenizing the sentence.
        label_list (list): List of label names corresponding to NER tags.

    Returns:
        dict: Contains original sentence, tokenized input, and predicted labels.
    """
    # Extract the sentence and true labels from the dataset
    sentence_tokens = dataset["tokens"][index]
    true_labels = dataset["ner_tags"][index]

    # Tokenize the input sentence
    tokenized_inputs = tokenizer(
        sentence_tokens, truncation=True, return_tensors="pt", is_split_into_words=True, add_special_tokens=True
    )
    input_ids = tokenized_inputs["input_ids"].to(device)
    attention_mask = tokenized_inputs["attention_mask"].to(device)

    # Model inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs["logits"]
        predictions = torch.argmax(logits, dim=-1).squeeze().cpu().numpy()

    # Decode tokens and predicted labels
    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().cpu().numpy())
    predicted_labels = [label_list[pred] for pred in predictions]
    true_labels_decoded = [label_list[label] for label in true_labels]

    return {
        "sentence": " ".join(sentence_tokens),
        "tokens": tokens,
        "predicted_labels": predicted_labels,
        "true_labels": true_labels_decoded,
    }


# Initialize RoBERTa model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
model = RoBERTaForNER(pretrained_model_name="roberta-base", num_labels=num_labels, dropout_rate=0.3).to(device)

# Example: Pick a sample sentence from the validation dataset
example_index = 0  # Replace with any valid index
result = predict_ner_from_dataset(dataset["validation"], example_index, model, tokenizer, label_list)

# Display the results
print("Original Sentence:")
print(result["sentence"])
print("\nTokenized Input:")
print(result["tokens"])
print("\nPredicted Labels:")
print(result["predicted_labels"])
print("\nTrue Labels:")
print(result["true_labels"])

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Original Sentence:
CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY .

Tokenized Input:
['<s>', 'ĠCR', 'ICK', 'ET', 'Ġ-', 'ĠLE', 'IC', 'EST', 'ERS', 'HI', 'RE', 'ĠTA', 'KE', 'ĠOVER', 'ĠAT', 'ĠTOP', 'ĠAFTER', 'ĠIN', 'NING', 'S', 'ĠV', 'ICT', 'ORY', 'Ġ.', '</s>']

Predicted Labels:
['O', 'O', 'I-ORG', 'O', 'O', 'O', 'I-MISC', 'I-MISC', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'I-ORG', 'O', 'O', 'O']

True Labels:
['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
