Install necessary dependencies

In [1]:
!pip install --upgrade transformers datasets peft
!pip install transformers datasets seqeval accelerate



Parsing NER Labeled data into Dataset

In [2]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer

# Step 1: Parse CoNLL file into tokens and labels
def parse_conll_file(filepath):
    all_tokens = []
    all_labels = []
    tokens = []
    labels = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    all_tokens.append(tokens)
                    all_labels.append(labels)
                    tokens = []
                    labels = []
            else:
                parts = line.split()
                if len(parts) == 2:
                    token, label = parts
                    tokens.append(token)
                    labels.append(label)

        if tokens:
            all_tokens.append(tokens)
            all_labels.append(labels)

    return all_tokens, all_labels

# Step 2: Load and preprocess
tokens, labels = parse_conll_file("/content/ner_auto_labels.conll")

# Convert tokens (list of tokens) to strings (sentences)
texts = [" ".join(toks) for toks in tokens]

# Create pandas DataFrame
df = pd.DataFrame({
    "text": texts,
    "labels": labels
})

# Step 3: Create Huggingface dataset from pandas dataframe
dataset = Dataset.from_pandas(df)
print(dataset)

# Step 4: Load tokenizer
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Step 5: Define tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=128)

# Step 6: Tokenize dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

print(tokenized_dataset)

Dataset({
    features: ['text', 'labels'],
    num_rows: 99
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 99
})


In [3]:
tokens, labels = parse_conll_file("/content/ner_auto_labels.conll")
print(f"Sentences parsed: {len(tokens)}")
print("Sample tokens:", tokens[0])
print("Sample labels:", labels[0])

Sentences parsed: 99
Sample tokens: ['በኤሌትሪክ', 'የሚሰራ', 'ምጣድ', 'ለፈጢራ', 'ለቂጣ', 'ለአምባሻና', 'ለተለያዩ', 'ነገሮች', 'መጋገሪያነት', 'የሚያገለግል', 'በተጨማሪም', 'ቡና', 'ለመቁላት', 'የሚመች', 'በአነስተኛ', 'የኤሌትሪክ', 'ሀይል', 'የሚሰራ', 'እጄታዉ', 'የማያቃጥል', 'ማቴሪያል', 'ሶኬቱ', 'ላይ', 'የሀይል', 'መቆጣጠሪያ', 'ፊዉዝ', 'የተገጠመለት', 'አድራሻ', 'መገናኛ', 'ስሪ', 'ኤም', 'ሲቲ', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ', 'ሊፍቱ', 'ፊት', 'ለ', 'ፊት', '0909522840', '0923350054', 'ለማዘዝ', 'ይጠቀሙ', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን']
Sample labels: ['B-PRODUCT', 'I-PRODUCT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Parameter-Efficient Fine-Tuning (PEFT)

In [4]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType

# Step 1: Choose your model checkpoint
model_checkpoint = "xlm-roberta-base"  # Or "distilbert-base-uncased" if preferred
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Step 2: Clean labels to avoid whitespace issues
clean_labels = [[label.strip() for label in label_seq] for label_seq in labels]

# Step 3: Create label2id and id2label mappings
unique_labels = sorted(set(label for seq in clean_labels for label in seq))
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Step 4: Load base model with label info
base_model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)

# Step 5: Configure LoRA for token classification
lora_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,       # Token classification (NER)
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "key", "value", "output.dense"]  # Specific to XLM-Roberta
)

# Step 6: Wrap the base model with PEFT/LoRA
peft_model = get_peft_model(base_model, lora_config)

# Step 7: Confirm trainable parameters
trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
all_params = sum(p.numel() for p in peft_model.parameters())
print(f"Trainable params: {trainable_params} | All params: {all_params} | Trainable%: {100 * trainable_params/all_params:.2f}%")


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 963847 | All params: 278422286 | Trainable%: 0.35%


In [5]:
!pip install evaluate



Training Loop Using the Trainer API

In [6]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# Data collator to handle padding dynamically
data_collator = DataCollatorForTokenClassification(tokenizer)

# Compute metrics function for token classification (NER)
def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(pred_row, label_row) if l != -100]
        for pred_row, label_row in zip(preds, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred_row, label_row) if l != -100]
        for pred_row, label_row in zip(preds, labels)
    ]

    results = precision_recall_fscore_support(
        [lab for row in true_labels for lab in row],
        [pred for row in true_predictions for pred in row],
        average="weighted"
    )
    return {
        "precision": results[0],
        "recall": results[1],
        "f1": results[2]
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./lora-xlmr-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1,
    report_to="none"
)

# Initialize Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,  # from your dataset train-test split
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

# Save best model
trainer.save_model("./lora-xlmr-ner-best")


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [7]:
import transformers
print(transformers.__version__)

4.52.4
