In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=a994e9e5ee34e667709742896b376eb5ec993327218f61f7cf87a3101f4a6b19
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import os
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset, ClassLabel, Sequence
from seqeval.metrics import classification_report
from transformers import TrainerCallback, TrainerControl, TrainerState
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from collections import defaultdict

# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Configuration - edit these values to retarget the experiment.

# Language code of the data to load (Hindi here; e.g. 'ta' for Tamil, 'bn' for Bengali).
LANGUAGE = "hi"

# Pretrained checkpoint to fine-tune.
# Alternative checkpoint: "bert-base-multilingual-cased"
MODEL_NAME = "prajjwal1/bert-tiny"

# Training hyperparameters and the tokenizer's fixed sequence length.
NUM_EPOCHS = 3
BATCH_SIZE = 4
MAX_LENGTH = 128

# Directory the Trainer writes to; one directory per language.
OUTPUT_DIR = f"./ner_model_{LANGUAGE}"

In [None]:
# 1. Load and preprocess Naamapadam dataset
def load_namapadam(language=LANGUAGE, local_path=None):
    """Load the Naamapadam NER data for one language and normalise its schema.

    The Hub dataset ``ai4bharat/naamapadam-{language}`` is tried first. If that
    call raises, the train/validation/test JSON files under ``local_path`` are
    loaded instead.

    Afterwards, in every split, a ``words`` column is renamed to ``tokens`` and a
    ``ner`` column to ``ner_tags`` (only when the target name is not already
    present), and ``ner_tags`` is cast to a ``Sequence(ClassLabel)`` over the
    seven tags O, B-/I-PER, B-/I-ORG, B-/I-LOC.

    Args:
        language: Language code used in the Hub dataset name and in the local
            file names (``{language}_train.json`` etc.).
        local_path: Directory holding the fallback JSON files. Defaults to the
            Google Drive location this notebook was developed against.

    Returns:
        The dataset object returned by ``load_dataset`` after the renames and
        the label cast.
    """
    dataset_name = f"ai4bharat/naamapadam-{language}"
    if local_path is None:
        local_path = f"/content/drive/MyDrive/MTP-1/NammaPadam/{language}_IndicNER_v1.0"

    try:
        # The result must be bound to `dataset`: everything below reads it, so
        # discarding it would make a *successful* online load crash later.
        # NOTE(review): an earlier variant of this call used the repo id
        # "ai4bharat/naamapadam" with `language` as the config name - confirm
        # which identifier is valid on the Hub.
        dataset = load_dataset(dataset_name)
    except Exception as e:
        print(f"Online load failed: {e}")
        print(f"Falling back to local files at: {local_path}")

        data_files = {
            "train": f"{local_path}/{language}_train.json",
            "validation": f"{local_path}/{language}_val.json",
            "test": f"{local_path}/{language}_test.json"
        }

        dataset = load_dataset("json", data_files=data_files)
        print(dataset["train"].column_names)

    # Naamapadam tags
    tag_names = [
        "O",
        "B-PER", "I-PER",
        "B-ORG", "I-ORG",
        "B-LOC", "I-LOC"
    ]

    # Rename 'words' -> 'tokens', 'ner' -> 'ner_tags'
    for split in dataset:
        columns = dataset[split].column_names
        if "tokens" not in columns and "words" in columns:
            dataset[split] = dataset[split].rename_column("words", "tokens")
        if "ner_tags" not in columns and "ner" in columns:
            dataset[split] = dataset[split].rename_column("ner", "ner_tags")

    # Apply label scheme: start from the (renamed) train features and replace
    # only the label column, then cast every split to that schema.
    new_features = dataset["train"].features.copy()
    new_features["ner_tags"] = Sequence(ClassLabel(names=tag_names))
    dataset = dataset.cast(new_features)

    return dataset

In [None]:
# 2. Load and preprocess WikiAnn dataset
def load_wikiann(language=LANGUAGE):
    """Load WikiAnn for ``language`` and normalise ``ner_tags`` to the standard scheme.

    Every tag sequence is converted, in a single pass, to strict IOB2 over the
    entity types PER/ORG/LOC and stored as class ids of
    ``['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']``. Tags of any
    other entity type become ``O``.

    Args:
        language: WikiAnn configuration name (language code).

    Returns:
        The loaded dataset with ``ner_tags`` cast to
        ``Sequence(ClassLabel(names=standard_tags))`` in every split.

    Raises:
        ValueError: If ``load_dataset("wikiann", language)`` fails; the original
            exception is attached as ``__cause__``.
    """
    try:
        dataset = load_dataset("wikiann", language)
    except Exception as e:
        # Chain the cause so network/auth problems are not mistaken for an
        # unsupported language.
        raise ValueError(f"WikiAnn dataset not available for language: {language}") from e

    standard_tags = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
    tag_to_id = {tag: idx for idx, tag in enumerate(standard_tags)}
    entity_types = ("PER", "ORG", "LOC")

    # Get tag ID-to-label mapping from dataset metadata
    id2label = dataset["train"].features["ner_tags"].feature.names

    def to_label(tag):
        """Return the string label for a tag given as a class id or as text."""
        if isinstance(tag, str):
            return tag
        return id2label[int(tag)]

    def to_iob2(ner_seq):
        """Convert one tag sequence to strict IOB2 string labels."""
        new_seq = []
        prev_type = "O"
        for raw_tag in ner_seq:
            tag = to_label(raw_tag)
            # partition() never raises, unlike unpacking split("-") on a tag
            # that contains more than one hyphen.
            prefix, sep, entity_type = tag.partition("-")
            if not sep:
                # Bare type such as "PER" (or "O"): treat as a continuation
                # candidate; it becomes B- below when it starts a new span.
                prefix, entity_type = "I", tag

            if entity_type not in entity_types:
                new_seq.append("O")
                prev_type = "O"
                continue

            # An explicit B- always opens a new entity (adjacent entities of the
            # same type must stay separate); anything else continues the
            # current entity only if the previous token had the same type.
            if prefix != "B" and prev_type == entity_type:
                new_seq.append(f"I-{entity_type}")
            else:
                new_seq.append(f"B-{entity_type}")
            prev_type = entity_type
        return new_seq

    def standardize_tags(batch):
        """Batched map function: rewrite ``ner_tags`` as standard-scheme class ids."""
        # Emit integer ids rather than strings: strings written into a column
        # that already has ClassLabel features may be re-encoded by `datasets`,
        # which made the intermediate representation of a two-pass
        # (ids -> strings -> IOB2) conversion unreliable.
        batch["ner_tags"] = [
            [tag_to_id[tag] for tag in to_iob2(ner_seq)]
            for ner_seq in batch["ner_tags"]
        ]
        return batch

    # Restrict tags to standard format
    features = dataset["train"].features.copy()
    features["ner_tags"] = Sequence(ClassLabel(names=standard_tags))

    # Apply to all splits
    for split in dataset.keys():
        dataset[split] = dataset[split].map(standardize_tags, batched=True)
        dataset[split] = dataset[split].cast(features)

    return dataset

In [None]:
# 3. Tokenize and align labels
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level NER labels to subword tokens.

    Intended for ``Dataset.map(..., batched=True)``: ``examples["tokens"]`` is a
    batch of word lists and ``examples["ner_tags"]`` the matching label-id lists.

    Each sequence is truncated/padded to ``MAX_LENGTH``. Only the first subword
    of every word keeps the word's label; special/padding positions and all
    continuation subwords get ``-100``.

    Args:
        examples: Batch mapping with ``"tokens"`` and ``"ner_tags"`` columns.

    Returns:
        The tokenizer output with an added ``"labels"`` entry (one list of label
        ids per example, same length as the tokenized sequence).
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=MAX_LENGTH,
        padding="max_length"
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token (CLS, SEP, PAD) or a continuation subword.
                # Continuations are masked for every class alike, so O words are
                # not scored on more tokens than entity words.
                label_ids.append(-100)
            else:
                # First subword of a new word carries the word's label.
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

In [None]:
# 4. Load the Naamapadam data and tokenize every split
print("Loading Namapadam dataset...")
namapadam = load_namapadam().map(tokenize_and_align_labels, batched=True)


Loading Namapadam dataset...
Online load failed: Dataset 'ai4bharat/naamapadam-hi' doesn't exist on the Hub or cannot be accessed.
Falling back to local files at: /content/drive/MyDrive/MTP-1/NammaPadam/hi_IndicNER_v1.0


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

['words', 'ner']


Casting the dataset:   0%|          | 0/985787 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/13460 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/867 [00:00<?, ? examples/s]

Map:   0%|          | 0/985787 [00:00<?, ? examples/s]

Map:   0%|          | 0/13460 [00:00<?, ? examples/s]

Map:   0%|          | 0/867 [00:00<?, ? examples/s]

In [None]:
# 4b. Load the WikiAnn data and tokenize every split the same way
print("Loading WikiAnn dataset...")
wikiann = load_wikiann().map(tokenize_and_align_labels, batched=True)

Loading WikiAnn dataset...


README.md: 0.00B [00:00, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/64.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/65.0k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/312k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
class SaveBestModelCallback(TrainerCallback):
    """Trainer callback that saves the model whenever ``eval_f1`` sets a new high."""

    def __init__(self):
        # Start below any finite score so the first reported F1 counts as best.
        self.best_metric = float("-inf")

    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, metrics, **kwargs):
        """Save ``kwargs["model"]`` under ``<output_dir>/best_model`` on a new best ``eval_f1``.

        Does nothing when ``metrics`` has no ``"eval_f1"`` entry or when the
        value does not exceed the best one seen so far.
        """
        if "eval_f1" not in metrics:
            return

        current_f1 = metrics["eval_f1"]
        if not current_f1 > self.best_metric:
            return

        self.best_metric = current_f1
        print(f"New best model with F1 = {self.best_metric:.4f}. Saving model...")
        kwargs["model"].save_pretrained(args.output_dir + "/best_model")


In [None]:
# 5. Training setup
# The label inventory comes from the training split; it sizes the classification
# head and provides both directions of the id <-> label mapping.
_ner_label_names = namapadam["train"].features["ner_tags"].feature.names

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(_ner_label_names),
    id2label=dict(enumerate(_ner_label_names)),
    label2id={name: idx for idx, name in enumerate(_ner_label_names)},
)
model = model.to(device)

# Trainer hyperparameters. Evaluation is scheduled once per epoch: without an
# evaluation strategy the Trainer never evaluates during training, so the
# eval dataset, compute_metrics and the best-model callback would never run.
# NOTE(review): older transformers releases spell this `evaluation_strategy` -
# confirm against the installed version.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_dir='./logs',
    logging_steps=5000,
    save_total_limit=1,
    report_to="none",
)


pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 6. Compute metrics
def compute_metrics(p):
    """Compute weighted precision/recall/F1 with seqeval for the Trainer.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` holds per-token class
            scores (argmax is taken over axis 2) and ``labels`` the gold label
            ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with ``"precision"``, ``"recall"`` and ``"f1"`` taken from the
        ``"weighted avg"`` row of seqeval's classification report.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Resolve the label inventory once instead of once per token.
    label_names = namapadam["train"].features["ner_tags"].feature.names

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predictions, labels):
        # Keep only positions that carry a real label.
        kept = [
            (pred_id, label_id)
            for pred_id, label_id in zip(pred_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([label_names[pred_id] for pred_id, _ in kept])
        true_labels.append([label_names[label_id] for _, label_id in kept])

    report = classification_report(true_labels, true_predictions, output_dict=True)
    return {
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"],
    }


# Use smaller subsets for faster experimentation.
# The sizes are clamped to the split length so that a language with fewer
# examples than the requested subset does not make select() fail.
small_train_dataset = namapadam["train"].select(
    range(min(50000, len(namapadam["train"])))
)
small_val_dataset = namapadam["validation"].select(
    range(min(5000, len(namapadam["validation"])))
)

# NOTE(review): the warning recorded under this cell points at this call -
# presumably about the `tokenizer` argument; confirm against the installed
# transformers version before renaming it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks=[SaveBestModelCallback()]  # saves the model on each new best eval_f1
)

  trainer = Trainer(


In [None]:
# 7. Train the model
print("Training on Namapadam dataset...")
# Runs training with the `trainer` configured above. As the last expression in
# the cell, the value returned by train() is shown as the cell output.
trainer.train()

Training on Namapadam dataset...


model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Step,Training Loss
5000,0.3009
10000,0.2574
15000,0.2465
20000,0.2389
25000,0.2351
30000,0.2317
35000,0.2308


TrainOutput(global_step=37500, training_loss=0.2474779443359375, metrics={'train_runtime': 5698.0421, 'train_samples_per_second': 26.325, 'train_steps_per_second': 6.581, 'total_flos': 45815385600000.0, 'train_loss': 0.2474779443359375, 'epoch': 3.0})

In [None]:
# 8. Evaluate on WikiAnn test set
print("Evaluating on WikiAnn test set...")

def evaluate_on_wikiann(model, dataset):
    """Evaluate the model on a tokenized dataset and collect error statistics.

    Args:
        model: Token-classification model; called with the batch tensors and
            expected to return an object with a ``logits`` attribute.
        dataset: Tokenized dataset providing ``labels`` plus the model input
            columns (as produced by ``tokenize_and_align_labels``).

    Returns:
        Tuple ``(report, confusion_matrix, sorted_errors)``:
        ``report`` is seqeval's classification report as a dict;
        ``confusion_matrix[true_tag][pred_tag]`` is a token-level count;
        ``sorted_errors`` is a list of ``(true_tag, error_rate)`` pairs sorted
        by error rate, highest first.
    """
    # Batch only the fixed-length columns, as tensors. The dataset also holds
    # variable-length columns (e.g. the raw word lists) that the default collate
    # function cannot stack ("each element in list of batch should be of equal
    # size").
    tensor_columns = [
        name
        for name in ("input_ids", "token_type_ids", "attention_mask", "labels")
        if name in dataset.column_names
    ]
    dataloader = DataLoader(
        dataset.with_format("torch", columns=tensor_columns),
        batch_size=BATCH_SIZE,
    )

    # Resolve the label inventory once instead of once per token.
    label_names = namapadam["train"].features["ner_tags"].feature.names

    model.eval()
    all_predictions = []
    all_labels = []

    for batch in tqdm(dataloader):
        with torch.no_grad():
            # Labels are kept out of the forward pass; only logits are needed.
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            outputs = model(**inputs)

        predictions = torch.argmax(outputs.logits, dim=2).cpu().numpy()
        labels = batch["labels"].numpy()

        for pred_row, label_row in zip(predictions, labels):
            keep = label_row != -100  # Ignore special tokens / masked subwords
            all_predictions.append(pred_row[keep].tolist())
            all_labels.append(label_row[keep].tolist())

    # Convert to tag names
    true_predictions = [[label_names[p] for p in pred] for pred in all_predictions]
    true_labels = [[label_names[l] for l in label] for label in all_labels]

    # Get classification report
    report = classification_report(true_labels, true_predictions, output_dict=True)

    # Error analysis - token-level counts of (true tag -> predicted tag)
    confusion_matrix = defaultdict(lambda: defaultdict(int))
    for true_seq, pred_seq in zip(true_labels, true_predictions):
        for t, p in zip(true_seq, pred_seq):
            confusion_matrix[t][p] += 1

    # Per-tag error rate: share of tokens with that true tag predicted as
    # anything else. `total` is never 0 because a tag only appears here once
    # it has been counted.
    error_rates = {}
    for true_tag, predicted_counts in confusion_matrix.items():
        total = sum(predicted_counts.values())
        correct = predicted_counts.get(true_tag, 0)
        error_rates[true_tag] = (total - correct) / total

    # Sort by error rate
    sorted_errors = sorted(error_rates.items(), key=lambda x: x[1], reverse=True)

    return report, confusion_matrix, sorted_errors

Evaluating on WikiAnn test set...


In [None]:


# Run the evaluation on the WikiAnn test split
wikiann_test = wikiann["test"]
report, confusion_matrix, error_rates = evaluate_on_wikiann(model, wikiann_test)

# 9. Print the support-weighted averages from the report
weighted_avg = report['weighted avg']
print("\nResults on WikiAnn test set:")
print(f"Precision: {weighted_avg['precision']:.3f}")
print(f"Recall: {weighted_avg['recall']:.3f}")
print(f"F1-score: {weighted_avg['f1-score']:.3f}\n")



  0%|          | 0/250 [00:00<?, ?it/s]


RuntimeError: each element in list of batch should be of equal size

In [None]:
# Per-entity-type scores; the aggregate rows are skipped.
print("Per-class performance:")
for label, scores in report.items():
    if label in ("micro avg", "macro avg", "weighted avg"):
        continue
    print(f"{label}:")
    print(f"  Precision: {scores['precision']:.3f}")
    print(f"  Recall: {scores['recall']:.3f}")
    print(f"  F1: {scores['f1-score']:.3f}")



In [None]:
# For the five tags with the highest error rate, list the three wrong
# predictions that occur most often.
print("\nTop 5 most frequently confused tags:")
for tag, error_rate in error_rates[:5]:
    print(f"{tag}: {error_rate:.1%} error rate")
    print("  Most common errors:")
    wrong_predictions = sorted(
        ((pred, count) for pred, count in confusion_matrix[tag].items() if pred != tag),
        key=lambda item: item[1],
        reverse=True,
    )
    for pred, count in wrong_predictions[:3]:
        print(f"    - Predicted as {pred}: {count} times")