In [1]:
!pip install transformers datasets seqeval sentencepiece




In [2]:
# cell 2: Labels (BIO)
BASE_LABELS = ["AADHAAR","PAN","PHONE","EMAIL","PIN","PERSON","ORG","LOC","DATE"]
# build BIO label set
BIO_LABELS = ["O"] + [f"B-{l}" for l in BASE_LABELS] + [f"I-{l}" for l in BASE_LABELS]
label2id = {l:i for i,l in enumerate(BIO_LABELS)}
id2label = {i:l for l,i in label2id.items()}

print("BIO labels count:", len(BIO_LABELS))
print("Example labels:", BIO_LABELS[:8])



BIO labels count: 19
Example labels: ['O', 'B-AADHAAR', 'B-PAN', 'B-PHONE', 'B-EMAIL', 'B-PIN', 'B-PERSON', 'B-ORG']


In [3]:
# cell 3
# Preferred model (may be gated) and a public fallback. Use fallback to avoid 401 errors.
PREFERRED = "ai4bharat/indic-bert"   # try later if you have access/token
FALLBACK = "xlm-roberta-base"        # public, works for the pipeline test

# Try to load preferred; if not accessible, fallback automatically.
model_name = None
try:
    # Try preferred without token first â€” will fail on gated repos
    AutoTokenizer.from_pretrained(PREFERRED)
    model_name = PREFERRED
    print("Using preferred model:", PREFERRED)
except Exception as e:
    model_name = FALLBACK
    print("Falling back to public model:", FALLBACK, "\nReason:", str(e))

# set model_name variable used downstream
model_name



Falling back to public model: xlm-roberta-base 
Reason: name 'AutoTokenizer' is not defined


'xlm-roberta-base'

In [4]:
# cell 4: load gold -> word tokens + BIO labels
import os, json, re

GOLD_PATH = "gold.json"
RTIS_DIR = "rtis"

def compute_word_spans(text, words):
    # compute start/end char index for each word occurrence using a cursor
    idxs = []
    cur = 0
    for w in words:
        start = text.find(w, cur)
        if start == -1:
            # fallback: advance cursor until we find plausible match (rare)
            start = cur
        idxs.append((start, start + len(w)))
        cur = start + len(w)
    return idxs

def span_label_tokens(word_idxs, spans):
    # spans: list of dicts with start,end,label
    labels = ["O"] * len(word_idxs)
    for s in spans:
        s0, s1, lab = s["start"], s["end"], s["label"]
        # clamp label name mapping
        lab = lab.upper()
        for i,(ws,we) in enumerate(word_idxs):
            # if word intersects span
            if not (we <= s0 or ws >= s1):
                labels[i] = lab
    # convert to BIO: contiguous tokens of same label -> B- / I-
    bio = []
    prev = "O"
    for lab in labels:
        if lab == "O":
            bio.append("O")
            prev = "O"
        else:
            if prev != lab:
                bio.append(f"B-{lab}")
            else:
                bio.append(f"I-{lab}")
            prev = lab
    return bio

def load_gold_as_examples(gold_path=GOLD_PATH, rtis_folder=RTIS_DIR):
    gold = json.load(open(gold_path, encoding='utf-8'))
    examples = []
    for fname, spans in gold.items():
        fpath = os.path.join(rtis_folder, fname)
        if not os.path.exists(fpath):
            print("Warning: missing", fpath)
            continue
        txt = open(fpath, encoding='utf-8').read()
        # simple whitespace split into words (keeps punctuation attached to words)
        words = txt.split()
        word_idxs = compute_word_spans(txt, words)
        labels = span_label_tokens(word_idxs, spans)
        examples.append({"words": words, "labels": labels, "fname": fname, "text": txt, "word_idxs": word_idxs})
    return examples

examples = load_gold_as_examples()
print(f"Loaded {len(examples)} examples")
examples[:1]


Loaded 50 examples


[{'words': ['To,',
   'The',
   'Public',
   'Information',
   'Officer,',
   'Department',
   'of',
   'Transport,',
   'Uttar',
   'Pradesh.',
   'Subject:',
   'Information',
   'under',
   'RTI',
   'Act,',
   '2005',
   'regarding',
   'driving',
   'licence',
   'issuance.',
   'Sir/Madam,',
   'Please',
   'provide',
   'the',
   'number',
   'of',
   'driving',
   'licences',
   'issued',
   'in',
   'Lucknow',
   'district',
   'between',
   'January',
   'and',
   'March',
   '2024.',
   'Also',
   'specify',
   'the',
   'average',
   'processing',
   'time',
   'and',
   'whether',
   'the',
   'department',
   'has',
   'any',
   'online',
   'grievance',
   'redressal',
   'mechanism.',
   'Applicant:',
   'Ritika',
   'Singh',
   'S/o',
   'Ramesh',
   'Kumar',
   'Singh',
   'R/o',
   'C-45,',
   'Gomti',
   'Nagar,',
   'Lucknow',
   '-',
   '226010',
   'Phone:',
   '9936521874',
   'Email:',
   'ritika.singh25@gmail.com',
   'Aadhaar:',
   '4589',
   '1234',
   '7789

In [5]:
# cell 5: tokenizer + align labels to tokens
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def tokenize_and_align_words(example, max_length=512):
    # example: {"words": [...], "labels": [...], ...}
    tokenized = tokenizer(example["words"], is_split_into_words=True,
                          truncation=True, padding="max_length", max_length=max_length, return_tensors=None)
    word_ids = tokenized.word_ids()
    aligned_labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)
        elif word_idx != previous_word_idx:
            # label for the first token of the word
            lab = example["labels"][word_idx]
            aligned_labels.append(label2id.get(lab, label2id["O"]))
        else:
            # subsequent subtokens
            aligned_labels.append(-100)
        previous_word_idx = word_idx
    tokenized["labels"] = aligned_labels
    tokenized["fname"] = example["fname"]
    return tokenized

# Build dataset list (tokenized)
tokenized_list = [tokenize_and_align_words(ex) for ex in examples]
from datasets import Dataset, DatasetDict
dataset = Dataset.from_list(tokenized_list)
if len(dataset) > 5:
    dset = dataset.train_test_split(test_size=0.2, seed=42)
    dataset = DatasetDict({"train": dset["train"], "test": dset["test"]})
else:
    dataset = DatasetDict({"train": dataset, "test": dataset})

print(dataset)


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'fname'],
        num_rows: 40
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'fname'],
        num_rows: 10
    })
})


In [6]:
# cell 6: metric & model loading
from evaluate import load as load_metric
metric = load_metric("seqeval")

def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    true_labels = []
    true_preds = []
    for p, l in zip(preds, label_ids):
        pl = []
        tl = []
        for pred_i, lab_i in zip(p, l):
            if lab_i != -100:
                pl.append(id2label[pred_i])
                tl.append(id2label[lab_i])
        true_preds.append(pl)
        true_labels.append(tl)
    return true_preds, true_labels

def compute_metrics(eval_pred):
    logits, label_ids = eval_pred
    preds_list, labels_list = align_predictions(logits, label_ids)
    # seqeval expects BIO labels like B-PERSON etc.
    results = metric.compute(predictions=preds_list, references=labels_list)
    overall = {
        "precision": results.get("overall_precision", 0.0),
        "recall": results.get("overall_recall", 0.0),
        "f1": results.get("overall_f1", 0.0)
    }
    return overall

print("Metric ready: seqeval")

import torch
from transformers import AutoModelForTokenClassification

print("torch:", torch.__version__, "cuda available:", torch.cuda.is_available())

# load model (if already loaded in session skip)
try:
    model
    print("model already defined")
except NameError:
    print("Loading model:", model_name)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(BIO_LABELS),
        id2label=id2label,
        label2id=label2id
    )

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print("Model loaded and moved to", device)


Metric ready: seqeval
torch: 2.9.1+cpu cuda available: False
Loading model: xlm-roberta-base


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and moved to cpu


In [None]:
# cell 7: training arguments + trainer
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

args = TrainingArguments(
    output_dir="rti_model_run",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,          # increased
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1"   # depends on compute_metrics returning f1; Trainer expects that key
)

print("Trainer args ok. dataset sizes -> train:", len(dataset["train"]), " eval:", len(dataset["test"]))
print("Model device:", next(model.parameters()).device)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


  trainer = Trainer(


Trainer args ok. dataset sizes -> train: 40  eval: 10
Model device: cpu




Epoch,Training Loss,Validation Loss


In [None]:
# cell 7 (compat for transformers 4.57.1)
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoModelForTokenClassification

# model was loaded earlier as `model`
data_collator = DataCollatorForTokenClassification(tokenizer)

# Use the argument names that match your transformers version (eval_strategy, save_strategy, logging_strategy)
args = TrainingArguments(
    output_dir="rti_model_run",
    eval_strategy="epoch",        # older name in this transformers version
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=10,
)

print("Trainer args ok. dataset sizes -> train:", len(dataset["train"]), " eval:", len(dataset["test"]))
print("Model device:", next(model.parameters()).device)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()




Trainer args ok. dataset sizes -> train: 3  eval: 3
Model device: cpu


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.894982,0.034483,0.090909,0.05


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1, training_loss=2.1222705841064453, metrics={'train_runtime': 452.4232, 'train_samples_per_second': 0.007, 'train_steps_per_second': 0.002, 'total_flos': 783946967040.0, 'train_loss': 2.1222705841064453, 'epoch': 1.0})

In [None]:
# cell 8
OUTDIR = "trained_rti_model"
trainer.save_model(OUTDIR)
tokenizer.save_pretrained(OUTDIR)
print("Saved model to", OUTDIR)


Saved model to trained_rti_model


In [None]:
# hf inference to preds JSON (run after saving model)
from transformers import pipeline
import unicodedata
from pathlib import Path
import json, torch

MODEL_DIR = "trained_rti_model"
device = 0 if torch.cuda.is_available() else -1

nlp_pipe = pipeline(
    "token-classification",
    model=MODEL_DIR,
    tokenizer=MODEL_DIR,
    aggregation_strategy="simple",
    device=device
)

def normalize_text(s):
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("\u200c","").replace("\u200d","").replace("\ufeff","")
    s = s.replace("\r\n","\n").replace("\r","\n")
    return s

preds = {}
for p in sorted(Path("rtis").glob("*.txt")):
    raw = p.read_text(encoding="utf-8", errors="replace")
    text = normalize_text(raw)
    ents = nlp_pipe(text)
    spans = []
    for e in ents:
        lab = e.get("entity_group", e.get("label"))
        # normalize label to base label names
        if lab is None:
            continue
        if lab.startswith("B-") or lab.startswith("I-"):
            lab = lab.split("-",1)[1]
        lab = lab.upper()
        # map LOC/GPE to ADDRESS if needed
        if lab in ("LOC","GPE"):
            lab = "ADDRESS"
        spans.append({"start": int(e["start"]), "end": int(e["end"]), "label": lab, "text": text[int(e["start"]):int(e["end"])]})
    preds[p.name] = spans

Path("preds_from_xlmr.json").write_text(json.dumps(preds, ensure_ascii=False, indent=2), encoding="utf-8")
print("Wrote preds_from_xlmr.json")


Device set to use cpu


Wrote preds_finetuned.json with entries for 3


{'sample1.txt': [{'start': 0, 'end': 46, 'label': 'DATE'},
  {'start': 50, 'end': 60, 'label': 'DATE'},
  {'start': 67, 'end': 75, 'label': 'DATE'},
  {'start': 84, 'end': 107, 'label': 'DATE'},
  {'start': 111, 'end': 143, 'label': 'DATE'},
  {'start': 148, 'end': 165, 'label': 'DATE'},
  {'start': 173, 'end': 211, 'label': 'DATE'},
  {'start': 217, 'end': 218, 'label': 'DATE'},
  {'start': 225, 'end': 384, 'label': 'DATE'},
  {'start': 389, 'end': 396, 'label': 'DATE'},
  {'start': 399, 'end': 432, 'label': 'DATE'},
  {'start': 434, 'end': 445, 'label': 'DATE'},
  {'start': 446, 'end': 455, 'label': 'DATE'},
  {'start': 459, 'end': 546, 'label': 'DATE'}],
 'sample2.txt': [{'start': 0, 'end': 46, 'label': 'DATE'},
  {'start': 50, 'end': 79, 'label': 'DATE'},
  {'start': 83, 'end': 89, 'label': 'DATE'},
  {'start': 98, 'end': 99, 'label': 'DATE'},
  {'start': 102, 'end': 139, 'label': 'DATE'},
  {'start': 148, 'end': 167, 'label': 'DATE'},
  {'start': 178, 'end': 185, 'label': 'DATE'},