In [1]:
!pip install transformers datasets seqeval sentencepiece





[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# cell 2
import os, json, numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification,
    TrainingArguments, Trainer, DataCollatorForTokenClassification,
    pipeline
)
from evaluate import load as load_metric   # from the 'evaluate' package
# Labels used in our project. Keep consistent with eval_script / gold.json
LABELS = ["O","AADHAAR","PAN","PHONE","EMAIL","PIN","PERSON","ORG","LOC","DATE"]
label2id = {l:i for i,l in enumerate(LABELS)}
id2label = {i:l for l,i in label2id.items()}

print("Labels:", LABELS)



  from .autonotebook import tqdm as notebook_tqdm


Labels: ['O', 'AADHAAR', 'PAN', 'PHONE', 'EMAIL', 'PIN', 'PERSON', 'ORG', 'LOC', 'DATE']


In [3]:
# cell 3
# Preferred model (may be gated) and a public fallback. Use fallback to avoid 401 errors.
PREFERRED = "ai4bharat/indic-bert"   # try later if you have access/token
FALLBACK = "xlm-roberta-base"        # public, works for the pipeline test

# Try to load preferred; if not accessible, fallback automatically.
model_name = None
try:
    # Try preferred without token first — will fail on gated repos
    AutoTokenizer.from_pretrained(PREFERRED)
    model_name = PREFERRED
    print("Using preferred model:", PREFERRED)
except Exception as e:
    model_name = FALLBACK
    print("Falling back to public model:", FALLBACK, "\nReason:", str(e))

# set model_name variable used downstream
model_name



Falling back to public model: xlm-roberta-base 
Reason: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/ai4bharat/indic-bert.
401 Client Error. (Request ID: Root=1-6914e5f0-4ef1b0103c47c72d62711387;f5745aa3-c4b9-4462-ae0a-3a7fd61147f6)

Cannot access gated repo for url https://huggingface.co/ai4bharat/indic-bert/resolve/main/config.json.
Access to model ai4bharat/indic-bert is restricted. You must have access to it and be authenticated to access it. Please log in.


'xlm-roberta-base'

In [8]:
# cell 4
# Reads gold.json and rtis/ files, converts to word-level examples.
GOLD_PATH = "gold.json"
RTIS_DIR = "rtis"

def load_gold_as_examples(gold_path=GOLD_PATH, rtis_folder=RTIS_DIR):
    gold = json.load(open(gold_path, encoding='utf-8'))
    examples = []
    for fname, spans in gold.items():
        fpath = os.path.join(rtis_folder, fname)
        if not os.path.exists(fpath):
            print("Warning: missing", fpath)
            continue
        txt = open(fpath, encoding='utf-8').read()
        # simple whitespace split into words (keeps punctuation)
        words = txt.split()
        # compute start/end char of each word
        idxs = []
        cur = 0
        for w in words:
            start = txt.find(w, cur)
            if start == -1:
                # fallback: try to move cursor forward
                start = cur
            idxs.append((start, start+len(w)))
            cur = start + len(w)
        labels = ["O"] * len(words)
        for s in spans:
            s0, s1, lab = s["start"], s["end"], s["label"]
            for i,(ws,we) in enumerate(idxs):
                if not (we <= s0 or ws >= s1):
                    labels[i] = lab
        examples.append({"words": words, "labels": labels, "fname": fname, "text": txt})
    return examples

examples = load_gold_as_examples()
print(f"Loaded {len(examples)} examples")
examples[:1]



Loaded 3 examples


[{'words': ['To,',
   'The',
   'Public',
   'Information',
   'Officer,',
   'Department',
   'of',
   'Transport,',
   'Uttar',
   'Pradesh.',
   'Subject:',
   'Information',
   'under',
   'RTI',
   'Act,',
   '2005',
   'regarding',
   'driving',
   'licence',
   'issuance.',
   'Sir/Madam,',
   'Please',
   'provide',
   'the',
   'number',
   'of',
   'driving',
   'licences',
   'issued',
   'in',
   'Lucknow',
   'district',
   'between',
   'January',
   'and',
   'March',
   '2024.',
   'Also',
   'specify',
   'the',
   'average',
   'processing',
   'time',
   'and',
   'whether',
   'the',
   'department',
   'has',
   'any',
   'online',
   'grievance',
   'redressal',
   'mechanism.',
   'Applicant:',
   'Ritika',
   'Singh',
   'S/o',
   'Ramesh',
   'Kumar',
   'Singh',
   'R/o',
   'C-45,',
   'Gomti',
   'Nagar,',
   'Lucknow',
   '–',
   '226010',
   'Phone:',
   '9936521874',
   'Email:',
   'ritika.singh25@gmail.com',
   'Aadhaar:',
   '4589',
   '1234',
   '7789

In [5]:
# cell 5
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_words(example):
    tokenized = tokenizer(example["words"], is_split_into_words=True, truncation=True, padding="max_length", max_length=512)
    word_ids = tokenized.word_ids()
    aligned_labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)
        elif word_idx != previous_word_idx:
            aligned_labels.append(label2id.get(example["labels"][word_idx], 0))
        else:
            # subsequent tokens of the same word get -100
            aligned_labels.append(-100)
        previous_word_idx = word_idx
    tokenized["labels"] = aligned_labels
    tokenized["fname"] = example["fname"]
    return tokenized

# Build tokenized dataset list (small dataset: one sample -> still works)
tokenized_list = [tokenize_and_align_words(ex) for ex in examples]
dataset = Dataset.from_list(tokenized_list)
# if very few examples, use same dataset for train and eval (sanity check)
if len(dataset) > 5:
    dset = dataset.train_test_split(test_size=0.2, seed=42)
    dataset = DatasetDict({"train": dset["train"], "test": dset["test"]})
else:
    dataset = DatasetDict({"train": dataset, "test": dataset})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'fname'],
        num_rows: 3
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'fname'],
        num_rows: 3
    })
})


In [6]:
# cell 6
metric = load_metric("seqeval")

def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    true_labels = []
    true_preds = []
    for p, l in zip(preds, label_ids):
        pl = []
        tl = []
        for pred_i, lab_i in zip(p, l):
            if lab_i != -100:
                pl.append(id2label[pred_i])
                tl.append(id2label[lab_i])
        true_preds.append(pl)
        true_labels.append(tl)
    return true_preds, true_labels

def compute_metrics(eval_pred):
    logits, label_ids = eval_pred
    preds_list, labels_list = align_predictions(logits, label_ids)
    results = metric.compute(predictions=preds_list, references=labels_list)
    overall = {
        "precision": results.get("overall_precision", 0.0),
        "recall": results.get("overall_recall", 0.0),
        "f1": results.get("overall_f1", 0.0)
    }
    return overall

print("Metric ready: seqeval")


Metric ready: seqeval


In [9]:
# sanity: load model variable and move to device
import torch
from transformers import AutoModelForTokenClassification

print("torch:", torch.__version__, "cuda available:", torch.cuda.is_available())

# load model if not present
try:
    model
    print("model already defined")
except NameError:
    print("Loading model:", model_name)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(LABELS),
        id2label=id2label,
        label2id=label2id
    )

# move model to device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print("Model loaded and moved to", device)


torch: 2.9.1+cpu cuda available: False
Loading model: xlm-roberta-base


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and moved to cpu


In [10]:
# cell 7 (compat for transformers 4.57.1)
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoModelForTokenClassification

# model was loaded earlier as `model`
data_collator = DataCollatorForTokenClassification(tokenizer)

# Use the argument names that match your transformers version (eval_strategy, save_strategy, logging_strategy)
args = TrainingArguments(
    output_dir="rti_model_run",
    eval_strategy="epoch",        # older name in this transformers version
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=10,
)

print("Trainer args ok. dataset sizes -> train:", len(dataset["train"]), " eval:", len(dataset["test"]))
print("Model device:", next(model.parameters()).device)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()




Trainer args ok. dataset sizes -> train: 3  eval: 3
Model device: cpu


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.894982,0.034483,0.090909,0.05


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1, training_loss=2.1222705841064453, metrics={'train_runtime': 452.4232, 'train_samples_per_second': 0.007, 'train_steps_per_second': 0.002, 'total_flos': 783946967040.0, 'train_loss': 2.1222705841064453, 'epoch': 1.0})

In [12]:
# cell 8
OUTDIR = "trained_rti_model"
trainer.save_model(OUTDIR)
tokenizer.save_pretrained(OUTDIR)
print("Saved model to", OUTDIR)


Saved model to trained_rti_model


In [13]:
# cell 9
# Build a token-classification pipeline using the saved model
pipe = pipeline("token-classification", model=OUTDIR, tokenizer=OUTDIR, aggregation_strategy="simple")

rtis_files = [ex["fname"] for ex in examples]  # only the examples you loaded
all_preds = {}

for fname in rtis_files:
    text = open(os.path.join(RTIS_DIR, fname), encoding='utf-8').read()
    preds = pipe(text)
    spans = []
    # convert pipeline outputs to {start,end,label}
    for p in preds:
        # p has keys: entity_group (label), score, word, start, end
        lbl = p.get("entity_group", p.get("entity", None))
        if lbl is None:
            continue
        spans.append({"start": int(p["start"]), "end": int(p["end"]), "label": str(lbl)})
    all_preds[fname] = spans

# Save to preds_finetuned.json
with open("preds_finetuned.json", "w", encoding="utf-8") as f:
    json.dump(all_preds, f, ensure_ascii=False, indent=2)

print("Wrote preds_finetuned.json with entries for", len(all_preds))
all_preds


Device set to use cpu


Wrote preds_finetuned.json with entries for 3


{'sample1.txt': [{'start': 0, 'end': 46, 'label': 'DATE'},
  {'start': 50, 'end': 60, 'label': 'DATE'},
  {'start': 67, 'end': 75, 'label': 'DATE'},
  {'start': 84, 'end': 107, 'label': 'DATE'},
  {'start': 111, 'end': 143, 'label': 'DATE'},
  {'start': 148, 'end': 165, 'label': 'DATE'},
  {'start': 173, 'end': 211, 'label': 'DATE'},
  {'start': 217, 'end': 218, 'label': 'DATE'},
  {'start': 225, 'end': 384, 'label': 'DATE'},
  {'start': 389, 'end': 396, 'label': 'DATE'},
  {'start': 399, 'end': 432, 'label': 'DATE'},
  {'start': 434, 'end': 445, 'label': 'DATE'},
  {'start': 446, 'end': 455, 'label': 'DATE'},
  {'start': 459, 'end': 546, 'label': 'DATE'}],
 'sample2.txt': [{'start': 0, 'end': 46, 'label': 'DATE'},
  {'start': 50, 'end': 79, 'label': 'DATE'},
  {'start': 83, 'end': 89, 'label': 'DATE'},
  {'start': 98, 'end': 99, 'label': 'DATE'},
  {'start': 102, 'end': 139, 'label': 'DATE'},
  {'start': 148, 'end': 167, 'label': 'DATE'},
  {'start': 178, 'end': 185, 'label': 'DATE'},

In [14]:
# cell 10 (optional)
import subprocess, sys
print("Evaluating baseline preds.json (if exists) and finetuned preds_finetuned.json")
if os.path.exists("preds.json"):
    print("Baseline:")
    subprocess.run([sys.executable, "eval_script.py", "gold.json", "preds.json"])
print("\nFine-tuned:")
subprocess.run([sys.executable, "eval_script.py", "gold.json", "preds_finetuned.json"])


Evaluating baseline preds.json (if exists) and finetuned preds_finetuned.json
Baseline:

Fine-tuned:


CompletedProcess(args=['c:\\Users\\aasth\\RTI-Redaction-BTP\\venv\\Scripts\\python.exe', 'eval_script.py', 'gold.json', 'preds_finetuned.json'], returncode=0)