In [None]:
! pip install transformers datasets torch evaluate seqeval scikit-learn accelerate indic-transliteration tqdm sentencepiece


In [None]:
import json
from pathlib import Path
import numpy as np
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
import evaluate
from tqdm import tqdm
import gc
import torch

# Install dependencies
try:
    from indic_transliteration import sanscript
    from indic_transliteration.sanscript import transliterate
except ImportError:
    import subprocess
    subprocess.check_call(['pip', 'install', 'indic-transliteration', '-q'])
    from indic_transliteration import sanscript
    from indic_transliteration.sanscript import transliterate

# ============================================================
# PATH CONFIGURATION - UPDATE THIS FOR YOUR LOCAL MACHINE
# ============================================================

# OPTION 1: If cross_lingual_data is in same directory as script
data_dir = Path("cross_lingual_data")

# OPTION 2: If you're on Google Colab
# data_dir = Path("/content/cross_lingual_data")

# OPTION 3: Absolute path (Windows example)
# data_dir = Path("C:/Users/YourName/Desktop/BTP/cross_lingual_data")

# OPTION 4: Absolute path (Linux/Mac example)
# data_dir = Path("/home/samarth/Documents/BTP/cross_lingual_data")

print("="*70)
print(" CONFIGURATION")
print("="*70)
print(f"Current working directory: {Path.cwd()}")
print(f"Data directory: {data_dir.absolute()}")
print()

# Verify directory exists
if not data_dir.exists():
    print(f" ERROR: Directory not found!")
    print(f"   Looking for: {data_dir.absolute()}")
    print(f"\n Fix:")
    print(f"   1. Update 'data_dir' variable above")
    print(f"   2. Make sure cross_lingual_data folder exists")
    print(f"   3. Check that it contains: as_data.json, bn_data.json, etc.")
    exit(1)

# List files in directory
print(f" Files in {data_dir.name}:")
for file in sorted(data_dir.glob("*.json")):
    size_mb = file.stat().st_size / (1024*1024)
    print(f"   - {file.name} ({size_mb:.2f} MB)")
print()

# ============================================================
# SCRIPT MAPPING
# ============================================================

SCRIPT_MAP = {
    'as': sanscript.BENGALI,
    'bn': sanscript.BENGALI,
    'gu': sanscript.GUJARATI,
    'ml': sanscript.MALAYALAM,
    'mr': sanscript.DEVANAGARI,
    'ta': sanscript.TAMIL,
    'te': sanscript.TELUGU,
}

def transliterate_to_devanagari(text, lang_code):
    if lang_code == 'mr':
        return text
    source_script = SCRIPT_MAP[lang_code]
    try:
        return transliterate(text, source_script, sanscript.DEVANAGARI)
    except:
        return text

# ============================================================
# LOAD DATA
# ============================================================

def load_and_convert_data(file_path, lang_code):
    """Load and convert JSON data."""

    data = []

    # Count lines
    with open(file_path, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    # Load
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, total=total_lines, desc=f"  {lang_code.upper()}", ncols=80):
            try:
                item = json.loads(line.strip())
                devanagari_tokens = [
                    transliterate_to_devanagari(word, lang_code)
                    for word in item['words']
                ]
                data.append({
                    'tokens': devanagari_tokens,
                    'ner_tags': item['ner']
                })
            except:
                continue

    return data

# Load all languages
print("="*70)
print(" LOADING DATA")
print("="*70)

languages = ['as', 'bn', 'gu', 'ml', 'mr', 'ta', 'te']
all_data = []

for lang in languages:
    file_path = data_dir / f"{lang}_data.json"

    print(f"\n {lang.upper()}: {file_path.name}")

    if file_path.exists():
        lang_data = load_and_convert_data(file_path, lang)
        print(f"   ✓ {len(lang_data):,} examples")

        if lang_data:
            sample = lang_data[0]
            print(f"   Devanagari: {' '.join(sample['tokens'][:3])}...")

        all_data.extend(lang_data)
        del lang_data
        gc.collect()
    else:
        print(f"    NOT FOUND: {file_path.absolute()}")

print(f"\n Total: {len(all_data):,} examples")
print("="*70)

# ============================================================
# SPLIT DATA
# ============================================================

train_val, test = train_test_split(all_data, test_size=0.10, random_state=42)
train, val = train_test_split(train_val, test_size=0.10, random_state=42)

print(f"\n Splits:")
print(f"  Train: {len(train):,}")
print(f"  Val:   {len(val):,}")
print(f"  Test:  {len(test):,}")

del all_data
gc.collect()

dataset = DatasetDict({
    'train': Dataset.from_list(train),
    'validation': Dataset.from_list(val),
    'test': Dataset.from_list(test)
})

# ============================================================
# LABELS & MODEL
# ============================================================

all_labels = set()
for item in train[:1000]:
    all_labels.update(item['ner_tags'])

label_list = sorted(list(all_labels))
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

print(f"\n Labels: {label_list}")

model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

# ============================================================
# TOKENIZE
# ============================================================

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding=False,
        max_length=512
    )

    labels = []
    for i, label_seq in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label_seq[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

print("\n Tokenizing...")
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    batch_size=1000,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing"
)

# ============================================================
# METRICS
# ============================================================

seqeval = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []

    for prediction, label in zip(predictions, labels):
        pred_labels = []
        true_label_list = []

        for pred_id, label_id in zip(prediction, label):
            if label_id != -100:
                pred_labels.append(id2label[pred_id])
                true_label_list.append(id2label[label_id])

        true_predictions.append(pred_labels)
        true_labels.append(true_label_list)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
    }

# ============================================================
# TRAINING CONFIGURATION WITH CHECKPOINTING
# ============================================================

print("\n  Training configuration...")

# Create checkpoint directory
checkpoint_dir = "./checkpoints"
Path(checkpoint_dir).mkdir(exist_ok=True)

training_args = TrainingArguments(
    # Output and checkpointing
    output_dir=checkpoint_dir,
    overwrite_output_dir=True,

    # Checkpoint strategy
    save_strategy="steps",
    save_steps=1000,                      # Save every 1000 steps
    save_total_limit=3,                   # Keep only best 3 checkpoints

    # Evaluation
    eval_strategy="steps",
    eval_steps=1000,                      # Evaluate every 1000 steps

    # Model selection
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # Training hyperparameters
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,

    # Optimization
    gradient_accumulation_steps=2,        # Effective batch = 32
    fp16=torch.cuda.is_available(),       # Mixed precision if GPU
    dataloader_num_workers=4,

    # Logging
    logging_dir="./logs",
    logging_steps=100,
    logging_strategy="steps",

    # Misc
    seed=42,
    push_to_hub=False,
    report_to="none",
    resume_from_checkpoint=None           # Auto-resume if checkpoint exists
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print(f"  Configured!")
print(f"  Checkpoints: {checkpoint_dir}")
print(f"  Save every: {training_args.save_steps} steps")
print(f"  Keep best: {training_args.save_total_limit} checkpoints")

# ============================================================
# TRAINING WITH AUTO-RESUME
# ============================================================

print("\n" + "="*70)
print("TRAINING")
print("="*70)

# Check for existing checkpoints
existing_checkpoints = list(Path(checkpoint_dir).glob("checkpoint-*"))
if existing_checkpoints:
    print(f" Found {len(existing_checkpoints)} existing checkpoints")
    print(f"   Training will resume from last checkpoint")

try:
    trainer.train(resume_from_checkpoint=True if existing_checkpoints else None)
    print("\nTraining completed!")
except KeyboardInterrupt:
    print("\n Training interrupted! Checkpoints saved.")
    print(f"   Resume with: trainer.train(resume_from_checkpoint=True)")

# ============================================================
# SAVE FINAL MODEL
# ============================================================

output_dir = "./xlm-roberta-devanagari-ner-final"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"\nFinal model: {output_dir}")

# ============================================================
# EVALUATE
# ============================================================

print("\nEvaluating...")
test_results = trainer.evaluate(tokenized_dataset["test"])

print("\n" + "="*70)
print("TEST RESULTS")
print("="*70)
print(f"Precision: {test_results['eval_precision']:.4f}")
print(f"Recall:    {test_results['eval_recall']:.4f}")
print(f"F1 Score:  {test_results['eval_f1']:.4f}")
print("="*70)

# ============================================================
# CLEANUP CHECKPOINTS (OPTIONAL)
# ============================================================

print("\nCleanup options:")
print("  Keep checkpoints: Do nothing")
print("  Delete checkpoints: Run 'rm -rf ./checkpoints'")

print("\nDONE!")


 CONFIGURATION
Current working directory: /content
Data directory: /content/cross_lingual_data

 Files in cross_lingual_data:
   - as_data.json (0.95 MB)

 LOADING DATA

 AS: as_data.json


  AS: 100%|██████████████████████████████| 3767/3767 [00:00<00:00, 10262.41it/s]


   ✓ 3,766 examples
   Devanagari: प्ৰान्तीय় ৰिलिफ...

 BN: bn_data.json
    NOT FOUND: /content/cross_lingual_data/bn_data.json

 GU: gu_data.json
    NOT FOUND: /content/cross_lingual_data/gu_data.json

 ML: ml_data.json
    NOT FOUND: /content/cross_lingual_data/ml_data.json

 MR: mr_data.json
    NOT FOUND: /content/cross_lingual_data/mr_data.json

 TA: ta_data.json
    NOT FOUND: /content/cross_lingual_data/ta_data.json

 TE: te_data.json
    NOT FOUND: /content/cross_lingual_data/te_data.json

 Total: 3,766 examples

 Splits:
  Train: 3,050
  Val:   339
  Test:  377

 Labels: ['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Tokenizing...


Tokenizing:   0%|          | 0/3050 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/339 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/377 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]


  Training configuration...
  Configured!
  Checkpoints: ./checkpoints
  Save every: 1000 steps
  Keep best: 3 checkpoints

TRAINING


  trainer = Trainer(



 Training interrupted! Checkpoints saved.
   Resume with: trainer.train(resume_from_checkpoint=True)

Final model: ./xlm-roberta-devanagari-ner-final

Evaluating...


KeyboardInterrupt: 

In [None]:
# ============================================================
# SAMPLE PREDICTION
# ============================================================

print("\n" + "="*70)
print("SAMPLE PREDICTION (Transliteration and Model Test)")
print("="*70)

# --- 1. Test Transliteration ---
sample_ml_text = "আজিৰ পৰা ৰিলায়েন্স ইণ্ডাষ্ট্ৰীজৰ নতুন চিইঅ' হ'ল মুকেশ অম্বানী।"
lang_code = 'as' # Malayalam

devanagari_text = transliterate_to_devanagari(sample_ml_text, lang_code)
print(f"Original ({lang_code.upper()}): {sample_ml_text}")
print(f"Transliterated (Devanagari): {devanagari_text}")
print("---")


# --- 2. Test Model Prediction ---

# NOTE: This prediction will use the model trained (or loaded) just above.
# If the script stopped before training finished, the performance will be poor.

test_tokens = devanagari_text.split()
print(f"Tokens for prediction: {test_tokens}")

# Tokenize the input
tokenized_input = tokenizer(
    test_tokens,
    is_split_into_words=True,
    return_tensors="pt"
)

# Put model in evaluation mode and run prediction
model.eval()
with torch.no_grad():
    # Pass inputs to the model
    outputs = model(**tokenized_input)

# Get the predicted token IDs
predictions = torch.argmax(outputs.logits, dim=2).squeeze().tolist()

# Get word IDs to map back to original tokens
word_ids = tokenized_input.word_ids()

# Map token IDs back to labels (I-PER, B-ORG, etc.)
final_predictions = []
current_word_idx = None
print("\nPrediction Results:")
for token_id, pred_id, word_idx in zip(tokenized_input.input_ids.squeeze().tolist(), predictions, word_ids):
    # Only consider prediction if it corresponds to the start of a new original word
    if word_idx is not None and word_idx != current_word_idx:
        token = tokenizer.convert_ids_to_tokens(token_id)
        label = id2label[pred_id]
        final_predictions.append((token, label))
    current_word_idx = word_idx

# Display aligned results
prev_word_idx = None
print(f"{'Token':<15} {'Predicted Label':<15}")
print(f"{'-'*15:<15} {'-'*15:<15}")

for i, (token, label) in enumerate(final_predictions):
    if token.startswith(' '): # XLM-R uses ' ' to mark the start of a word-piece
        token = token.lstrip(' ')
    elif token.startswith('</s>') or token.startswith('<s>'):
        continue

    print(f"{token:<15} {label:<15}")

print("="*70)


SAMPLE PREDICTION (Transliteration and Model Test)
Original (AS): আজিৰ পৰা ৰিলায়েন্স ইণ্ডাষ্ট্ৰীজৰ নতুন চিইঅ' হ'ল মুকেশ অম্বানী।
Transliterated (Devanagari): आजिৰ पৰा ৰिलाय়ेन्स इण्डाष्ट्ৰीजৰ नतुन चिइअ' ह'ल मुकेश अम्वानी।
---
Tokens for prediction: ['आजिৰ', 'पৰा', 'ৰिलाय়ेन्स', 'इण्डाष्ट्ৰीजৰ', 'नतुन', "चिइअ'", "ह'ल", 'मुकेश', 'अम्वानी।']

Prediction Results:
Token           Predicted Label
--------------- ---------------
▁आज             B-ORG          
▁प              B-ORG          
▁               B-ORG          
▁इ              B-ORG          
▁न              B-ORG          
▁चि             B-ORG          
▁ह              B-ORG          
▁मु             B-ORG          
▁अ              B-ORG          
