In [1]:
!pip install -U transformers datasets peft accelerate evaluate jiwer sentencepiece ctranslate2 gradio

Collecting transformers
  Downloading transformers-5.1.0-py3-none-any.whl.metadata (31 kB)
Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting ctranslate2
  Downloading ctranslate2-4.7.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting gradio
  Downloading gradio-6.5.1-py3-none-any.whl.metadata (16 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-23.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting gradio-client==2.0.3 (from gradio)
  Downloading gradio_client-2.0.3-py3-none-any.whl.metadata (7.1 kB)
Downloading transformers-5.1.0-py3-none-any.whl (10.3 MB)
[2

In [2]:
# notebooks/02_baseline_inference.ipynb

from src.data_preprocess import create_hf_datasets, prepare_multilingual_data
from src.model_inference import load_model_and_tokenizer, run_inference_on_dataset
from src.evaluation import compute_metrics
import os
from pathlib import Path

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

In [3]:
# Project root and Hugging Face cache location.
# The root can be overridden without editing the notebook by setting the
# DEVEGIRI_PROJECT_ROOT environment variable (e.g. to a /content/... path on Colab);
# when it is unset the original local path is used, so existing behaviour is unchanged.
PROJECT_ROOT = Path(os.environ.get("DEVEGIRI_PROJECT_ROOT", "D:/devegiri_task"))
HF_CACHE_DIR = PROJECT_ROOT / "models" / "hf_cache"

# NOTE(review): Hugging Face libraries read their cache settings when they are first
# imported. The src.* imports in the previous cell may already have pulled them in, in
# which case these assignments have no effect until the kernel is restarted and this
# cell runs before those imports — confirm.
os.environ["HF_HOME"] = str(HF_CACHE_DIR)
# TRANSFORMERS_CACHE is deprecated in favour of HF_HOME; kept for older transformers releases.
os.environ["TRANSFORMERS_CACHE"] = str(HF_CACHE_DIR / "transformers")

print("All Hugging Face downloads will go to:", HF_CACHE_DIR)

All Hugging Face downloads will go to: D:/devegiri_task/models/hf_cache


In [4]:
# Prepare data
# prepare_multilingual_data() returns a (train, validation) pair, which is passed
# unchanged to create_hf_datasets(); both helpers are imported from src.data_preprocess.
# The resulting train_ds / val_ds are the datasets the later cells subsample from.
# NOTE(review): the df_ prefix suggests pandas DataFrames — confirm in src.data_preprocess.
df_train_all, df_valid_all = prepare_multilingual_data()
train_ds, val_ds = create_hf_datasets(df_train_all, df_valid_all)

In [None]:
# Random eval subset (fixed seed so the baseline and the fine-tuned model see the same rows).
# The requested size is capped at the dataset length: select() raises on out-of-range
# indices, which would happen if the validation split had fewer than EVAL_SIZE rows.
EVAL_SIZE = 500
eval_subset = val_ds.shuffle(seed=42).select(range(min(EVAL_SIZE, len(val_ds))))

# Load model
# load_model_and_tokenizer (from src.model_inference) returns the model, its tokenizer
# and the device handle that the inference helper is later called with.
model, tokenizer, device = load_model_and_tokenizer("google/mt5-small")

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/172 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:

# Generate baseline predictions (and collect their references) for the evaluation subset.
preds, refs = run_inference_on_dataset(
    model,
    tokenizer,
    eval_subset,
    device,
    batch_size=32,
)

# Score the predictions against the references and show the result.
metrics = compute_metrics(preds, refs)
print(metrics)

In [None]:
# After baseline (in same notebook)

from src.trainer import run_lora_training

# Small training subset
TRAIN_SIZE = 50000   # adjust down if too slow
# Cap at the dataset length: select() raises on out-of-range indices when the
# training split holds fewer than TRAIN_SIZE rows.
train_subset = train_ds.shuffle(seed=42).select(range(min(TRAIN_SIZE, len(train_ds))))

# Use the same eval_subset as baseline
# NOTE(review): the output directory is named "byt5-..." although the model loaded
# earlier in this notebook is google/mt5-small; the path is left unchanged because
# later cells read the adapter from it.
print("Starting LoRA training on subset...")
trainer = run_lora_training(
    model=model,
    tokenizer=tokenizer,
    train_subset=train_subset,
    eval_subset=eval_subset,   # same 500 as baseline
    output_dir="/content/byt5-lora-colab"
)

Starting LoRA training on subset...
trainable params: 593,920 || all params: 301,362,176 || trainable%: 0.1971




Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Starting LoRA fine-tuning...


Step,Training Loss,Validation Loss


In [None]:
# Mount Google Drive at /content/drive so that the later cells can copy training
# artifacts into /content/drive/MyDrive.
# The google.colab module is provided by the Colab runtime, so this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!cp -r /content/byt5-lora-colab /content/drive/MyDrive/devegiri_task/

In [None]:
from peft import PeftModel

peft_model = PeftModel.from_pretrained(model, "/content/byt5-lora-colab/final_adapter")
merged_model = peft_model.merge_and_unload()

merged_path = "/content/byt5-translit-merged"
merged_model.save_pretrained(merged_path)
tokenizer.save_pretrained(merged_path)

# CTranslate2
!ct2-transformers-converter --model {merged_path} --output_dir /content/byt5-ctranslate2 --quantization int8 --force

# Copy to Drive
!cp -r /content/byt5-ctranslate2 /content/drive/MyDrive/devegiri_task/