In [None]:
!pip install -U transformers datasets peft accelerate evaluate jiwer sentencepiece ctranslate2 gradio



In [None]:
# notebooks/02_baseline_inference.ipynb

from src.data_preprocess import create_hf_datasets, prepare_multilingual_data
from src.model_inference import load_model_and_tokenizer, run_inference_on_dataset
from src.evaluation import compute_metrics
import os
from pathlib import Path

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Project root and Hugging Face cache location.
# The root can be overridden with the DEVEGIRI_PROJECT_ROOT environment variable
# (for example a /content/... path on Colab); the original hard-coded path stays
# the default so existing setups keep behaving the same.
PROJECT_ROOT = Path(os.environ.get("DEVEGIRI_PROJECT_ROOT", "D:/devegiri_task"))
HF_CACHE_DIR = PROJECT_ROOT / "models" / "hf_cache"

# NOTE(review): the Hugging Face libraries read these variables when they are
# first imported. The `src.*` imports in the earlier cell may already have
# pulled them in — confirm, and move this cell above those imports if so.
os.environ["HF_HOME"] = str(HF_CACHE_DIR)
# NOTE(review): recent transformers releases deprecate TRANSFORMERS_CACHE in
# favour of HF_HOME; it is kept here for older versions — confirm it is still needed.
os.environ["TRANSFORMERS_CACHE"] = str(HF_CACHE_DIR / "transformers")

print("All Hugging Face downloads will go to:", HF_CACHE_DIR)

All Hugging Face downloads will go to: D:/devegiri_task/models/hf_cache


In [None]:
# Data preparation: build the train/validation inputs, then convert them
# into the dataset objects used by the rest of the notebook.
_prepared_splits = prepare_multilingual_data()
df_train_all, df_valid_all = _prepared_splits

_hf_splits = create_hf_datasets(df_train_all, df_valid_all)
train_ds, val_ds = _hf_splits

In [19]:
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Random (but reproducible, seed=42) evaluation subset.
# The size is clamped so that a validation split with fewer than EVAL_SIZE rows
# does not make `select` fail on out-of-range indices.
EVAL_SIZE = 50
eval_subset = val_ds.shuffle(seed=42).select(range(min(EVAL_SIZE, len(val_ds))))

# Load the model directly with transformers for better control and error handling.
model_id = "google/mt5-small"

# Use the HF_TOKEN environment variable when present. When it is missing,
# `token=None` is passed through to `from_pretrained`.
# NOTE(review): a Colab *secret* is not necessarily exported as an environment
# variable — confirm how the token is expected to reach this cell.
hf_token = os.environ.get("HF_TOKEN")

print(f"Loading model and tokenizer for: {model_id}")
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, token=hf_token)

# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print(f"Model '{model_id}' loaded successfully on device: {device}")

Loading model and tokenizer for: google/mt5-small




Loading weights:   0%|          | 0/192 [00:00<?, ?it/s]



Model 'google/mt5-small' loaded successfully on device: cuda


In [None]:

# Baseline inference: run the loaded model over the evaluation subset
_baseline_outputs = run_inference_on_dataset(
    model,
    tokenizer,
    eval_subset,
    device,
    batch_size=32,
)
preds, refs = _baseline_outputs

# Score the predictions against the references and show the metrics
metrics = compute_metrics(preds, refs)
print(metrics)

  0%|          | 0/2 [00:00<?, ?it/s]

{'cer': 1.410989010989011, 'exact_match': np.float64(0.0)}


In [20]:
# After baseline

from src.trainer import run_lora_training

# Small training subset
TRAIN_SIZE = 20000   # adjust down if too slow
# Clamp the size so that a training split with fewer than TRAIN_SIZE rows does
# not make `select` fail on out-of-range indices.
train_subset = train_ds.shuffle(seed=42).select(range(min(TRAIN_SIZE, len(train_ds))))

# Use the same eval_subset as baseline so the metrics are directly comparable.
# NOTE: the output directory name says "byt5" although the model loaded in this
# notebook is google/mt5-small; the name is kept because later cells read the
# adapter back from this exact path.
# NOTE(review): the recorded run shows an essentially flat training loss and an
# unchanged CER — check the optimizer / precision settings in run_lora_training.
print("Starting LoRA training on subset...")
trainer = run_lora_training(
    model=model,
    tokenizer=tokenizer,
    train_subset=train_subset,
    eval_subset=eval_subset,
    output_dir="/content/byt5-lora-colab"
)

Starting LoRA training on subset...
trainable params: 344,064 || all params: 556,635,520 || trainable%: 0.0618


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Starting LoRA fine-tuning...


Step,Training Loss,Validation Loss,Cer,Exact Match
100,128.418936,28.33881,1.389,0.0
200,128.699189,28.299299,1.389,0.0
300,129.855215,28.259947,1.389,0.0
400,128.015342,28.245739,1.389,0.0
500,128.892363,28.203421,1.389,0.0
600,129.160186,28.159716,1.389,0.0
700,127.341289,28.089937,1.389,0.0
800,128.679541,28.074291,1.389,0.0
900,128.919531,28.046047,1.389,0.0
1000,127.981777,28.024702,1.389,0.0


LoRA adapter saved to: /content/byt5-lora-colab/final_adapter


In [21]:
# Mount Google Drive inside the Colab VM
from google.colab import drive

_DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(_DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
!cp -r /content/byt5-lora-colab /content/drive/MyDrive/devegiri_task/

In [None]:
from peft import PeftModel

# Attach the trained adapter to a freshly loaded base model instead of the
# in-memory `model`. That object was handed to run_lora_training, and PEFT
# injects LoRA layers into the base model in place, so it may no longer be a
# clean base checkpoint. NOTE(review): confirm against src.trainer.
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_id, token=hf_token)
peft_model = PeftModel.from_pretrained(base_model, "/content/byt5-lora-colab/final_adapter")

# Fold the LoRA weights into the base weights and drop the adapter wrappers,
# leaving a plain transformers model that can be saved and converted.
merged_model = peft_model.merge_and_unload()

# Save the merged model together with its tokenizer so the directory is
# self-contained for downstream conversion.
merged_path = "/content/mt5-small-translit-merged"
merged_model.save_pretrained(merged_path)
tokenizer.save_pretrained(merged_path)


In [None]:
# CTranslate2
!ct2-transformers-converter --model {merged_path} --output_dir /content/mt5-small-ctranslate2 --quantization int8 --force

# Copy to Drive
!cp -r /content/byt5-ctranslate2 /content/drive/MyDrive/devegiri_task/