In [1]:
!pip install -U transformers datasets peft accelerate evaluate jiwer sentencepiece ctranslate2 gradio



In [2]:
# notebooks/02_baseline_inference.ipynb

from src.data_preprocess import create_hf_datasets, prepare_multilingual_data
from src.model_inference import load_model_and_tokenizer, run_inference_on_dataset
from src.evaluation import compute_metrics
import os
from pathlib import Path

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Project location and Hugging Face cache configuration.
# The root can be overridden without editing the notebook by setting the
# DEVEGIRI_PROJECT_ROOT environment variable; otherwise the original default is used.
PROJECT_ROOT = Path(os.environ.get("DEVEGIRI_PROJECT_ROOT", "D:/devegiri_task"))
HF_CACHE_DIR = PROJECT_ROOT / "models" / "hf_cache"

# NOTE(review): these variables are set after the project imports in the previous
# cell; if those imports already loaded huggingface_hub/transformers, the cache
# location may have been resolved at import time — confirm, and run this first if so.
os.environ["HF_HOME"] = str(HF_CACHE_DIR)
# Legacy variable kept for older transformers releases — TODO confirm it is still
# honoured by the installed version.
os.environ["TRANSFORMERS_CACHE"] = str(HF_CACHE_DIR / "transformers")

print("All Hugging Face downloads will go to:", HF_CACHE_DIR)

All Hugging Face downloads will go to: D:/devegiri_task/models/hf_cache


In [4]:
# Data preparation: obtain the train/validation frames from the project helper,
# then hand both to the dataset builder.
(df_train_all, df_valid_all) = prepare_multilingual_data()
(train_ds, val_ds) = create_hf_datasets(
    df_train_all,
    df_valid_all,
)

In [5]:
# Random eval subset: a fixed-seed shuffle of the validation split, truncated to
# at most EVAL_SIZE rows.
EVAL_SIZE = 50
EVAL_SEED = 42
# Clamp to the split size so a validation set smaller than EVAL_SIZE does not
# make select() fail on out-of-range indices.
eval_subset = val_ds.shuffle(seed=EVAL_SEED).select(range(min(EVAL_SIZE, len(val_ds))))

# Load model; the helper's result is unpacked as (model, tokenizer, device).
BASE_MODEL_NAME = "google/byt5-small"
model, tokenizer, device = load_model_and_tokenizer(BASE_MODEL_NAME)



Loading weights:   0%|          | 0/172 [00:00<?, ?it/s]



In [6]:

# Baseline evaluation of the loaded model on `eval_subset`.
# Disabled by default (this code was previously commented out); set the flag to
# True to run inference and print the metrics.
RUN_BASELINE = False

if RUN_BASELINE:
    # Inference
    preds, refs = run_inference_on_dataset(model, tokenizer, eval_subset, device, batch_size=32)

    # Evaluate
    metrics = compute_metrics(preds, refs)
    print(metrics)

In [7]:
# After baseline: LoRA fine-tuning on a random subset of the training split.

from src.trainer import run_lora_training

# Small training subset
TRAIN_SIZE = 50_000   # adjust down if too slow
LORA_OUTPUT_DIR = "/content/byt5-lora-colab"

# Clamp to the split size so a training set smaller than TRAIN_SIZE does not
# make select() fail on out-of-range indices.
train_size = min(TRAIN_SIZE, len(train_ds))
train_subset = train_ds.shuffle(seed=42).select(range(train_size))

# Use the same eval_subset as baseline
print("Starting LoRA training on subset...")
trainer = run_lora_training(
    model=model,
    tokenizer=tokenizer,
    train_subset=train_subset,
    eval_subset=eval_subset,
    output_dir=LORA_OUTPUT_DIR
)

Starting LoRA training on subset...
trainable params: 593,920 || all params: 301,362,176 || trainable%: 0.1971


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Starting LoRA fine-tuning...


Step,Training Loss,Validation Loss,Cer,Exact Match
200,21.047251,3.738491,6.7802,0.0
300,20.975581,3.715599,6.7824,0.0
400,20.885701,3.684757,6.7824,0.0
500,20.79291,3.651596,6.7824,0.0
600,20.399258,3.611497,6.7824,0.0
700,20.13142,3.569999,6.7802,0.0
800,19.847338,3.532461,6.778,0.0
900,19.569072,3.496332,6.778,0.0
1000,19.230349,3.462256,6.778,0.0
1100,18.870392,3.432108,6.7758,0.0


KeyboardInterrupt: 

In [None]:
# Mount Google Drive so that later cells can copy artifacts into it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!cp -r /content/byt5-lora-colab /content/drive/MyDrive/devegiri_task/

In [None]:
from peft import PeftModel

peft_model = PeftModel.from_pretrained(model, "/content/byt5-lora-colab/final_adapter")
merged_model = peft_model.merge_and_unload()

merged_path = "/content/byt5-translit-merged"
merged_model.save_pretrained(merged_path)
tokenizer.save_pretrained(merged_path)

# CTranslate2
!ct2-transformers-converter --model {merged_path} --output_dir /content/byt5-ctranslate2 --quantization int8 --force

# Copy to Drive
!cp -r /content/byt5-ctranslate2 /content/drive/MyDrive/devegiri_task/