In [None]:
# notebooks/02_baseline_inference.ipynb

from src.data_preprocess import create_hf_datasets, prepare_multilingual_data
from src.model_inference import load_model_and_tokenizer, run_inference_on_dataset
from src.evaluation import compute_metrics
import os
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Hugging Face cache configuration.
#
# PROJECT_ROOT can be overridden by exporting a PROJECT_ROOT environment
# variable before starting the kernel; when it is not set, the original
# machine-specific default below is used, so existing setups keep working.
PROJECT_ROOT = Path(os.environ.get("PROJECT_ROOT", "D:/devegiri_task"))   # ← your project path
HF_CACHE_DIR = PROJECT_ROOT / "models" / "hf_cache"

# NOTE(review): the Hugging Face libraries resolve their cache location when
# they are first imported. The `src.*` imports run in an earlier cell; if those
# modules import transformers/huggingface_hub at module level, the assignments
# below may come too late to take effect — confirm, and move this cell above
# the imports if so.
os.environ["HF_HOME"] = str(HF_CACHE_DIR)
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favour of HF_HOME; kept so older versions behave as before.
os.environ["TRANSFORMERS_CACHE"] = str(HF_CACHE_DIR / "transformers")

print("All Hugging Face downloads will go to:", HF_CACHE_DIR)

All Hugging Face downloads will go to: D:\devegiri_task\models\hf_cache


In [None]:
# Data preparation: build the multilingual train/validation splits, then wrap
# them as Hugging Face datasets for the inference and training cells below.
multilingual_splits = prepare_multilingual_data()
df_train_all, df_valid_all = multilingual_splits

hf_datasets = create_hf_datasets(df_train_all, df_valid_all)
train_ds, val_ds = hf_datasets

In [None]:
# Random eval subset
# The seed and size are named so the baseline and any later comparison are
# guaranteed to draw the same rows.
EVAL_SEED = 42
EVAL_SIZE = 500

# Clamp to the dataset length: select() raises if asked for indices beyond the
# end, which would happen on a validation split smaller than EVAL_SIZE.
n_eval = min(EVAL_SIZE, len(val_ds))
eval_subset = val_ds.shuffle(seed=EVAL_SEED).select(range(n_eval))

# Load model
# The helper returns the model, its tokenizer and the device it selected.
model, tokenizer, device = load_model_and_tokenizer("google/mt5-small")

Loading weights: 100%|██████████| 172/172 [00:00<00:00, 572.79it/s, Materializing param=shared.weight]                                                      


In [None]:

# Baseline inference: generate predictions for the fixed evaluation subset.
# The helper returns a (predictions, references) pair.
inference_outputs = run_inference_on_dataset(
    model,
    tokenizer,
    eval_subset,
    device,
    batch_size=32,
)
preds, refs = inference_outputs

# Score the predictions against the references and show the result.
metrics = compute_metrics(preds, refs)
print(metrics)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 172/172 [00:00<00:00, 923.76it/s, Materializing param=shared.weight]                                                      
100%|██████████| 16/16 [12:54<00:00, 48.38s/it]

{'cer': 7.23841376015289, 'exact_match': np.float64(0.0)}





In [None]:
# After baseline (in same notebook): LoRA fine-tuning on a small training subset.

from src.trainer import run_lora_training

# Small training subset
TRAIN_SIZE = 5000   # adjust down if too slow

# Clamp to the dataset length: select() raises if asked for indices beyond the
# end, which would happen on a training split smaller than TRAIN_SIZE.
n_train = min(TRAIN_SIZE, len(train_ds))
train_subset = train_ds.shuffle(seed=42).select(range(n_train))

# Use the same eval_subset as baseline
# NOTE(review): `model` is the same object used for the baseline run; if
# run_lora_training wraps or modifies it in place, re-running the baseline
# cell afterwards would no longer measure the untouched checkpoint — confirm.
# NOTE(review): output_dir says "byt5" but the loaded checkpoint is
# google/mt5-small; the path is left unchanged so existing checkpoints are
# still found, but consider renaming it.
print("Starting LoRA training on subset...")
trainer = run_lora_training(
    model=model,
    tokenizer=tokenizer,
    train_subset=train_subset,
    eval_subset=eval_subset,   # same eval subset as the baseline
    output_dir="./byt5-lora-local"
)

Starting LoRA training on subset...
trainable params: 593,920 || all params: 301,362,176 || trainable%: 0.1971


Map: 100%|██████████| 5000/5000 [00:11<00:00, 438.63 examples/s]
Map: 100%|██████████| 500/500 [00:01<00:00, 476.07 examples/s]


Starting LoRA fine-tuning...


  super().__init__(loader)


Step,Training Loss,Validation Loss


KeyboardInterrupt: 