# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [2]:
MODEL_NAME = "facebook/nllb-200-distilled-600M"
CSV_PATH = "full_corpus.csv"
OUTPUT_PATH = "./nllb_ro_rromani_lora"

In [1]:
import torch
print(f"Torch version: {torch.__version__}")    # Should be 2.6.0+cu124
print(f"CUDA Available: {torch.cuda.is_available()}") # MUST be True for GPU training

Torch version: 2.6.0+cu124
CUDA Available: True


In [3]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from peft import LoraConfig, get_peft_model


OUTPUT_DIR = "./nllb_ro_rromani_lora"
MAX_LEN = 256

df = pd.read_csv(CSV_PATH)

df["ro"] = df["ro"].astype(str)
df["rmy"] = df["rmy"].astype(str)

df = df[
    df["ro"].notna() &
    df["rmy"].notna() &
    (df["ro"].str.strip() != "") &
    (df["rmy"].str.strip() != "")
]

df = df.reset_index(drop=True)

print("Rows after cleaning:", len(df))

ds = Dataset.from_pandas(df[["ro", "rmy"]])
ds = ds.train_test_split(test_size=0.08, seed=42)  # 92/8 split
train_ds, eval_ds = ds["train"], ds["test"]

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
SRC_LANG = "ron_Latn"
TGT_LANG = "rom_Latn"
tokenizer.src_lang = SRC_LANG
forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)

def preprocess(batch):
    ro_list = batch.get("ro")
    rmy_list = batch.get("rmy")

    # Arrow may give empty batches
    if ro_list is None or rmy_list is None:
        return {}

    cleaned_ro = []
    cleaned_rmy = []

    for ro, rmy in zip(ro_list, rmy_list):
        if ro and rmy:
            ro = str(ro).strip()
            rmy = str(rmy).strip()
            if ro != "" and rmy != "":
                cleaned_ro.append(ro)
                cleaned_rmy.append(rmy)

    # IMPORTANT: skip empty batches
    if len(cleaned_ro) == 0:
        return {}

    model_inputs = tokenizer(
        cleaned_ro,
        truncation=True,
        max_length=MAX_LEN,
    )

    labels = tokenizer(
        cleaned_rmy,
        truncation=True,
        max_length=MAX_LEN,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_tok = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
eval_tok = eval_ds.map(preprocess, batched=True, remove_columns=eval_ds.column_names)

  from .autonotebook import tqdm as notebook_tqdm


Rows after cleaning: 10379


Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 9548/9548 [00:00<00:00, 22617.55 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 831/831 [00:00<00:00, 20072.49 examples/s]


In [4]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.version.cuda)


2.6.0+cu124
True
12.4


In [6]:
import sacrebleu
import torch

model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    # use_safetensors=True,
    # low_cpu_mem_usage=False,   # ðŸ”´ CRITICAL
    dtype=torch.float16,
    device_map="auto"
)


# LoRA config for seq2seq (target attention projections)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
)

model = get_peft_model(model, lora_config)

# Ensure correct target language during generation
model.config.forced_bos_token_id = forced_bos_token_id

print(f"Model is running on: {next(model.parameters()).device}")

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# ---- 6) Metrics ----
def postprocess_text(preds, labels):
    preds = [p.strip() for p in preds]
    labels = [l.strip() for l in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # Replace -100 with pad token for decoding
    labels = [[(t if t != -100 else tokenizer.pad_token_id) for t in seq] for seq in labels]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = sacrebleu.corpus_bleu(decoded_preds, [decoded_labels]).score
    chrf = sacrebleu.corpus_chrf(decoded_preds, [decoded_labels]).score
    return {"bleu": bleu, "chrf": chrf}

# ---- 7) Training args ----
args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    predict_with_generate=True,
    fp16=True,                       # Enables mixed precision (Crucial for CUDA)
    dataloader_num_workers=2,        # Uses CPU threads to feed the GPU faster
    dataloader_pin_memory=True,      # Speeds up data transfer from CPU to GPU
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Saved to:", OUTPUT_DIR)

  trainer = Seq2SeqTrainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


Model is running on: cuda:0


Step,Training Loss,Validation Loss,Bleu,Chrf
500,12.4531,12.453125,0.0,0.0


AttributeError: Can't pickle local object 'add_hook_to_module.<locals>.new_forward'

In [15]:
import evaluate
import numpy as np

bleu_metric = evaluate.load("sacrebleu")
chrf_metric = evaluate.load("chrf")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip() for label in labels]] # BLEU expects list of lists

    bleu_results = bleu_metric(predictions=decoded_preds, references=decoded_labels)
    chrf_results = chrf_metric(predictions=decoded_preds, references=decoded_labels)

    return {"bleu": bleu_results, "chrf": chrf_results}


C:\Users\rober\anaconda3\envs\model-fine-tuning\python.exe
2.2.2
12.1
True
NVIDIA GeForce RTX 4070 SUPER
