In [None]:
# Show the status of the NVIDIA GPU(s) visible to this runtime.
!nvidia-smi

# Processing Dataset
Load the dataset from the Google Drive folder and preprocess it into a suitable format.

P.S. Note that we replace each `\n` character with `\x80`, because the default NLLB tokenizer destroys the `\n` token.

In [None]:

# Install the third-party packages this notebook imports (quiet mode).
# NOTE(review): no versions are pinned, so a fresh runtime gets whatever is latest — consider pinning.
!pip install -q datasets
!pip install -q sentencepiece
!pip install -q accelerate bitsandbytes transformers[sentencepiece] peft

from typing import *
import re
import csv
from datasets import Dataset, DatasetDict
from pprint import pprint, pformat
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from collections import defaultdict
from pathlib import Path

# Root folder on the mounted Google Drive; the dataset CSVs and the saved adapter are addressed relative to it.
HOME_DIR = Path("/content/drive/MyDrive/YOUR_HOME_DIR") # Set your home directory here

# Base NLLB checkpoint; the weights are loaded in bfloat16 with device_map="auto".
model_name = "facebook/nllb-200-3.3B"  # Checkpoint used for the real training runs
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16)
# Tokenizer configured with eng_Latn as the source language and tha_Thai as the target language.
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn", tgt_lang="tha_Thai")


# Mount Google Drive so that paths under HOME_DIR resolve.
from google.colab import drive
drive.mount('/content/drive')

# Each entry is (csv_path, source_text_column, target_text_column); ds_generator reads
# every listed file and maps the two named columns to "text" / "text_target".
DATASET_CSV_LIST = [
    (HOME_DIR/"data"/"augmented_out_chunkify_filt30_dedup.csv", "text", "text_target"),
    (HOME_DIR/"data"/"machine_out_chunkify_comet_filt60_dedup.csv","text", "text_target")
] # Select dataset to train the model

def print_chunk(chunk_list):
    """Print each chunk on its own line, followed by a dashed separator line."""
    divider = "-----------------"
    for piece in chunk_list:
        # One print call emits the chunk and its separator on consecutive lines.
        print(piece, divider, sep="\n")

def ds_generator():
    """Yield one ``{"text", "text_target"}`` record per row of every CSV in DATASET_CSV_LIST.

    Each DATASET_CSV_LIST entry is unpacked as (path, source column, target column).
    In both fields, every newline together with the whitespace around it is
    collapsed into a single ``\\x80`` character.
    """
    newline_run = re.compile(r"\s*\n\s*")

    def escape_newlines(raw):
        # Replace each whitespace-padded newline with the \x80 placeholder.
        return newline_run.sub("\x80", raw)

    for csv_path, source_col, target_col in DATASET_CSV_LIST:
        with open(csv_path, "r", encoding = "utf-8") as handle:
            for row in csv.DictReader(handle):
                source_raw, target_raw = row[source_col], row[target_col]
                yield {
                    "text": escape_newlines(source_raw),
                    "text_target": escape_newlines(target_raw),
                }

# Fixed seed so the train/validation split is identical on every run; without it a
# re-run (e.g. after a runtime restart) reshuffles the rows and the validation set
# no longer matches the one earlier checkpoints were scored against.
SPLIT_SEED = 42

# Materialise every CSV row produced by ds_generator into a single Dataset.
train_ds = Dataset.from_generator(ds_generator)
# Hold out 10% of the rows for evaluation.
dataset_dict = train_ds.train_test_split(test_size=0.1, shuffle=True, seed=SPLIT_SEED) # train and eval
val_ds = dataset_dict["test"]
train_ds = dataset_dict["train"]

# Re-pack the two splits under the keys "train" and "val", the names the trainer cell reads.
dataset_dict = DatasetDict()
dataset_dict["train"] = train_ds
dataset_dict["val"] = val_ds
# Round-trip through disk so later cells work from the saved copy.
dataset_dict.save_to_disk("random_dataset")
dataset_dict = DatasetDict.load_from_disk("random_dataset")
print(dataset_dict)
print(dataset_dict["train"][3:7])


In [None]:
# Display a single training example (index 1) as the cell output.
dataset_dict["train"][1]

# Finetuning NLLB
- The script fine-tunes NLLB using several hard-coded parameters.
- To change a parameter, edit the script itself.

In [None]:
import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model


# Print the base model's module structure before any LoRA adapter is attached.
print("Base model", model)

# LoRA settings: rank-16 adapters (alpha 16, dropout 0.1) on the modules named
# q_proj and v_proj; bias parameters are left out of the adapter.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    # modules_to_save=["lm_head"] # this would additionally train the seq2seq head directly, without LoRA
)


def print_trainable_parameters(model):
    """Print parameter counts for ``model``.

    The first line reports the number of trainable parameters, the total number
    of parameters and the trainable percentage. It is followed by one line per
    distinct ``(requires_grad, dtype)`` pair with the number of elements in that
    group, in order of first appearance.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()``, ``requires_grad`` and ``dtype``.

    Returns:
        None. The report is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    param_type_to_number = {}
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
        group = (param.requires_grad, param.dtype)
        param_type_to_number[group] = param_type_to_number.get(group, 0) + count

    # A model without parameters would otherwise raise ZeroDivisionError here.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )
    for k, v in param_type_to_number.items():
        print(k, v)


# Wrap the base model with the LoRA adapters described by lora_config, then
# print the wrapped structure and its parameter counts.
lora_model = get_peft_model(model, lora_config)
print("Lora model:", lora_model)
print_trainable_parameters(lora_model)


# Hyper-parameters for the fine-tuning run; checkpoints are written under ./nllb-finetune.
# NOTE(review): `evaluation_strategy` is the older spelling of this argument — confirm it is
# still accepted by the transformers version that the unpinned install resolves to.
training_arguments = Seq2SeqTrainingArguments(
    output_dir="nllb-finetune",
    num_train_epochs=20,
    evaluation_strategy="steps",
    logging_strategy="steps",
    save_strategy="steps",
    eval_steps=5000,
    logging_steps=500,
    save_steps=5000,
    label_names=[
        "labels"
    ],  # set explicitly so that evaluation notices the batch has labels and reports a loss
    fp16=False,
    # Note that fp16 mixed-precision training is different from converting everything to fp16 and training:
    # it involves gradient scaling and keeping a float32 copy of the model.
    bf16=True,
    seed=42,
    data_seed=42,
    warmup_ratio=0.1,  # the learning rate follows a triangular schedule (warm-up, then decay)
    learning_rate=10e-5,  # peak of the triangle (10e-5 == 1e-4)
    per_device_train_batch_size=3,  # adjust the batch size according to the amount of VRAM you have
    per_device_eval_batch_size=4,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    remove_unused_columns=False,  # keep the raw "text"/"text_target" columns for the data collator
    # torch_compile=True, # don't use torch compile; it is kind of buggy
)


def DatasetWrapper(x):
    """Identity placeholder: hand back ``x`` exactly as it was received."""
    unchanged = x
    return unchanged


def preprocess(item_list):
    """Collate raw dataset rows into one padded, tokenized batch.

    Args:
        item_list: list of rows, each a mapping with the keys ``"text"``
            (source sentence) and ``"text_target"`` (reference translation).

    Returns:
        The tokenizer's batch encoding as PyTorch tensors. Source and target
        are each padded to the longest sequence in the batch, and the padded
        positions of ``labels`` are set to -100.
    """
    texts = [item["text"] for item in item_list]
    text_targets = [item["text_target"] for item in item_list]
    batch = tokenizer(texts, text_target=text_targets, padding=True, return_tensors="pt")

    # The tokenizer pads labels with the pad token id, which the loss would treat as a
    # real target. -100 is the index the loss ignores, so padding no longer
    # contributes to the training loss or to the evaluation loss used to pick the best checkpoint.
    labels = batch["labels"]
    labels[labels == tokenizer.pad_token_id] = -100
    return batch


# Trainer over the LoRA-wrapped model. `preprocess` serves as the data collator, so
# tokenization happens per batch on the raw "text"/"text_target" rows.
trainer = Seq2SeqTrainer(
    lora_model,
    args=training_arguments,
    data_collator=preprocess,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["val"],
)

In [None]:
# Sanity check: display the label names the trainer is using.
trainer.label_names

In [None]:
# Load the TensorBoard notebook extension and open it on nllb-finetune/runs
# (presumably where the trainer writes its logs, given output_dir="nllb-finetune" — confirm).
%load_ext tensorboard
%tensorboard --logdir nllb-finetune/runs

In [None]:
# Evaluate on the validation split; this cell precedes training, so when run in order it gives the pre-fine-tuning baseline.
trainer.evaluate(dataset_dict["val"])

In [None]:
# Run the fine-tuning loop configured in training_arguments.
trainer.train()
# trainer.evaluate(dataset_dict["val"])

In [None]:
# Copy the chosen checkpoint directory to ./nllb_finetuned_augmented_filt.
# The step number (checkpoint-50000) is hard-coded: change it to the checkpoint you want to keep.
!cp -r ./nllb-finetune/checkpoint-50000 ./nllb_finetuned_augmented_filt

# Test inference and model-loading code

In [None]:
import torch
from peft import PeftModel

# Attach the saved adapter stored at HOME_DIR/"nllb_finetuned_mach_2" to the base model.
# NOTE(review): the copy cell earlier writes to ./nllb_finetuned_augmented_filt, a different
# name and location — confirm the adapter was moved or renamed into HOME_DIR.
# NOTE(review): if the fine-tuning cells ran in this session, `model` was already passed to
# get_peft_model; presumably this cell is meant for a fresh runtime — verify.
lora_model = PeftModel.from_pretrained(model, HOME_DIR / "nllb_finetuned_mach_2")

print(lora_model)

In [None]:
# Sample English passage used as the input for the translation smoke test (note the leading newline).
article = """
An increase in phosphate levels stimulates the release of parathyroid hormone (PTH) due to the body's need to maintain a balance between calcium and phosphate levels in the blood. According to the Thai medical guidelines, the primary function of PTH is to regulate calcium and phosphate homeostasis."""


def raw_translate_en_to_th(text: str, debug=True):
    """Translate ``text`` into Thai with ``lora_model`` and return the decoded string.

    The text is tokenized as given: no newline escaping is applied here, and
    the output is not post-processed beyond dropping special tokens.

    Args:
        text: source text to translate.
        debug: when true, print the input, the tokenized tensors, the generated
            token ids and the decoded translation.

    Returns:
        The decoded translation of the first (only) sequence in the batch.
    """
    # Follow the model's own placement instead of assuming "cuda:0", so the call
    # also works when the weights were put on another device.
    device = next(lora_model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    finetune_translated_tokens = lora_model.generate(
        **inputs,
        # `lang_code_to_id` is deprecated and missing from newer tokenizers; the
        # language code is an ordinary vocabulary token, so look it up directly.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("tha_Thai"),
        # Low-temperature sampling: output is close to greedy but not deterministic.
        temperature=0.1,
        do_sample=True,
    )
    finetune_out = tokenizer.batch_decode(
        finetune_translated_tokens, skip_special_tokens=True
    )[0]
    if debug:
        print("Text:", text)
        print("Tokenized:", inputs)
        print("Finetune Translated tokens:", finetune_translated_tokens)
        print("Finetune Translated:", finetune_out)
        print("---------------------------")
    return finetune_out


# Smoke test: translate the sample passage (debug output is printed by default).
raw_translate_en_to_th(article)