# Setup



In [2]:
import os
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, T5ForConditionalGeneration
import torch
import gc
import accelerate

import logging
logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
from huggingface_hub import login

models_path = r"../Models"
code_path = r"../Code"
data_path = r"../Dataset"

In [3]:
file = (os.path.join(code_path,r'HF_key.tex'))
with open(file) as f:
    lines = f.readlines()
    f.close()
    
HF_key = lines[0].split("= ")[1]

try:
    login(token=HF_key)
    print("Successfully logged in to Hugging Face Hub.")
except Exception as e:
    print(f"Failed to login to Hugging Face Hub: {e}")

Successfully logged in to Hugging Face Hub.


# Custom Function



In [4]:
def clear_pipeline(pipe, verbosity=0):
    """Clears a Hugging Face pipeline and frees CUDA memory."""
    if hasattr(pipe, "model") and next(pipe.model.parameters()).is_cuda:
        initial_allocated = torch.cuda.memory_allocated() / 1e6
        initial_reserved = torch.cuda.memory_reserved() / 1e6

        if verbosity > 0:
            print(f"🔍 Before unloading: {initial_allocated:.2f} MB allocated, {initial_reserved:.2f} MB reserved.")

        try:
            pipe.model.to("cpu")
            for param in pipe.model.parameters():
                param.data = param.data.cpu()
        except Exception as e:
            if verbosity > 0:
                print(f"⚠️ Error moving model to CPU: {e}")

        del pipe.model
        del pipe
        gc.collect()
        torch.cuda.empty_cache()

        final_allocated = torch.cuda.memory_allocated() / 1e6
        final_reserved = torch.cuda.memory_reserved() / 1e6

        if verbosity > 0:
            print(f"✅ Pipeline cleared. Freed {initial_allocated - final_allocated:.2f} MB allocated, "
                  f"{initial_reserved - final_reserved:.2f} MB reserved.")
    else:
        if verbosity > 0:
            print("ℹ️ Pipeline already on CPU. Performing standard cleanup.")
        del pipe
        gc.collect()

    if verbosity > 0:
        print("🗑️ Cleanup complete.")
    elif verbosity == 0:
        print("✅ Pipeline cleared.")

# Text to be translated



In [5]:
hm_text = "Koj puas mob taub hau?"
en_text = "Do you have a headache?"

# Original t5\-base\-en2vi vs. Fine\-Tuned Models



In [39]:
checkpoint = "trungnguyentran/t5-base-en2vi"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

inputs = ['Koj puas mob taub hau?']

outputs = model.generate(tokenizer(inputs, return_tensors="pt", padding=True).input_ids, max_length=512)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Koj puas mob taub hau?']

In [42]:
checkpoint = "trungnguyentran/t5-base-en2vi"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

inputs = ['Do you have a headache?']

outputs = model.generate(tokenizer(inputs, return_tensors="pt", padding=True).input_ids, max_length=512)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Bạn có bị đau đầu không?']

In [43]:
translation_pipeline = pipeline("translation", model="trungnguyentran/t5-base-en2vi")
translation_result = translation_pipeline(hm_text, max_length=300)
print(translation_result[0]['translation_text'])

clear_pipeline(translation_pipeline)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Koj puas mob taub hau?
✅ Pipeline cleared.


## Fine\-tuned



In [47]:
checkpoint = "shpie/t5-base-en2vi-finetuned-Hmong-to-English"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

inputs = ['Koj puas mob taub hau?']

outputs = model.generate(tokenizer(inputs, return_tensors="pt", padding=True).input_ids, max_length=512)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['sv taub hau']

In [45]:
translation_pipeline = pipeline("translation", model="shpie/t5-base-en2vi-finetuned-Hmong-to-English")
translation_result = translation_pipeline(hm_text, max_length=300)
print(translation_result[0]['translation_text'])

clear_pipeline(translation_pipeline)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


sv taub hau
✅ Pipeline cleared.


In [48]:
checkpoint = "shpie/t5-base-en2vi-finetuned-English-to-Hmong"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

inputs = ['Do you have a headache?']

outputs = model.generate(tokenizer(inputs, return_tensors="pt", padding=True).input_ids, max_length=512)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['o you you a teevneeg']

In [49]:
translation_pipeline = pipeline("translation", model="shpie/t5-base-en2vi-finetuned-English-to-Hmong")
translation_result = translation_pipeline(en_text, max_length=300)
print(translation_result[0]['translation_text'])

clear_pipeline(translation_pipeline)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


o you you a teevneeg
✅ Pipeline cleared.


# Original ByT5\-small vs. Fine\-Tuned Models



In [5]:
model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')

input_ids = torch.tensor([list("I like to eat food.".encode("utf-8"))]) + 3  # add 3 for special tokens
labels = torch.tensor([list("Kuv nyiam noj mov.".encode("utf-8"))]) + 3  # add 3 for special tokens

loss = model(input_ids, labels=labels).loss # forward pass

print(loss)

tensor(5.8959, grad_fn=<NllLossBackward0>)


In [7]:
model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')

input_ids = torch.tensor([list("Do you have a headache?".encode("utf-8"))]) + 3  # add 3 for special tokens
labels = torch.tensor([list("Koj puas mob taub hau?".encode("utf-8"))]) + 3  # add 3 for special tokens

loss = model(input_ids, labels=labels).loss # forward pass

print(loss)

tensor(5.0459, grad_fn=<NllLossBackward0>)


In [6]:
model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')

input_ids = torch.tensor([list("Kuv nyiam noj mov.".encode("utf-8"))]) + 3  # add 3 for special tokens
labels = torch.tensor([list("I like to eat food.".encode("utf-8"))]) + 3  # add 3 for special tokens

loss = model(input_ids, labels=labels).loss # forward pass

print(loss)

tensor(4.4951, grad_fn=<NllLossBackward0>)


In [8]:
model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')

input_ids = torch.tensor([list("Koj puas mob taub hau?".encode("utf-8"))]) + 3  # add 3 for special tokens
labels = torch.tensor([list("Do you have a headache?".encode("utf-8"))]) + 3  # add 3 for special tokens

loss = model(input_ids, labels=labels).loss # forward pass

print(loss)

tensor(4.0100, grad_fn=<NllLossBackward0>)


## Fine\-Tuned



In [52]:
model = T5ForConditionalGeneration.from_pretrained('shpie/byt5-small-finetuned-Hmong-to-English')

input_ids = torch.tensor([list("Kuv nyiam noj mov.".encode("utf-8"))]) + 3  # add 3 for special tokens
labels = torch.tensor([list("I like to eat food.".encode("utf-8"))]) + 3  # add 3 for special tokens

loss = model(input_ids, labels=labels).loss # forward pass

print(loss)

tensor(4.4951, grad_fn=<NllLossBackward0>)


In [9]:
model = T5ForConditionalGeneration.from_pretrained('shpie/byt5-small-finetuned-Hmong-to-English')

input_ids = torch.tensor([list("Koj puas mob taub hau?".encode("utf-8"))]) + 3  # add 3 for special tokens
labels = torch.tensor([list("Do you have a headache?".encode("utf-8"))]) + 3  # add 3 for special tokens

loss = model(input_ids, labels=labels).loss # forward pass

print(loss)

tensor(4.0100, grad_fn=<NllLossBackward0>)


In [6]:
model = T5ForConditionalGeneration.from_pretrained('shpie/byt5-small-finetuned-Hmong-to-English')

input_ids = torch.tensor([list("I like to eat food.".encode("utf-8"))]) + 3  # add 3 for special tokens
labels = torch.tensor([list("Kuv nyiam noj mov.".encode("utf-8"))]) + 3  # add 3 for special tokens

loss = model(input_ids, labels=labels).loss # forward pass

print(loss)

tensor(5.8959, grad_fn=<NllLossBackward0>)


In [50]:
translation_pipeline = pipeline("translation", model="shpie/byt5-small-finetuned-Hmong-to-English", tokenizer=AutoTokenizer.from_pretrained("google/byt5-small"))
translation_result = translation_pipeline(hm_text, max_length=300)
print(translation_result[0]['translation_text'])

clear_pipeline(translation_pipeline)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


 taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
Taub hau?
 tau
✅ Pipeline cleared.


In [7]:
model = T5ForConditionalGeneration.from_pretrained('shpie/byt5-small-finetuned-English-to-Hmong')

input_ids = torch.tensor([list("I like to eat food.".encode("utf-8"))]) + 3  # add 3 for special tokens
labels = torch.tensor([list("Kuv nyiam noj mov.".encode("utf-8"))]) + 3  # add 3 for special tokens

loss = model(input_ids, labels=labels).loss # forward pass

print(loss)

tensor(5.8959, grad_fn=<NllLossBackward0>)


In [11]:
model = T5ForConditionalGeneration.from_pretrained('shpie/byt5-small-finetuned-English-to-Hmong')

input_ids = torch.tensor([list("Do you have a headache?".encode("utf-8"))]) + 3  # add 3 for special tokens
labels = torch.tensor([list("Koj puas mob taub hau?".encode("utf-8"))]) + 3  # add 3 for special tokens

loss = model(input_ids, labels=labels).loss # forward pass

print(loss)

tensor(5.0459, grad_fn=<NllLossBackward0>)


In [8]:
model = T5ForConditionalGeneration.from_pretrained('shpie/byt5-small-finetuned-English-to-Hmong')

input_ids = torch.tensor([list("Kuv nyiam noj mov.".encode("utf-8"))]) + 3  # add 3 for special tokens
labels = torch.tensor([list("I like to eat food.".encode("utf-8"))]) + 3  # add 3 for special tokens

loss = model(input_ids, labels=labels).loss # forward pass

print(loss)

tensor(4.4951, grad_fn=<NllLossBackward0>)


In [51]:
translation_pipeline = pipeline("translation", model="shpie/byt5-small-finetuned-English-to-Hmong", tokenizer=AutoTokenizer.from_pretrained("google/byt5-small"))
translation_result = translation_pipeline(en_text, max_length=300)
print(translation_result[0]['translation_text'])

clear_pipeline(translation_pipeline)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


ave a headache?
Headache?
Headache?
Headache?
Headache?
Headache?
Headache?
Headache?
Headache?
Headache?
Headache?
Headache?
Headache?
Headache?
Headache?
Headache?
Headache?
Headache?
✅ Pipeline cleared.
