# Setup



In [4]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import sys

import torch
from torch import nn
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import evaluate
metric=evaluate.load("sacrebleu")

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer


import logging
logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
from huggingface_hub import login

models_path = r"../Models"
code_path = r"../Code"
data_path = r"../Dataset"

In [5]:
file = (os.path.join(code_path,r'HF_key.tex'))
with open(file) as f:
    lines = f.readlines()
    f.close()
    
HF_key = lines[0].split("= ")[1]

try:
    login(token=HF_key)
    print("Successfully logged in to Hugging Face Hub.")
except Exception as e:
    print(f"Failed to login to Hugging Face Hub: {e}")

Successfully logged in to Hugging Face Hub.


# Load Dataset



In [6]:
dataset = load_dataset("shpie/Hmong-to-Eng-4k")

# Tokenize



In [7]:
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [5]:
def tokenize(batch):
    return tokenizer(batch, padding=True, truncation=True)

source_lang = "Hmong"
target_lang = "English"
max_target_length=128
max_input_length=128

def preprocess_function(examples):
    inputs = [ex for ex in examples[source_lang]]
    targets = [ex for ex in examples[target_lang]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['Hmong', 'English'],
)

In [6]:
#Testing tokenizer on inputs
h_sentence = dataset["train"]["Hmong"][1]
eng_sentence = dataset["train"]["English"][1]

inputs = tokenizer(h_sentence, text_target=eng_sentence)
inputs

{'input_ids': [92, 120, 100, 121, 35, 119, 118, 120, 112, 35, 118, 100, 120, 35, 113, 119, 100, 122, 121, 35, 116, 107, 108, 100, 35, 102, 114, 121, 35, 113, 104, 104, 106, 35, 112, 114, 101, 35, 119, 123, 114, 106, 35, 110, 104, 121, 35, 113, 115, 100, 109, 35, 117, 107, 114, 35, 119, 100, 122, 112, 35, 111, 114, 118, 35, 118, 108, 118, 35, 107, 111, 114, 114, 121, 35, 102, 107, 100, 122, 35, 119, 107, 108, 100, 101, 35, 118, 104, 101, 35, 121, 108, 112, 35, 111, 108, 35, 102, 100, 118, 119, 35, 118, 108, 118, 35, 115, 120, 101, 35, 103, 107, 100, 120, 35, 111, 108, 35, 54, 51, 35, 107, 113, 120, 101, 35, 120, 100, 35, 113, 119, 104, 109, 35, 124, 120, 100, 121, 35, 117, 107, 114, 35, 119, 100, 122, 112, 35, 113, 119, 100, 122, 112, 35, 111, 120, 101, 35, 119, 118, 104, 121, 35, 119, 120, 35, 113, 104, 104, 106, 35, 111, 100, 120, 118, 35, 111, 114, 118, 35, 118, 108, 118, 35, 119, 118, 104, 121, 35, 113, 124, 114, 101, 35, 112, 120, 100, 109, 35, 110, 104, 121, 35, 119, 120, 35, 119,

In [7]:
output = tokenizer.decode(inputs['input_ids'])
output

'Yuav tsum sau ntawv qhia cov neeg mob txog kev npaj rho tawm los sis hloov chaw thiab seb vim li cast sis pub dhau li 30 hnub ua ntej yuav rho tawm ntawm lub tsev tu neeg laus los sis tsev nyob muaj kev tu thiab xya hnub ua ntej yuav hloov mus rau lwm lub chav nyob nyob rau lub tsev tu neeg laus los sis tsev nyob muaj kev tu</s>'

# Model 3



In [8]:
#Call for the model
model_checkpoint = "google/byt5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [10]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Decode token IDs to text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 (used to mask loss calculation) with padding token ID for decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Clean up spacing and format references properly
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Compute BLEU score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Track average length of generated sequences
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    # Round values for readability
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [11]:
batch_size = 16
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    eval_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    remove_unused_columns=True,
    save_total_limit=3,
    gradient_accumulation_steps=6,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [12]:
m3_results = trainer.train()

  return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())


Epoch,Training Loss,Validation Loss


  return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())


In [13]:
trainer.evaluate()

{'eval_loss': nan,
 'eval_bleu': 0.0222,
 'eval_gen_len': 19.0,
 'eval_runtime': 39.4921,
 'eval_samples_per_second': 24.537,
 'eval_steps_per_second': 1.545,
 'epoch': 0.9917355371900827}

# Model 4



In [14]:
source_lang = "English"
target_lang = "Hmong"
max_target_length=128
max_input_length=128

def preprocess_function(examples):
    inputs = [ex for ex in examples[source_lang]]
    targets = [ex for ex in examples[target_lang]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['Hmong', 'English'],
)

In [15]:
#Testing tokenizer on inputs
h_sentence = dataset["train"]["Hmong"][1]
eng_sentence = dataset["train"]["English"][1]

inputs = tokenizer(eng_sentence, text_target=h_sentence)
inputs

output = tokenizer.decode(inputs['input_ids'])
output

'Residents must be notified, in writing, of the proposed discharge or transfer and its justification no later than 30 days before discharge from the nursing or boarding care home and seven days before transfer to another room within the nursing or boarding care home</s>'

In [16]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
batch_size = 16
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    eval_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    remove_unused_columns=True,
    save_total_limit=3,
    gradient_accumulation_steps=6,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [18]:
m4_results = trainer.train()
trainer.evaluate()

  return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())


Epoch,Training Loss,Validation Loss


  return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())


{'eval_loss': nan,
 'eval_bleu': 0.0203,
 'eval_gen_len': 19.0,
 'eval_runtime': 41.4497,
 'eval_samples_per_second': 23.378,
 'eval_steps_per_second': 1.472,
 'epoch': 0.9917355371900827}