# Reference

This code is roughly based on the [Hugging Face translation tutorial](https://huggingface.co/docs/transformers/en/tasks/translation).

In [None]:
!pip install evaluate sacrebleu -q

In [None]:
import torch
import json
from tqdm.notebook import trange, tqdm
from torch.utils.data import Dataset, DataLoader
import evaluate
import numpy as np
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelWithLMHead
from transformers import T5ForConditionalGeneration, T5TokenizerFast, GenerationConfig

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Mount Google Drive under /content/drive (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

# Switch into the project folder on Drive and list its contents. The relative
# checkpoint and output directories used later in the notebook resolve against it.
%cd "/content/drive/MyDrive/Graduate School/NLP/Code"
!ls

In [None]:
# Load the English-French OPUS Books corpus and cut a seeded 80/20 train/test split
# out of its "train" split.
books = load_dataset("opus_books", "en-fr")["train"].train_test_split(seed=42, test_size=0.2)

# Show the first three translation pairs as a sanity check.
books["train"][:3]["translation"]

```py
from transformers import AutoTokenizer, AutoModelWithLMHead

checkpoint_3b = "google-t5/t5-3b"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_3b)
model = AutoModelWithLMHead.from_pretrained(checkpoint_3b)
```

Or we can use t5-large, which is a 770M-parameter model:
https://huggingface.co/google-t5/t5-large

```py
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint_large = "google-t5/t5-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_large)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_large)
```

In [None]:
# Size of the T5 model to use for the whole notebook: "small", "large" or "3b".
model_name = "small"

# Supported sizes and the Hugging Face checkpoint each one maps to.
_T5_CHECKPOINTS = {
  "small": "google-t5/t5-small",
  "large": "google-t5/t5-large",
  "3b": "google-t5/t5-3b",
}

# Fail loudly on an unknown size instead of leaving `model_type` as None and
# `tokenizer` undefined, which only surfaces as a confusing error cells later.
if model_name.lower() not in _T5_CHECKPOINTS:
  raise ValueError(
    f"Unknown model_name {model_name!r}; expected one of {sorted(_T5_CHECKPOINTS)}"
  )

checkpoint = _T5_CHECKPOINTS[model_name.lower()]

# Every size uses the seq2seq auto class. The deprecated AutoModelWithLMHead that
# was used for the 3B checkpoint resolves T5 configs to the same conditional-generation model.
model_type = AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def _tokenize_translation_pairs(examples, source_lang, target_lang, prefix, max_length=128):
    """Tokenize a batch of translation pairs for one translation direction.

    Args:
        examples: Batched rows with a "translation" column; each entry is a dict
            keyed by language code (e.g. {"en": ..., "fr": ...}).
        source_lang: Language code of the text fed to the model.
        target_lang: Language code of the text used as the label.
        prefix: Task prefix prepended to every source sentence.
        max_length: Truncation length applied to both inputs and targets.

    Returns:
        Whatever the notebook-level `tokenizer` returns when called with the
        prefixed inputs and `text_target=targets`.
    """
    pairs = examples["translation"]
    inputs = [prefix + pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    return tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True)


def preprocess_function_en_to_fr(examples):
    """Tokenize a batch for English -> French translation (see `_tokenize_translation_pairs`)."""
    return _tokenize_translation_pairs(
        examples, source_lang="en", target_lang="fr", prefix="translate English to French: "
    )


def preprocess_function_fr_to_en(examples):
    """Tokenize a batch for French -> English translation (see `_tokenize_translation_pairs`)."""
    return _tokenize_translation_pairs(
        examples, source_lang="fr", target_lang="en", prefix="translate French to English: "
    )

In [None]:
# Tokenize every split once per translation direction (batched for speed).
tokenized_books_en_to_fr = books.map(function=preprocess_function_en_to_fr, batched=True)
tokenized_books_fr_to_en = books.map(function=preprocess_function_fr_to_en, batched=True)

In [None]:
# Batch collator shared by every Trainer and DataLoader below.
# NOTE(review): `model` receives the checkpoint *name* (a str set in the model-selection
# cell), not a loaded model object -- confirm whether the collator needs the real model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
# SacreBLEU scorer used by compute_metrics.
metric = evaluate.load("sacrebleu")

In [None]:
def postprocess_text(preds, labels):
    """Trim whitespace from decoded texts and wrap each label in its own list.

    Returns a `(preds, labels)` pair where `preds` is a flat list of stripped
    strings and `labels` is a list of one-element lists (one reference per prediction).
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Turn generated token ids and label ids into BLEU and mean generation length.

    Args:
        eval_preds: A `(predictions, labels)` pair of token-id arrays. When the
            predictions arrive as a tuple, only its first element is used.

    Returns:
        A dict with "bleu" (the metric's "score") and "gen_len" (mean count of
        non-pad tokens per prediction), both rounded to 4 decimals.
    """
    predictions, references = eval_preds
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # Negative ids cannot be decoded, so map them to the pad token first.
    predictions = np.where(predictions < 0, pad_id, predictions)
    pred_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # -100 marks label positions that must be ignored; swap it for the pad token before decoding.
    references = np.where(references != -100, references, pad_id)
    ref_texts = tokenizer.batch_decode(references, skip_special_tokens=True)

    pred_texts, ref_texts = postprocess_text(pred_texts, ref_texts)

    bleu = metric.compute(predictions=pred_texts, references=ref_texts)["score"]
    token_counts = [np.count_nonzero(row != pad_id) for row in predictions]

    scores = {"bleu": bleu, "gen_len": np.mean(token_counts)}
    return {key: round(value, 4) for key, value in scores.items()}

In [None]:
# model = model_type.from_pretrained(checkpoint)

In [None]:
def finetune_model(local_model, name: str, num_epochs: int, tokenized_dataset, max_len=32, train_batch=16, eval_batch=16, learning_rate = 2e-5):
    """Fine-tune `local_model` with a Seq2SeqTrainer, evaluating after every epoch.

    Args:
        local_model: Model to train (updated in place by the trainer).
        name: Output directory for checkpoints.
        num_epochs: Number of training epochs.
        tokenized_dataset: Mapping with "train" and "test" tokenized splits.
        max_len: Maximum generation length used during evaluation.
        train_batch: Per-device training batch size.
        eval_batch: Per-device evaluation batch size.
        learning_rate: Optimizer learning rate.

    Relies on the notebook-level `tokenizer`, `data_collator` and `compute_metrics`.
    """
    local_training_args = Seq2SeqTrainingArguments(
        output_dir=name,
        eval_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=train_batch,
        per_device_eval_batch_size=eval_batch,
        generation_max_length=max_len, # can reduce this for faster training, but worse performance (min = 20)
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=num_epochs,
        predict_with_generate=True,
        # Mixed precision needs a CUDA device. The notebook falls back to the CPU when
        # none is available, so only enable fp16 when CUDA is present.
        fp16=torch.cuda.is_available(), #change to bf16=True for XPU
        push_to_hub=False,
    )

    local_trainer = Seq2SeqTrainer(
        model=local_model,
        args=local_training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    local_trainer.train()

# Training en to fr base

In [None]:
# Re-resolve the checkpoint for the selected size (an unrecognised size keeps the
# current `checkpoint`), then load fresh pretrained weights onto the active device
# for the EN -> FR baseline.
checkpoint = {
  "small": "google-t5/t5-small",
  "large": "google-t5/t5-large",
  "3b": "google-t5/t5-3b",
}.get(model_name.lower(), checkpoint)

model_en_to_fr = model_type.from_pretrained(checkpoint).to(device)

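W&B token: **redacted**. Do not store API keys in the notebook; supply the token through the `WANDB_API_KEY` environment variable or `wandb login`, and revoke the key that was previously written here.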


In [None]:
# Two-epoch EN -> FR baseline fine-tune; checkpoints go to "<model_name>_en_to_fr_base".
finetune_model(
    local_model=model_en_to_fr,
    name=f"{model_name}_en_to_fr_base",
    num_epochs=2,
    tokenized_dataset=tokenized_books_en_to_fr,
    train_batch=32,
    eval_batch=32,
)

# Training fr to en base

In [None]:
# Re-resolve the checkpoint for the selected size (an unrecognised size keeps the
# current `checkpoint`), then load fresh pretrained weights for the FR -> EN baseline.
checkpoint = {
  "small": "google-t5/t5-small",
  "large": "google-t5/t5-large",
  "3b": "google-t5/t5-3b",
}.get(model_name.lower(), checkpoint)

model_fr_to_en = model_type.from_pretrained(checkpoint)

In [None]:
# One-epoch FR -> EN baseline fine-tune. The run directory is keyed by the model
# *size* (`model_name`), as in the EN -> FR run. Interpolating `model_type` would
# put the repr of a Python class into the directory name.
finetune_model(model_fr_to_en, f"{model_name}_fr_to_en_base", 1, tokenized_books_fr_to_en)

#  Recursive training

### Loading in Models

In [None]:
# Saved Trainer checkpoint folders to restore, one per translation direction.
en_to_fr_checkpoint = "checkpoint-6355"
fr_to_en_checkpoint = "checkpoint-19065"

# Reload both fine-tuned models from disk and move them to the active device.
model_en_to_fr = T5ForConditionalGeneration.from_pretrained(
    f"t5_en_to_fr/{en_to_fr_checkpoint}/"
).to(device)
model_fr_to_en = T5ForConditionalGeneration.from_pretrained(
    f"t5_fr_to_en_final/{fr_to_en_checkpoint}/"
).to(device)

### Create Smaller Test and Train Datasets

In [None]:
# Seed NumPy's global RNG so the reduced test subset is the same on every run.
np.random.seed(42)

test_dataset_len = tokenized_books_en_to_fr["test"].num_rows
# Number of test rows kept for the (faster) evaluation subset.
test_smaller_size = 2500

# Shuffle all test row indices in place, then keep the first `test_smaller_size` of them.
test_dataset_indicies = np.arange(test_dataset_len)
np.random.shuffle(test_dataset_indicies)

start_indicies = test_dataset_indicies[:test_smaller_size]

reduced_tokenized_test = tokenized_books_en_to_fr["test"].select(start_indicies)

# Drop the columns inherited from the raw `books` dataset so only the tokenizer outputs
# remain (the original author notes this avoids otherwise unexplained errors).
test_tokenized_dataset = reduced_tokenized_test.remove_columns(books["test"].column_names)
complete_test_dataset = tokenized_books_en_to_fr["test"].remove_columns(books["test"].column_names)

In [None]:
# Re-seed NumPy's global RNG so the reduced training subset is the same on every run.
np.random.seed(84)

train_dataset_len = tokenized_books_en_to_fr["train"].num_rows
# Size of the initial training subset; also the default `train_size` of recursive_train.
train_smaller_size = 10000 # 10,000

# Shuffle all train row indices in place. recursive_train later reads this shuffled
# array (past the first `train_smaller_size` entries) to pull additional real rows.
train_dataset_indicies = np.arange(train_dataset_len)
np.random.shuffle(train_dataset_indicies)

start_indicies = train_dataset_indicies[:train_smaller_size]

reduced_tokenized_train = tokenized_books_en_to_fr["train"].select(start_indicies)

# Drop the columns inherited from the raw `books` dataset so only the tokenizer outputs
# remain (the original author notes this avoids otherwise unexplained errors).
train_tokenized_dataset = reduced_tokenized_train.remove_columns(books["train"].column_names)
# Full tokenized training split; recursive_train selects extra real rows from it.
complete_train_dataset = tokenized_books_en_to_fr["train"].remove_columns(books["train"].column_names)

### One Recursive Train Loop

In [None]:
def _generate_translations(model, dataset, batch_size, desc):
    """Run `model.generate` over `dataset` in batches and return the decoded strings in row order."""
    loader = DataLoader(dataset, batch_size = batch_size, collate_fn = data_collator)
    sentences = []

    for batch in tqdm(loader, total=len(loader), desc = desc):
        with torch.no_grad():
            generated = model.generate(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                max_length=128,  # Maximum length for generation
                num_return_sequences=1  # Number of sequences to return per input
            ).cpu()

        sentences.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))

        # Release the batch tensors before the next step to keep GPU memory flat.
        del generated, batch
        torch.cuda.empty_cache()

    return sentences


def get_recursive_data(train_dataset, model_forward = None, model_reverse = None, batch_size = 512):
    """Build a synthetic EN->FR training set by round-trip translation.

    The forward model translates the (EN) inputs of `train_dataset` to French, and
    the reverse model translates that French back to English. The resulting
    (back-translated English, generated French) pairs are tokenized with
    `preprocess_function_en_to_fr`.

    Args:
        train_dataset: Tokenized dataset whose rows can be batched by `data_collator`.
        model_forward: EN->FR model. Defaults to the notebook-level `model_en_to_fr`,
            looked up at call time so a later reassignment of that global is honoured.
        model_reverse: FR->EN model. Defaults to the notebook-level `model_fr_to_en`,
            looked up at call time.
        batch_size: Inference batch size for both passes.

    Returns:
        A tokenized dataset of synthetic pairs with the "translation" column removed.

    Raises:
        ValueError: If `train_dataset` is empty.
    """
    if model_forward is None:
        model_forward = model_en_to_fr
    if model_reverse is None:
        model_reverse = model_fr_to_en

    if len(train_dataset) == 0:
        raise ValueError("train_dataset is empty; there is nothing to back-translate")

    torch.cuda.empty_cache()

    # Generate with dropout disabled regardless of the mode the caller left the models in.
    model_forward.eval()
    model_reverse.eval()

    french_sentences = _generate_translations(
        model_forward, train_dataset, batch_size, "Forward (EN to FR) Inference Pass"
    )

    # Tokenize the generated French once, unpadded; the collator pads each batch when
    # the reverse pass iterates over it.
    reverse_inputs = ["translate French to English: " + sentence for sentence in french_sentences]
    encoded = tokenizer(reverse_inputs, max_length=128, truncation=True)
    reverse_dataset = Dataset.from_dict({
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
    })

    english_sentences = _generate_translations(
        model_reverse, reverse_dataset, batch_size, "Reverse (FR to EN) Inference Pass"
    )

    synthetic_data = {
        "translation": [
            {"en": eng, "fr": fr} for eng, fr in zip(english_sentences, french_sentences)
        ]
    }

    synthetic_dataset = Dataset.from_dict(synthetic_data)
    synthetic_dataset = synthetic_dataset.map(preprocess_function_en_to_fr, batched=True)
    return synthetic_dataset.remove_columns("translation")

# new_syn_dataset = get_recursive_data(train_tokenized_dataset)

### Train Model for Recursive Loop

In [None]:
def rec_finetune_model(local_model, name: str, num_epochs: int, train_data, test_data):
    """Fine-tune `local_model` for one recursive-training round, without in-training evaluation.

    Args:
        local_model: Model to train (updated in place by the trainer).
        name: Output directory for checkpoints.
        num_epochs: Number of training epochs.
        train_data: Tokenized training dataset.
        test_data: Tokenized evaluation dataset handed to the trainer
            (`eval_strategy` is "no", so it is not scored during training).

    Relies on the notebook-level `tokenizer`, `data_collator` and `compute_metrics`.
    """
    local_training_args = Seq2SeqTrainingArguments(
        output_dir=name,
        eval_strategy="no",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        generation_max_length=32,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=num_epochs,
        predict_with_generate=True,
        # Mixed precision needs a CUDA device. The notebook falls back to the CPU when
        # none is available, so only enable fp16 when CUDA is present.
        fp16=torch.cuda.is_available(), #change to bf16=True for XPU
        push_to_hub=False,
    )

    local_trainer = Seq2SeqTrainer(
        model=local_model,
        args=local_training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    local_trainer.train()

    torch.cuda.empty_cache()

### Validation Loop To Get Bleu Score

In [None]:
def eval_loop(local_model, test_data, output_dir = "temp_results", batch_size = 16, max_len = 64):
    """Evaluate `local_model` on `test_data` with generation and return the metric dict.

    Args:
        local_model: Model to evaluate.
        test_data: Tokenized evaluation dataset.
        output_dir: Directory passed to the training arguments.
        batch_size: Per-device evaluation batch size.
        max_len: Maximum generation length.

    Returns:
        The dict returned by `Seq2SeqTrainer.evaluate()`.

    Relies on the notebook-level `tokenizer`, `data_collator` and `compute_metrics`.
    """
    test_training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        per_device_eval_batch_size=batch_size,
        generation_max_length=max_len,
        save_total_limit=3,
        predict_with_generate=True,
        # Mixed precision needs a CUDA device. The notebook falls back to the CPU when
        # none is available, so only enable fp16 when CUDA is present.
        fp16=torch.cuda.is_available(), #change to bf16=True for XPU
        push_to_hub=False,
    )

    test_trainer = Seq2SeqTrainer(
        model=local_model,
        args=test_training_args,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    eval_results = test_trainer.evaluate()

    torch.cuda.empty_cache()

    return eval_results

### Recursive Train Loop

In [None]:
def recursive_train(model, validate_model, train_data, test_data, num_iters, augmentation_ratio = 0.0, train_size = train_smaller_size):
    """Repeatedly fine-tune `model` on data it (partly) generated itself and track BLEU.

    Each iteration trains for one epoch and evaluates on `test_data`. Unless it is
    the last iteration, it then builds the next training set: a fraction
    `augmentation_ratio` of fresh real rows plus round-trip synthetic rows
    produced by `get_recursive_data` from a random sample of the current set.

    Args:
        model: EN->FR model that is trained in place.
        validate_model: FR->EN model used for the reverse pass; put in eval mode.
        train_data: Initial tokenized training dataset.
        test_data: Tokenized evaluation dataset.
        num_iters: Number of train/evaluate rounds.
        augmentation_ratio: Fraction (0.0-1.0) of each new training set taken from
            real data; 0.0 means fully synthetic.
        train_size: Offset into the notebook-level shuffled `train_dataset_indicies`
            at which unused real rows start.

    Returns:
        `(blue_score, all_scores)`: the per-iteration "eval_bleu" values and the
        full metric dicts from `eval_loop`.

    Raises:
        ValueError: If `augmentation_ratio` is outside [0, 1], or if the shuffled
            index pool does not hold enough unused real rows for all iterations.
    """
    if not 0.0 <= augmentation_ratio <= 1.0:
        raise ValueError(f"augmentation_ratio must be within [0, 1], got {augmentation_ratio}")

    blue_score = []
    all_scores = []

    real_num_indicies = int(augmentation_ratio * len(train_data))
    syn_data_num_indicies = len(train_data) - real_num_indicies

    # Fail before any training starts if the real-data pool would run dry. A slice
    # past the end of the index array would otherwise silently return fewer rows.
    if real_num_indicies != 0 and num_iters > 1:
        rows_needed = train_size + (num_iters - 1) * real_num_indicies
        if rows_needed > len(train_dataset_indicies):
            raise ValueError(
                f"Not enough real training rows: need {rows_needed} shuffled indices, "
                f"only {len(train_dataset_indicies)} available"
            )

    validate_model.eval()

    local_train_data = train_data

    for i in range(num_iters):
        torch.cuda.empty_cache()
        print(f"\nIteration {i + 1}/{num_iters}")

        rec_finetune_model(model, f"test_model{i+1}", 1, local_train_data, test_data)

        torch.cuda.empty_cache()

        eval_metrics = eval_loop(model, test_data)

        torch.cuda.empty_cache()

        blue_score.append(eval_metrics["eval_bleu"])
        all_scores.append(eval_metrics)

        # No follow-up round, so there is no need to build another training set.
        if (i+1) == num_iters:
            break

        if real_num_indicies == 0:
            # Fully synthetic: round-trip the whole current training set.
            local_train_data = get_recursive_data(local_train_data, model, validate_model, batch_size=256)
            continue

        # Take the next `real_num_indicies` unused rows of real data.
        # train_dataset_indicies holds every train row index, shuffled.
        start_idx = train_size + int((i * real_num_indicies))
        end_idx = train_size + int(((i+1) * real_num_indicies))
        indicies_to_add = train_dataset_indicies[start_idx:end_idx]
        real_data = complete_train_dataset.select(indicies_to_add)

        if syn_data_num_indicies == 0:
            # Ratio of 1.0: the next round is all real data, so there is nothing to back-translate.
            local_train_data = real_data
            continue

        # Shrink the current set to the synthetic share before round-tripping it,
        # which leaves room for the real rows.
        random_indices = np.random.choice(len(local_train_data), syn_data_num_indicies, replace=False)
        synn_data = get_recursive_data(
            local_train_data.select(random_indices), model, validate_model, batch_size=256
        )

        local_train_data = concatenate_datasets([real_data, synn_data])

    return blue_score, all_scores


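W&B token: **redacted**. Do not store API keys in the notebook; supply the token through the `WANDB_API_KEY` environment variable or `wandb login`, and revoke the key that was previously written here.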


In [None]:
# Eight recursive rounds in which 95% of each new training set is fresh real data.
blue_score, all_scores = recursive_train(
    model=model_en_to_fr,
    validate_model=model_fr_to_en,
    train_data=train_tokenized_dataset,
    test_data=test_tokenized_dataset,
    num_iters=8,
    augmentation_ratio=0.95,
)

In [None]:
# Display the per-iteration BLEU list and the full metric dicts returned by recursive_train.
blue_score, all_scores

BLEU per iteration with `augmentation_ratio = 0.95`: [16.7773, 16.96, 17.102, 17.1087, 17.1861, 17.3486, 17.3265, 17.4914]

### Basic Loop Scores

([16.7773, 15.6103, 14.8468, 14.1085, 13.6272, 13.4341, 13.2735, 13.2776],

 [{'eval_loss': 1.6717504262924194,
   'eval_model_preparation_time': 0.0031,
   'eval_bleu': 16.7773,
   'eval_gen_len': 37.8316,
   'eval_runtime': 137.7467,
   'eval_samples_per_second': 18.149,
   'eval_steps_per_second': 1.14},

  {'eval_loss': 1.7462550401687622,
   'eval_model_preparation_time': 0.0032,
   'eval_bleu': 15.6103,
   'eval_gen_len': 39.468,
   'eval_runtime': 140.3697,
   'eval_samples_per_second': 17.81,
   'eval_steps_per_second': 1.118},

  {'eval_loss': 1.8700231313705444,
   'eval_model_preparation_time': 0.0033,
   'eval_bleu': 14.8468,
   'eval_gen_len': 40.022,
   'eval_runtime': 143.297,
   'eval_samples_per_second': 17.446,
   'eval_steps_per_second': 1.096},

  {'eval_loss': 1.969122052192688,
   'eval_model_preparation_time': 0.0032,
   'eval_bleu': 14.1085,
   'eval_gen_len': 40.418,
   'eval_runtime': 145.2608,
   'eval_samples_per_second': 17.21,
   'eval_steps_per_second': 1.081},

  {'eval_loss': 2.0426764488220215,
   'eval_model_preparation_time': 0.0033,
   'eval_bleu': 13.6272,
   'eval_gen_len': 40.7952,
   'eval_runtime': 146.9739,
   'eval_samples_per_second': 17.01,
   'eval_steps_per_second': 1.068},

  {'eval_loss': 2.107924461364746,
   'eval_model_preparation_time': 0.0032,
   'eval_bleu': 13.4341,
   'eval_gen_len': 40.836,
   'eval_runtime': 150.3491,
   'eval_samples_per_second': 16.628,
   'eval_steps_per_second': 1.044},

  {'eval_loss': 2.1789631843566895,
   'eval_model_preparation_time': 0.0033,
   'eval_bleu': 13.2735,
   'eval_gen_len': 40.6836,
   'eval_runtime': 152.6264,
   'eval_samples_per_second': 16.38,
   'eval_steps_per_second': 1.029},

  {'eval_loss': 2.233154296875,
   'eval_model_preparation_time': 0.0031,
   'eval_bleu': 13.2776,
   'eval_gen_len': 40.4324,
   'eval_runtime': 154.1917,
   'eval_samples_per_second': 16.214,
   'eval_steps_per_second': 1.018}])

### Ideal Ratio Experiments

In [None]:
def test_ratios(model_file, validate_model_file, train_data, test_data, num_iters, ratios_list):
  """Run `recursive_train` once per augmentation ratio, starting from fresh checkpoints.

  Args:
    model_file: Checkpoint folder name under "t5_en_to_fr/" for the EN->FR model.
    validate_model_file: Checkpoint folder name under "t5_fr_to_en_final/" for the FR->EN model.
    train_data: Initial tokenized training dataset.
    test_data: Tokenized evaluation dataset.
    num_iters: Number of recursive rounds per ratio.
    ratios_list: Augmentation ratios to try.

  Returns:
    Dict mapping "Ratio_<ratio rounded to 2 dp>" to the `(blue_score, all_scores)`
    pair returned by `recursive_train`.
  """
  data = {}

  for ratio in ratios_list:
    print(f"------------ Testing Ratio: {round(ratio, 2)} ------------")

    # Reload both models so every ratio starts from the same weights.
    model = T5ForConditionalGeneration.from_pretrained(f"t5_en_to_fr/{model_file}/").to(device)
    validate_model = T5ForConditionalGeneration.from_pretrained(f"t5_fr_to_en_final/{validate_model_file}/").to(device)

    name = "Ratio_" + str(round(ratio, 2))

    blue_score, all_scores = recursive_train(model, validate_model, train_data, test_data, num_iters, augmentation_ratio = ratio)
    data[name] = (blue_score, all_scores)
    print((blue_score, all_scores))

    # Drop the model references first: empty_cache() only returns memory that is no
    # longer held by live tensors, so it frees nothing while the models are referenced.
    del model, validate_model
    torch.cuda.empty_cache()

  return data


In [None]:
# Augmentation ratios to compare, six recursive rounds each.
ratio_list = [0.25, 0.5, 0.75]

data = test_ratios(
    model_file="checkpoint-6355",
    validate_model_file="checkpoint-19065",
    train_data=train_tokenized_dataset,
    test_data=test_tokenized_dataset,
    num_iters=6,
    ratios_list=ratio_list,
)

# Persist the results as JSON text so they survive the Colab session.
with open('ratio_experiments.txt', 'w') as convert_file:
    json.dump(data, convert_file)

data

# Things to do

In-context learning
Add examples of what the translation should do.

Gemini Pro offers free credits, so it could be used as the other model.

Look at models that can directly go multi-lingual or just english

Might need a bigger model than t5 small (t5 model will not work with in-context learning)

Start with a 3B model for translation -- assuming we have 48 GB

accelerator --

`from_pretrained(..., device_map='auto')` for automatic device mapping

quantize models for inference

# Test T5-large
### This never ended up working, even just basic fine-tuning

In [None]:
import argparse
import nltk
from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

# Download NLTK's "punkt" data.
# NOTE(review): nothing in the visible cells uses nltk -- confirm this is still needed.
nltk.download('punkt')

# Reload the corpus and recreate the same seeded 80/20 split used earlier in the notebook.
books = load_dataset("opus_books", "en-fr")
books = books["train"].train_test_split(test_size=0.2, seed=42)

# NOTE: this rebinds the notebook-wide `model_name` and `tokenizer` globals that the
# earlier cells (preprocessing, compute_metrics, trainers) also read.
model_name = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of EN->FR pairs; French targets become the "labels" field.

    Args:
        examples: Batched rows with a "translation" column of {"en": ..., "fr": ...} dicts.

    Returns:
        The tokenizer output for the prefixed English inputs, including the
        tokenized French targets under "labels".
    """
    prefix = "translate English to French: "
    inputs = [prefix + example["en"] for example in examples["translation"]]
    targets = [example["fr"] for example in examples["translation"]]

    # Use `text_target=` as the other preprocessing functions in this notebook do.
    # It replaces the deprecated `as_target_tokenizer()` context manager and fills "labels" directly.
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

# NOTE: this rebinds the notebook-wide `data_collator`, here built with the loaded
# t5-large model object.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Tokenize both splits and drop the raw columns in the same pass.
tokenized_books = books.map(preprocess_function, batched=True, remove_columns=books["train"].column_names)

train_dataset = tokenized_books["train"]
test_dataset = tokenized_books["test"]

In [None]:
# Train on the first 10,000 training rows and evaluate on the first 2,500 test rows
# to keep the experiment small.
red_train_dataset = train_dataset.select(np.arange(10000))
red_test_dataset = test_dataset.select(np.arange(2500))

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-large-opus-books-en-fr",
    # `eval_strategy` is the spelling used by every other trainer in this notebook;
    # `evaluation_strategy` is the deprecated name for the same option.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    generation_max_length=32,
    weight_decay=0.01,
    num_train_epochs=2,
    predict_with_generate=True,
    logging_steps=100,
    push_to_hub=False,
    save_total_limit=3
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=red_train_dataset,
    eval_dataset=red_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

# Save the final weights separately from the per-epoch checkpoints.
trainer.save_model("./t5-large-opus-books-en-fr-final")

torch.cuda.empty_cache()

In [None]:
# Print the first five tokenized training rows as a sanity check.
preview_rows = tokenized_books["train"].select(range(5))
for example in preview_rows:
    print(example)