# Reference

This code is copied from the [Hugging Face translation tutorial](https://huggingface.co/docs/transformers/en/tasks/translation).

In [None]:
import torch
from tqdm.notebook import trange, tqdm
from torch.utils.data import Dataset, DataLoader
import evaluate
import numpy as np
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import T5ForConditionalGeneration, T5TokenizerFast, GenerationConfig

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Load the "en-fr" configuration of the "opus_books" dataset.
books = load_dataset("opus_books", "en-fr")

In [None]:
# Re-split the "train" split 80/20 into "train"/"test"; the fixed seed keeps the split reproducible.
books = books["train"].train_test_split(test_size=0.2, seed=42)

In [None]:
# Preview the "translation" field of the first three training rows (notebook cell output).
books["train"][0:3]["translation"]

In [None]:
# Checkpoint name of the pretrained model; the tokenizer loaded from it is the
# one used by the preprocessing, metric and generation code further down.
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def preprocess_function_en_to_fr(examples):
    """Tokenize a batch of translation pairs for English -> French training.

    Each English sentence is prefixed with the T5 task prompt and used as the
    model input; the French sentence becomes the target. Both sides are
    truncated to 128 tokens.

    Args:
        examples: Batch with a "translation" entry holding dicts keyed by
            "en" and "fr".

    Returns:
        The tokenizer output for the batch (inputs plus tokenized targets).
    """
    pairs = examples["translation"]
    english_prompts = ["translate English to French: " + pair["en"] for pair in pairs]
    french_references = [pair["fr"] for pair in pairs]
    return tokenizer(
        english_prompts,
        text_target=french_references,
        max_length=128,
        truncation=True,
    )

def preprocess_function_fr_to_en(examples):
    """Tokenize a batch of translation pairs for French -> English training.

    Each French sentence is prefixed with the T5 task prompt and used as the
    model input; the English sentence becomes the target. Both sides are
    truncated to 128 tokens.

    Args:
        examples: Batch with a "translation" entry holding dicts keyed by
            "en" and "fr".

    Returns:
        The tokenizer output for the batch (inputs plus tokenized targets).
    """
    pairs = examples["translation"]
    french_prompts = ["translate French to English: " + pair["fr"] for pair in pairs]
    english_references = [pair["en"] for pair in pairs]
    return tokenizer(
        french_prompts,
        text_target=english_references,
        max_length=128,
        truncation=True,
    )

In [None]:
# Display the training split (notebook cell output).
books["train"]

In [None]:
# Tokenize both splits once per translation direction. The raw columns are
# still present afterwards; they are removed later where a plain DataLoader is used.
tokenized_books_en_to_fr = books.map(preprocess_function_en_to_fr, batched=True)
tokenized_books_fr_to_en = books.map(preprocess_function_fr_to_en, batched=True)

In [None]:
# Collator that batches tokenized examples for the trainers and DataLoaders below.
# NOTE(review): `model` receives the checkpoint name (a string), not a loaded
# model object — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
# sacreBLEU metric, used by compute_metrics to score generated translations.
metric = evaluate.load("sacrebleu")

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A tuple ``(predictions, references)`` where every reference is wrapped
        in its own one-element list, the shape passed on to ``metric.compute``.
    """
    cleaned_predictions = [text.strip() for text in preds]
    wrapped_references = [[text.strip()] for text in labels]
    return cleaned_predictions, wrapped_references

def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays. When
            the predictions arrive as a tuple, only its first element is used.

    Returns:
        Dict with "bleu" (the sacreBLEU score) and "gen_len" (mean number of
        non-pad tokens per prediction), both rounded to 4 decimals.
    """
    predictions, references = eval_preds
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # Negative ids (e.g. the -100 ignore index) cannot be decoded, so they are
    # swapped for the pad token, which skip_special_tokens then drops.
    predictions = np.where(predictions < 0, pad_id, predictions)
    hypothesis_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    references = np.where(references != -100, references, pad_id)
    reference_texts = tokenizer.batch_decode(references, skip_special_tokens=True)

    hypothesis_texts, reference_texts = postprocess_text(hypothesis_texts, reference_texts)

    bleu = metric.compute(predictions=hypothesis_texts, references=reference_texts)["score"]
    token_counts = [np.count_nonzero(row != pad_id) for row in predictions]

    return {
        "bleu": round(bleu, 4),
        "gen_len": round(np.mean(token_counts), 4),
    }



In [None]:
def finetune_model(local_model, name: str, num_epochs: int, tokenized_dataset, max_len=32):
    """Fine-tune ``local_model`` on a tokenized translation dataset.

    The model is trained in place with ``Seq2SeqTrainer`` and evaluated on the
    test split at the end of every epoch using ``compute_metrics``.

    Args:
        local_model: Seq2seq model to fine-tune (updated in place).
        name: Output directory for checkpoints.
        num_epochs: Number of training epochs.
        tokenized_dataset: Mapping with "train" and "test" tokenized splits.
        max_len: Maximum generation length during evaluation. Lower values
            make evaluation faster at the cost of worse scores (min = 20).
    """
    # Only request fp16 mixed precision when CUDA is present, so the function
    # also works on the CPU fallback selected for `device`.
    use_fp16 = torch.cuda.is_available()

    local_training_args = Seq2SeqTrainingArguments(
        output_dir=name,
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        generation_max_length=max_len,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=num_epochs,
        predict_with_generate=True,
        fp16=use_fp16,  # change to bf16=True for XPU
        push_to_hub=False,
    )

    local_trainer = Seq2SeqTrainer(
        model=local_model,
        args=local_training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    local_trainer.train()

# Inference

In [None]:
# model = T5ForConditionalGeneration.from_pretrained("my_awesome_opus_books_model/checkpoint-12710/").to(device)

In [None]:
# text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
# inputs = tokenizer(text, return_tensors="pt").input_ids.to(device)

In [None]:
# outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
# tokenizer.decode(outputs[0], skip_special_tokens=True)


# Create base French to English Model

In [None]:
# Start the French -> English model from the pretrained T5-small checkpoint.
checkpoint = "google-t5/t5-small"

model_fr_to_en = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Fine-tune for 2 epochs; checkpoints are written under "t5_fr_to_en_final".
finetune_model(model_fr_to_en, "t5_fr_to_en_final", 2, tokenized_books_fr_to_en)
# Release cached GPU memory that is no longer in use after training.
torch.cuda.empty_cache()

# Create base English to French Model

In [None]:
# Start the English -> French model from the pretrained T5-small checkpoint.
checkpoint = "google-t5/t5-small"

model_en_to_fr = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Fine-tune for 1 epoch; checkpoints are written under "t5_en_to_fr".
finetune_model(model_en_to_fr, "t5_en_to_fr", 1, tokenized_books_en_to_fr)
# Release cached GPU memory that is no longer in use after training.
torch.cuda.empty_cache()

# Load necessary models

In [None]:
# Reload the fine-tuned models from disk and move them to `device`.
# The checkpoint directory names are hard-coded and must match what the
# training cells above actually wrote (presumably the final checkpoint of each run).
en_to_fr_checkpoint = "checkpoint-6355"

model_en_to_fr = T5ForConditionalGeneration.from_pretrained(f"t5_en_to_fr/{en_to_fr_checkpoint}/").to(device)

fr_to_en_checkpoint = "checkpoint-12710"

model_fr_to_en = T5ForConditionalGeneration.from_pretrained(f"t5_fr_to_en_final/{fr_to_en_checkpoint}/").to(device)

# Building Recursive Train Loop

### Create Smaller Test and Train Datasets 

In [None]:
# Seed NumPy's global RNG so the test-subset sampling in the next cell is reproducible.
np.random.seed(42)

In [None]:
# Sample 2,500 distinct rows from the tokenized EN->FR test split
# (presumably to keep each evaluation pass fast).
test_dataset_len = tokenized_books_en_to_fr["test"].num_rows
test_smaller_size = 2500

random_indices = np.random.choice(test_dataset_len, test_smaller_size, replace=False)

reduced_tokenized_test = tokenized_books_en_to_fr["test"].select(random_indices)

# Drop the raw dataset columns so only the tokenizer outputs remain.
# NOTE(review): presumably required because the raw text columns cannot be
# padded/batched by data_collator — confirm.
test_tokenized_dataset = reduced_tokenized_test.remove_columns(books["test"].column_names) 

In [None]:
# Re-seed NumPy's global RNG so the train-subset sampling in the next cell is reproducible.
np.random.seed(84)

In [None]:
# Sample 10,000 distinct rows from the tokenized EN->FR train split as the
# seed data for recursive training.
dataset_len = tokenized_books_en_to_fr["train"].num_rows
smaller_size = 10000 # 10,000

random_indices = np.random.choice(dataset_len, smaller_size, replace=False)

reduced_tokenized_train = tokenized_books_en_to_fr["train"].select(random_indices)

# Drop the raw dataset columns so only the tokenizer outputs remain.
# get_recursive_data feeds this dataset straight into a DataLoader with
# data_collator; NOTE(review): presumably the raw text columns cannot be
# padded/batched there — confirm.
train_tokenized_dataset = reduced_tokenized_train.remove_columns(books["train"].column_names)

### One Recursive Train loop

In [None]:
def _generate_translations(model, dataset, batch_size, description):
    """Run ``model.generate`` over ``dataset`` and return the decoded strings in order."""
    translations = []
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)

    # Trainer.train() leaves the model in training mode, which keeps dropout
    # active during generate(); switch to eval mode for inference and restore
    # the previous mode afterwards so the caller sees no side effect.
    was_training = model.training
    model.eval()
    try:
        for batch in tqdm(loader, desc=description):
            with torch.no_grad():
                generated = model.generate(
                    input_ids=batch["input_ids"].to(device),
                    attention_mask=batch["attention_mask"].to(device),
                    max_length=128,  # Maximum length for generation
                    num_return_sequences=1,  # Number of sequences to return per input
                ).cpu()

            translations.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))

            # Free per-batch tensors eagerly to keep peak GPU memory low.
            del generated, batch
            torch.cuda.empty_cache()
    finally:
        model.train(was_training)

    return translations


def get_recursive_data(train_dataset, model_forward = model_en_to_fr, model_reverse = model_fr_to_en, batch_size = 512):
    """Build a synthetic EN->FR training set by round-trip translation.

    The English inputs of ``train_dataset`` are translated to French with
    ``model_forward``; that French is translated back to English with
    ``model_reverse``. The resulting (back-translated English, generated
    French) pairs are tokenized with ``preprocess_function_en_to_fr``.

    Args:
        train_dataset: Tokenized EN->FR dataset providing "input_ids" and
            "attention_mask"; it is batched directly with ``data_collator``.
        model_forward: English -> French model. The default is bound to the
            module-level model when this function is defined.
        model_reverse: French -> English model. The default is bound to the
            module-level model when this function is defined.
        batch_size: Number of examples per generation batch.

    Returns:
        A tokenized dataset of synthetic pairs with the raw "translation"
        column removed.
    """
    torch.cuda.empty_cache()

    french_sentences = _generate_translations(
        model_forward, train_dataset, batch_size, "Forward (EN to FR) Inference Pass"
    )

    # Re-encode the generated French as FR->EN prompts. Padding is left to
    # data_collator, which pads every batch to its longest sequence.
    reverse_prompts = ["translate French to English: " + sentence for sentence in french_sentences]
    encoded = tokenizer(reverse_prompts, max_length=128, truncation=True)
    reverse_dataset = Dataset.from_dict(
        {"input_ids": encoded["input_ids"], "attention_mask": encoded["attention_mask"]}
    )

    english_sentences = _generate_translations(
        model_reverse, reverse_dataset, batch_size, "Reverse (FR to EN) Inference Pass"
    )

    synthetic_pairs = [
        {"en": english, "fr": french}
        for english, french in zip(english_sentences, french_sentences)
    ]

    synthetic_dataset = Dataset.from_dict({"translation": synthetic_pairs})
    synthetic_dataset = synthetic_dataset.map(preprocess_function_en_to_fr, batched=True)
    return synthetic_dataset.remove_columns("translation")

# new_syn_dataset = get_recursive_data(train_tokenized_dataset)

### Train Model for Recursive Loop

In [None]:
def rec_finetune_model(local_model, name: str, num_epochs: int, train_data, test_data):
    """Fine-tune ``local_model`` for one recursive-training round.

    Training runs without any in-training evaluation (``eval_strategy="no"``);
    scoring is done separately by the caller.

    Args:
        local_model: Seq2seq model to fine-tune (updated in place).
        name: Output directory for checkpoints.
        num_epochs: Number of training epochs.
        train_data: Tokenized training dataset.
        test_data: Tokenized evaluation dataset handed to the trainer.
    """
    # Only request fp16 mixed precision when CUDA is present, so the function
    # also works on the CPU fallback selected for `device`.
    use_fp16 = torch.cuda.is_available()

    local_training_args = Seq2SeqTrainingArguments(
        output_dir=name,
        eval_strategy="no",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        generation_max_length=32,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=num_epochs,
        predict_with_generate=True,
        fp16=use_fp16,  # change to bf16=True for XPU
        push_to_hub=False,
    )

    local_trainer = Seq2SeqTrainer(
        model=local_model,
        args=local_training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    local_trainer.train()

    # Release cached GPU memory that is no longer in use after training.
    torch.cuda.empty_cache()

### Validation Loop To Get BLEU Score

In [None]:
def eval_loop(local_model, test_data, output_dir = "temp_results", batch_size = 16, max_len = 64):
    """Evaluate ``local_model`` on ``test_data`` and return the metrics.

    Args:
        local_model: Seq2seq model to evaluate.
        test_data: Tokenized evaluation dataset.
        output_dir: Directory handed to the trainer arguments.
        batch_size: Per-device evaluation batch size.
        max_len: Maximum generation length during evaluation.

    Returns:
        The dict returned by ``Seq2SeqTrainer.evaluate()``; callers read the
        BLEU score from its "eval_bleu" entry.
    """
    # Only request fp16 mixed precision when CUDA is present, so the function
    # also works on the CPU fallback selected for `device`.
    use_fp16 = torch.cuda.is_available()

    test_training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        per_device_eval_batch_size=batch_size,
        generation_max_length=max_len,
        save_total_limit=3,
        predict_with_generate=True,
        fp16=use_fp16,  # change to bf16=True for XPU
        push_to_hub=False,
    )

    test_trainer = Seq2SeqTrainer(
        model=local_model,
        args=test_training_args,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    eval_results = test_trainer.evaluate()

    # Release cached GPU memory that is no longer in use after evaluation.
    torch.cuda.empty_cache()

    return eval_results

### Recursive Train Loop

In [None]:
def recursive_train(model, validate_model, train_data, test_data, num_iters):
    """Repeatedly fine-tune ``model`` on data it generated itself.

    Each round trains ``model`` for one epoch on the current training data,
    replaces that data with a synthetic set from ``get_recursive_data``
    (using ``validate_model`` for the reverse direction), then scores
    ``model`` on ``test_data``.

    Args:
        model: Model that is fine-tuned every round (updated in place).
        validate_model: Reverse-direction model; put in eval mode, not trained.
        train_data: Tokenized training data for the first round.
        test_data: Tokenized evaluation data used every round.
        num_iters: Number of rounds to run.

    Returns:
        A tuple ``(bleu_per_round, metrics_per_round)``.
    """
    validate_model.eval()

    bleu_history, metrics_history = [], []
    current_train_data = train_data

    for round_number in range(1, num_iters + 1):
        torch.cuda.empty_cache()
        print(f"\nIteration {round_number}/{num_iters}")

        rec_finetune_model(model, f"test_model{round_number}", 1, current_train_data, test_data)
        torch.cuda.empty_cache()

        # The next round trains on the output of this round's model.
        current_train_data = get_recursive_data(
            current_train_data, model, validate_model, batch_size=256
        )
        torch.cuda.empty_cache()

        round_metrics = eval_loop(model, test_data)
        bleu_history.append(round_metrics["eval_bleu"])
        metrics_history.append(round_metrics)

    return bleu_history, metrics_history
        

# Run Recursive Training

In [None]:
# Run two rounds of recursive training on the reduced datasets; the returned
# (BLEU per round, full metrics per round) tuple is shown as the cell output.
recursive_train(model_en_to_fr, model_fr_to_en, train_tokenized_dataset, test_tokenized_dataset, 2)