# Reference

This code is copied from the [hugging face translation tutorial](https://huggingface.co/docs/transformers/en/tasks/translation) 

In [1]:
import torch
from tqdm.notebook import trange, tqdm
from torch.utils.data import Dataset, DataLoader
import evaluate
import numpy as np
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import T5ForConditionalGeneration, T5TokenizerFast, GenerationConfig

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [2]:
books = load_dataset("opus_books", "en-fr")

In [3]:
books = books["train"].train_test_split(test_size=0.2, seed=42)

In [4]:
books["train"][0:3]["translation"]

[{'en': 'What a stroke was this for poor Jane! who would willingly have gone through the world without believing that so much wickedness existed in the whole race of mankind, as was here collected in one individual.',
  'fr': 'Quel coup pour la pauvre Jane qui aurait parcouru le monde entier sans s’imaginer qu’il existât dans toute l’humanité autant de noirceur qu’elle en découvrait en ce moment dans un seul homme !'},
 {'en': "The ground rose appreciably as it moved away from the sand flats by the waves, and we soon arrived at some long, winding gradients, genuinely steep paths that allowed us to climb little by little; but we had to tread cautiously in the midst of pudding stones that weren't cemented together, and our feet kept skidding on glassy trachyte, made of feldspar and quartz crystals.",
  'fr': "Le sol s'élevait sensiblement en s'éloignant du relais des flots, et nous Mmes bientôt arrivés à des rampes longues et sinueuses, véritables raidillons qui permettaient de s'élever 

In [5]:
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
def preprocess_function_en_to_fr(examples):
    source_lang = "en"
    target_lang = "fr"
    prefix = "translate English to French: "

    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

def preprocess_function_fr_to_en(examples):
    source_lang = "fr"
    target_lang = "en"
    prefix = "translate French to English: "
    
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [7]:
books["train"]

Dataset({
    features: ['id', 'translation'],
    num_rows: 101668
})

In [8]:
tokenized_books_en_to_fr = books.map(preprocess_function_en_to_fr, batched=True)
tokenized_books_fr_to_en = books.map(preprocess_function_fr_to_en, batched=True)

In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [10]:
metric = evaluate.load("sacrebleu")

In [11]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    preds = np.where(preds < 0, tokenizer.pad_token_id, preds)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result



In [12]:
def finetune_model(local_model, name: str, num_epochs: int, tokenized_dataset, max_len=32):
    local_training_args = Seq2SeqTrainingArguments(
        output_dir=name,
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        generation_max_length=max_len, # can reduce this for faster training, but worse preformance (min = 20)
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=num_epochs,
        predict_with_generate=True,
        fp16=True, #change to bf16=True for XPU
        push_to_hub=False,
    )

    local_trainer = Seq2SeqTrainer(
        model=local_model,
        args=local_training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    local_trainer.train()

# Inference

In [None]:
# model = T5ForConditionalGeneration.from_pretrained("my_awesome_opus_books_model/checkpoint-12710/").to(device)

In [None]:
# text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
# inputs = tokenizer(text, return_tensors="pt").input_ids.to(device)

In [None]:
# outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
# tokenizer.decode(outputs[0], skip_special_tokens=True)


'Les légumes partagent leurs ressources avec des bactéries fixatrices d’azote.'

# Create base French to English Model

In [13]:
checkpoint = "google-t5/t5-small"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model_fr_to_en = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
finetune_model(model_fr_to_en, "t5_fr_to_en_final", 2, tokenized_books_fr_to_en)
torch.cuda.empty_cache()

In [None]:
torch.cuda.empty_cache()

# Create base English to French Model

In [167]:
checkpoint = "google-t5/t5-small"

model_en_to_fr = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
finetune_model(model_en_to_fr, "t5_en_to_fr_final", 1, tokenized_books_en_to_fr)
torch.cuda.empty_cache()

  0%|          | 0/6355 [00:00<?, ?it/s]

{'loss': 2.0946, 'grad_norm': 1.380620002746582, 'learning_rate': 1.8426435877261997e-05, 'epoch': 0.08}
{'loss': 2.0035, 'grad_norm': 1.4117834568023682, 'learning_rate': 1.6852871754524e-05, 'epoch': 0.16}
{'loss': 1.9848, 'grad_norm': 1.416219711303711, 'learning_rate': 1.5279307631785996e-05, 'epoch': 0.24}
{'loss': 1.9492, 'grad_norm': 1.4263882637023926, 'learning_rate': 1.3705743509047995e-05, 'epoch': 0.31}
{'loss': 1.9197, 'grad_norm': 1.7879889011383057, 'learning_rate': 1.213532651455547e-05, 'epoch': 0.39}
{'loss': 1.9166, 'grad_norm': 1.1198539733886719, 'learning_rate': 1.0561762391817467e-05, 'epoch': 0.47}
{'loss': 1.9146, 'grad_norm': 1.347274661064148, 'learning_rate': 8.988198269079466e-06, 'epoch': 0.55}
{'loss': 1.9027, 'grad_norm': 1.2819278240203857, 'learning_rate': 7.414634146341464e-06, 'epoch': 0.63}
{'loss': 1.8993, 'grad_norm': 1.3830379247665405, 'learning_rate': 5.844217151848939e-06, 'epoch': 0.71}
{'loss': 1.901, 'grad_norm': 1.3655177354812622, 'learni



  0%|          | 0/1589 [00:00<?, ?it/s]

{'eval_loss': 1.6593308448791504, 'eval_bleu': 5.224, 'eval_gen_len': 17.6479, 'eval_runtime': 366.7158, 'eval_samples_per_second': 69.31, 'eval_steps_per_second': 4.333, 'epoch': 1.0}
{'train_runtime': 976.967, 'train_samples_per_second': 104.065, 'train_steps_per_second': 6.505, 'train_loss': 1.9363052284126672, 'epoch': 1.0}


# Load nessessary models

In [13]:

en_to_fr_checkpoint = "checkpoint-6355"

model_en_to_fr = T5ForConditionalGeneration.from_pretrained(f"t5_en_to_fr/{en_to_fr_checkpoint}/").to(device)

fr_to_en_checkpoint = "checkpoint-12710"

model_fr_to_en = T5ForConditionalGeneration.from_pretrained(f"t5_fr_to_en_final/{fr_to_en_checkpoint}/").to(device)

# Building Recursive Train Loop

### Create Smaller Test and Train Datasets 

In [14]:
np.random.seed(42)

In [15]:
test_dataset_len = tokenized_books_en_to_fr["test"].num_rows
test_smaller_size = 2500

random_indices = np.random.choice(test_dataset_len, test_smaller_size, replace=False)

reduced_tokenized_test = tokenized_books_en_to_fr["test"].select(random_indices)

# need to do this to prevent random errors ¯\_(ツ)_/¯
test_tokenized_dataset = reduced_tokenized_test.remove_columns(books["test"].column_names) 

In [16]:
np.random.seed(84)

In [17]:
dataset_len = tokenized_books_en_to_fr["train"].num_rows
smaller_size = 10000 # 10,000

random_indices = np.random.choice(dataset_len, smaller_size, replace=False)

reduced_tokenized_train = tokenized_books_en_to_fr["train"].select(random_indices)

# need to do this to prevent random errors ¯\_(ツ)_/¯
train_tokenized_dataset = reduced_tokenized_train.remove_columns(books["train"].column_names)

### One Recursive Train loop

In [24]:
def get_recursive_data(train_dataset, model_forward = model_en_to_fr, model_reverse = model_fr_to_en, batch_size = 512):
    torch.cuda.empty_cache()

    dataset_list = []
    english_sentences = []
    french_sentences = []

    loader = DataLoader(train_dataset, batch_size = batch_size, collate_fn = data_collator)

    for i, tokenized_inputs in tqdm(enumerate(loader), total=len(loader), desc = "Forward (EN to FR) Inference Pass"):
        with torch.no_grad():
            generated_outputs = model_forward.generate(
                input_ids=tokenized_inputs['input_ids'].to(device),
                attention_mask=tokenized_inputs['attention_mask'].to(device),
                max_length=128,  # Maximum length for generation
                num_return_sequences=1  # Number of sequences to return per input
            ).cpu()

        decoded = tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
        french_sentences.extend(decoded)
        decoded = list(map(lambda s: "translate French to English: " + s, decoded))

        encoded = tokenizer(decoded, return_tensors="pt", max_length=128, padding=True, truncation=True)
        dataset_list.append(Dataset.from_dict(encoded))

        del generated_outputs, tokenized_inputs
        torch.cuda.empty_cache()

    reverse_dataset = concatenate_datasets(dataset_list)

    loader2 = DataLoader(reverse_dataset, batch_size = batch_size, collate_fn = data_collator)

    for i, tokenized_inputs in tqdm(enumerate(loader2), total=len(loader2), desc = "Reverse (FR to EN) Inference Pass"):
        with torch.no_grad():
            generated_outputs = model_reverse.generate(
                input_ids=tokenized_inputs['input_ids'].to(device),
                attention_mask=tokenized_inputs['attention_mask'].to(device),
                max_length=128,  # Maximum length for generation
                num_return_sequences=1  # Number of sequences to return per input
            ).cpu()
        decoded = tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
        english_sentences.extend(decoded)

        del generated_outputs, tokenized_inputs
        torch.cuda.empty_cache()

    synthetic_data = {"translation": []}

    for eng, fr in zip(english_sentences, french_sentences):
        elem = {"en": eng, "fr": fr}
        synthetic_data["translation"].append(elem)

    test_syn_en_to_fr_dataset = Dataset.from_dict(synthetic_data)
    test_syn_en_to_fr_dataset = test_syn_en_to_fr_dataset.map(preprocess_function_en_to_fr, batched=True)
    test_syn_en_to_fr_dataset = test_syn_en_to_fr_dataset.remove_columns("translation")

    return test_syn_en_to_fr_dataset

# new_syn_dataset = get_recursive_data(train_tokenized_dataset)

### Train Model for Recursive Loop

In [25]:
def rec_finetune_model(local_model, name: str, num_epochs: int, train_data, test_data):
    local_training_args = Seq2SeqTrainingArguments(
        output_dir=name,
        eval_strategy="no",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        generation_max_length=32,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=num_epochs,
        predict_with_generate=True,
        fp16=True, #change to bf16=True for XPU
        push_to_hub=False,
    )

    local_trainer = Seq2SeqTrainer(
        model=local_model,
        args=local_training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    local_trainer.train()

    torch.cuda.empty_cache()

### Validation Loop To Get Bleu Score

In [26]:
def eval_loop(local_model, test_data, output_dir = "temp_results", batch_size = 16):

    test_training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        per_device_eval_batch_size=batch_size,
        generation_max_length=64,
        save_total_limit=3,
        predict_with_generate=True,
        fp16=True, #change to bf16=True for XPU
        push_to_hub=False,
    )

    test_trainer = Seq2SeqTrainer(
        model=local_model,
        args=test_training_args,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    eval_results = test_trainer.evaluate()

    torch.cuda.empty_cache()

    return eval_results

In [47]:
eval_loop(model_en_to_fr, test_tokenized_dataset)

  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 1.6781498193740845,
 'eval_model_preparation_time': 0.0082,
 'eval_bleu': 16.6539,
 'eval_gen_len': 37.8996,
 'eval_runtime': 116.8829,
 'eval_samples_per_second': 21.389,
 'eval_steps_per_second': 1.343}

# Recursive Train Loop

In [27]:
def recursive_train(model, validate_model, train_data, test_data, num_iters):
    blue_score = []
    all_scores = []

    validate_model.eval()

    local_train_data = train_data

    for i in range(num_iters):
        torch.cuda.empty_cache()
        print(f"\nIteration {i + 1}/{num_iters}")

        rec_finetune_model(model, f"test_model{i+1}", 1, local_train_data, test_data)

        torch.cuda.empty_cache()

        local_train_data = get_recursive_data(local_train_data, model, validate_model, batch_size=256)

        torch.cuda.empty_cache()

        eval_metrics = eval_loop(model, test_data)

        blue_score.append(eval_metrics["eval_bleu"])
        all_scores.append(eval_metrics)
    
    return blue_score, all_scores
        

In [22]:
recursive_train(model_en_to_fr, model_fr_to_en, train_tokenized_dataset, test_tokenized_dataset, 2)


Iteration 1/2


  0%|          | 0/625 [00:00<?, ?it/s]

{'loss': 1.8789, 'grad_norm': 1.4006133079528809, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.8}


  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 1.672284722328186, 'eval_bleu': 16.8545, 'eval_gen_len': 37.8672, 'eval_runtime': 111.3416, 'eval_samples_per_second': 22.453, 'eval_steps_per_second': 1.41, 'epoch': 1.0}
{'train_runtime': 173.5248, 'train_samples_per_second': 57.629, 'train_steps_per_second': 3.602, 'train_loss': 1.8758791015625, 'epoch': 1.0}


Forward (EN to FR) Inference Pass:   0%|          | 0/79 [00:00<?, ?it/s]

Reverse (FR to EN) Inference Pass:   0%|          | 0/79 [00:00<?, ?it/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

  0%|          | 0/157 [00:00<?, ?it/s]


Iteration 2/2


  0%|          | 0/625 [00:00<?, ?it/s]

{'loss': 1.038, 'grad_norm': 1.098865032196045, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.8}


  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 1.7507448196411133, 'eval_bleu': 16.2437, 'eval_gen_len': 38.5548, 'eval_runtime': 113.3231, 'eval_samples_per_second': 22.061, 'eval_steps_per_second': 1.385, 'epoch': 1.0}
{'train_runtime': 173.4446, 'train_samples_per_second': 57.655, 'train_steps_per_second': 3.603, 'train_loss': 1.0324531372070314, 'epoch': 1.0}


Forward (EN to FR) Inference Pass:   0%|          | 0/79 [00:00<?, ?it/s]

Reverse (FR to EN) Inference Pass:   0%|          | 0/79 [00:00<?, ?it/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

  0%|          | 0/157 [00:00<?, ?it/s]

([16.8545, 16.2437],
 [{'eval_loss': 1.672284722328186,
   'eval_model_preparation_time': 0.0004,
   'eval_bleu': 16.8545,
   'eval_gen_len': 37.8672,
   'eval_runtime': 109.7487,
   'eval_samples_per_second': 22.779,
   'eval_steps_per_second': 1.431},
  {'eval_loss': 1.7507448196411133,
   'eval_model_preparation_time': 0.003,
   'eval_bleu': 16.2437,
   'eval_gen_len': 38.5548,
   'eval_runtime': 108.9702,
   'eval_samples_per_second': 22.942,
   'eval_steps_per_second': 1.441}])