# Reference

This code is copied from the [hugging face translation tutorial](https://huggingface.co/docs/transformers/en/tasks/translation) 

In [27]:
import torch
from torch.utils.data import Dataset, DataLoader
import evaluate
import numpy as np
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import T5ForConditionalGeneration, T5TokenizerFast, GenerationConfig

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [2]:
books = load_dataset("opus_books", "en-fr")

In [3]:
books = books["train"].train_test_split(test_size=0.2, seed=42)

In [4]:
books["train"][0:3]["translation"]

[{'en': 'What a stroke was this for poor Jane! who would willingly have gone through the world without believing that so much wickedness existed in the whole race of mankind, as was here collected in one individual.',
  'fr': 'Quel coup pour la pauvre Jane qui aurait parcouru le monde entier sans s’imaginer qu’il existât dans toute l’humanité autant de noirceur qu’elle en découvrait en ce moment dans un seul homme !'},
 {'en': "The ground rose appreciably as it moved away from the sand flats by the waves, and we soon arrived at some long, winding gradients, genuinely steep paths that allowed us to climb little by little; but we had to tread cautiously in the midst of pudding stones that weren't cemented together, and our feet kept skidding on glassy trachyte, made of feldspar and quartz crystals.",
  'fr': "Le sol s'élevait sensiblement en s'éloignant du relais des flots, et nous Mmes bientôt arrivés à des rampes longues et sinueuses, véritables raidillons qui permettaient de s'élever 

In [5]:
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
def preprocess_function_en_to_fr(examples):
    source_lang = "en"
    target_lang = "fr"
    prefix = "translate English to French: "

    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

def preprocess_function_fr_to_en(examples):
    source_lang = "fr"
    target_lang = "en"
    prefix = "translate French to English: "
    
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [173]:
books["train"]

Dataset({
    features: ['id', 'translation'],
    num_rows: 101668
})

In [7]:
tokenized_books_en_to_fr = books.map(preprocess_function_en_to_fr, batched=True)
tokenized_books_fr_to_en = books.map(preprocess_function_fr_to_en, batched=True)

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

In [11]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [12]:
metric = evaluate.load("sacrebleu")

In [13]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result



In [14]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [15]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_opus_books_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_books_en_to_fr["train"],
    eval_dataset=tokenized_books_en_to_fr["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# trainer.train()

In [16]:
def finetune_model(local_model, name: str, num_epochs: int, tokenized_dataset):
    local_training_args = Seq2SeqTrainingArguments(
        output_dir=name,
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=num_epochs,
        predict_with_generate=True,
        fp16=True, #change to bf16=True for XPU
        push_to_hub=False,
    )

    local_trainer = Seq2SeqTrainer(
        model=local_model,
        args=local_training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    local_trainer.train()

# Inference

In [18]:
model = T5ForConditionalGeneration.from_pretrained("my_awesome_opus_books_model/checkpoint-12710/").to(device)

In [19]:
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
inputs = tokenizer(text, return_tensors="pt").input_ids.to(device)

In [20]:
outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
tokenizer.decode(outputs[0], skip_special_tokens=True)


'Les légumes partagent leurs ressources avec des bactéries fixatrices d’azote.'

# Create base French to English Model

In [17]:
checkpoint = "google-t5/t5-small"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model_fr_to_en = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
finetune_model(model_fr_to_en, "t5_fr_to_en_final", 2, tokenized_books_fr_to_en)

# Create base English to French Model

In [167]:
checkpoint = "google-t5/t5-small"

model_en_to_fr = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [168]:
finetune_model(model_en_to_fr, "t5_en_to_fr", 1, tokenized_books_en_to_fr)

  0%|          | 0/6355 [00:00<?, ?it/s]

{'loss': 2.0946, 'grad_norm': 1.380620002746582, 'learning_rate': 1.8426435877261997e-05, 'epoch': 0.08}
{'loss': 2.0035, 'grad_norm': 1.4117834568023682, 'learning_rate': 1.6852871754524e-05, 'epoch': 0.16}
{'loss': 1.9848, 'grad_norm': 1.416219711303711, 'learning_rate': 1.5279307631785996e-05, 'epoch': 0.24}
{'loss': 1.9492, 'grad_norm': 1.4263882637023926, 'learning_rate': 1.3705743509047995e-05, 'epoch': 0.31}
{'loss': 1.9197, 'grad_norm': 1.7879889011383057, 'learning_rate': 1.213532651455547e-05, 'epoch': 0.39}
{'loss': 1.9166, 'grad_norm': 1.1198539733886719, 'learning_rate': 1.0561762391817467e-05, 'epoch': 0.47}
{'loss': 1.9146, 'grad_norm': 1.347274661064148, 'learning_rate': 8.988198269079466e-06, 'epoch': 0.55}
{'loss': 1.9027, 'grad_norm': 1.2819278240203857, 'learning_rate': 7.414634146341464e-06, 'epoch': 0.63}
{'loss': 1.8993, 'grad_norm': 1.3830379247665405, 'learning_rate': 5.844217151848939e-06, 'epoch': 0.71}
{'loss': 1.901, 'grad_norm': 1.3655177354812622, 'learni



  0%|          | 0/1589 [00:00<?, ?it/s]

{'eval_loss': 1.6593308448791504, 'eval_bleu': 5.224, 'eval_gen_len': 17.6479, 'eval_runtime': 366.7158, 'eval_samples_per_second': 69.31, 'eval_steps_per_second': 4.333, 'epoch': 1.0}
{'train_runtime': 976.967, 'train_samples_per_second': 104.065, 'train_steps_per_second': 6.505, 'train_loss': 1.9363052284126672, 'epoch': 1.0}


# Recursive Training

In [18]:
def generate_synthetic_data(model, dataset, tokenizer, validate_model, gen_lang="en", inter_lang = "fr", batch_size = 16):
    model.eval()  # Set model to evaluation mode
    synthetic_data = {"translation": {"en": [], "fr": []}}

    gen_lang_msg = "translate English to French: "
    inter_lang_msg = "translate French to English: "
    if gen_lang == "fr":
        gen_lang_msg, inter_lang_msg = inter_lang_msg, gen_lang_msg

    tokenized_datasets = dataset.remove_columns(books["train"].column_names)
    train_loader = DataLoader(tokenized_datasets, batch_size = batch_size, collate_fn = data_collator)

    for i, tokenized_inputs in enumerate(train_loader):
        with torch.no_grad():
            generated_outputs = model.generate(
                input_ids=tokenized_inputs['input_ids'],
                attention_mask=tokenized_inputs['attention_mask'],
                max_length=128,  # Maximum length for generation
                num_return_sequences=1  # Number of sequences to return per input
            )
        tokenizer.batch_decode(generated_outputs)


    for i in range(len(dataset)):
        source_text = gen_lang_msg + dataset[i]['translation'][gen_lang]
        inputs = tokenizer(source_text, return_tensors="pt", max_length=128, truncation=True).to(device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=128)
        inter_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
        inter_lang_text = inter_lang_msg + inter_translation
        inter_inputs = tokenizer(inter_lang_text, return_tensors="pt", max_length=128, truncation=True).to(device)

        with torch.no_grad():
            final_out = validate_model.generate(**inter_inputs, max_length=128)
        gen_lang_translation = tokenizer.decode(final_out, skip_special_tokens=True)

        # Add generated synthetic data
        synthetic_data["translation"][gen_lang].append(gen_lang_translation)
        synthetic_data["translation"][inter_lang].append(inter_translation)

    return Dataset.from_dict(synthetic_data)

In [56]:
local_checkpoint = "checkpoint-19065"

model_fr_to_en = T5ForConditionalGeneration.from_pretrained(f"t5_fr_to_en/{local_checkpoint}/").to(device)
# tokenizer_fr_to_en = T5TokenizerFast.from_pretrained(f"t5_fr_to_en/{local_checkpoint}/")

In [54]:
index = 70

fr_sent = tokenized_books_fr_to_en["test"][index]["translation"]['fr']
text = f"translate French to English: {fr_sent}"
print(text, tokenized_books_fr_to_en["test"][index]["translation"]['en'], sep="\n")
inputs = tokenizer(text, return_tensors="pt").input_ids.to(device)

translate French to English: -- C'est précisément pour ce motif, Sir John, que je mise à deux contre un en sa faveur.
"It is precisely for that reason, Sir John, that I am laying odds of two to one in his favour."


In [55]:
outputs = model_fr_to_en.generate(inputs, max_new_tokens=40)

tokenizer.decode(outputs[0], skip_special_tokens=True)

'"It is precisely for this motif, Sir John, that I put two against one in his favor."'

In [62]:
tokenizer.decode(tokenized_books_en_to_fr["train"][0]["input_ids"], skip_special_tokens=True)

'translate English to French: What a stroke was this for poor Jane! who would willingly have gone through the world without believing that so much wickedness existed in the whole race of mankind, as was here collected in one individual.'

In [75]:
source_text = tokenized_books_en_to_fr["train"][0]["translation"]['fr']
inputs = tokenizer(source_text, return_tensors="pt", max_length=128, truncation=True).to(device)
outputs = model_fr_to_en.generate(**inputs, max_length=128)
generated_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
generated_translation

'What coup for the poor Jane who has wandered the world in a whole world without imagining that it exists in any humanity as noirceur as she finds in this moment in a single man?'

In [None]:
def recursive_train(model, data, validate_model, num_iters):
    validate_model.eval()

    for i in range(num_iters):
        print(f"\nIteration {i + 1}/{num_iters}")

        # train model for 1 epoch
        finetune_model(model, f"test_model{i+1}", 1, data)

        synth_data = generate_synthetic_data(model, data["train"], tokenizer, validate_model)

        synth_dataset = synth_data.map(preprocess_function_en_to_fr, batched=True)

        data["train"] = synth_dataset

        torch.cuda.empty_cache()



In [26]:
tokenized_books_fr_to_en["train"][""]

Dataset({
    features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 101668
})

In [29]:
# class TranslationDataset(torch.utils.data.Dataset):
#     def __init__(self, dataset, source="en", target="fr"):
#         self.dataset = dataset

#     def __len__(self):
#         return len(self.dataset)

#     def __getitem__(self, idx):
#         item = self.dataset[idx]
#         input = 



In [169]:
tokenized_test = tokenized_books_en_to_fr["train"]

tokenized_datasets = tokenized_test.remove_columns(books["train"].column_names)

In [170]:
loader = DataLoader(tokenized_datasets, batch_size = 16, collate_fn = data_collator, pin_memory=True)

In [190]:
from datasets import concatenate_datasets

dataset_list = []
english_sentences = []
french_sentences = []

for i, tokenized_inputs in enumerate(loader):
    with torch.no_grad():
        generated_outputs = model_en_to_fr.generate(
            input_ids=tokenized_inputs['input_ids'].to(device),
            attention_mask=tokenized_inputs['attention_mask'].to(device),
            max_length=128,  # Maximum length for generation
            num_return_sequences=1  # Number of sequences to return per input
        )

    decoded = tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
    english_sentences.extend(decoded)
    decoded = list(map(lambda s: "translate French to English: " + s, decoded))

    encoded = tokenizer(decoded, return_tensors="pt", max_length=128, padding=True, truncation=True)
    dataset_list.append(Dataset.from_dict(encoded))
    
    if i == 4:
        break

test_dataset = concatenate_datasets(dataset_list)

loader2 = DataLoader(test_dataset, batch_size = 16, collate_fn = data_collator, pin_memory=True)

for i, tokenized_inputs in enumerate(loader2):
    with torch.no_grad():
        generated_outputs = model_fr_to_en.generate(
            input_ids=tokenized_inputs['input_ids'].to(device),
            attention_mask=tokenized_inputs['attention_mask'].to(device),
            max_length=128,  # Maximum length for generation
            num_return_sequences=1  # Number of sequences to return per input
        )
    decoded = tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
    french_sentences.extend(decoded)

synthetic_data = {"translation": []}

for eng, fr in zip(english_sentences, french_sentences):
    elem = {"en": eng, "fr": fr}
    synthetic_data["translation"].append(elem)

[{'en': "Ce fut un coup pour la pauvre Jane! qui aurait volontairement traversé le monde sans croire que l'humanité avait tant de méchantes, comme c'était ici rassemblée dans un seul individu.",
  'fr': 'This was a coup for the poor Jane! who would have deliberately traversed the world without believing that the humanity had tant of minglings, as it had gathered in one individual.'},
 {'en': "Le sol s'élevait appréciablement en s'éloignant des plats de sable par les vagues, et nous arrivons bientôt à quelques longs gradients en vent, véritablement abrupts, qui nous permettent de grimper peu à peu; mais nous devions s'en tenir prudentement au milieu de pierres de pudding qui n'étaient pas cédées ensemble, et nos pieds s'enfonçait en trachyte en verre, ",
  'fr': 'The soil was a little awry, and we were soon able to reach some long gradients in vent, which we were a little snoring; but we sat prudently in the middle of a stone of pudding which were not coiled together, and our feet were 

In [None]:
for val in test_dataset["input_ids"]:
    print(len(val))

In [128]:
for i in range(len(test_dataset)):
    print(test_dataset["input_ids"][i], test_dataset["attention_mask"][i])

[[363, 4897, 21, 8, 2714, 8158, 113, 133, 43, 10735, 15, 26, 8, 296, 16, 423, 406, 1631, 24, 3, 88, 16415, 16, 66, 8, 12540, 38, 2164, 38, 255, 141, 3883, 16, 48, 798, 16, 3, 9, 712, 388, 55, 1]] [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
[[37, 9467, 47, 11743, 120, 3, 7, 40, 3868, 45, 8, 3, 60, 7, 9, 109, 13, 8, 8882, 17, 7, 6, 11, 62, 130, 1116, 1107, 12, 8, 307, 11, 3731, 13281, 10785, 7, 6, 490, 15941, 8901, 7, 84, 2225, 178, 12, 3, 7, 40, 265, 385, 12, 385, 6, 68, 3, 88, 4728, 12, 8, 2214, 13, 175, 3, 18, 975, 122, 17551, 144, 7, 6, 84, 150, 3, 75, 23, 297, 3, 7361, 344, 135, 6, 11, 8, 2418, 3, 25951, 147, 135, 5, 1]] [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
[[299, 1919, 