In [138]:
import datasets
from datasets import load_dataset

In [139]:
# Get user's home directory
import os
home = os.path.expanduser("~")

# Define the path of the cache directory
cache_dir = os.path.join(home, ".cache", "huggingface", "datasets")

# Define the name and configuration of the dataset
dataset_name = "wmt14"
config_name = "fr-en"

# Build the path for the specific dataset configuration
dataset_config_path = os.path.join(cache_dir, dataset_name, config_name)

print(f"Checking cache at: {dataset_config_path}")

# Check if the dataset configuration is already cached
if os.path.exists(dataset_config_path) and len(os.listdir(dataset_config_path)) > 0:
    print("Dataset already downloaded, loading from cache.")
    # If the dataset is already downloaded, load it from the cache directory
    dataset = load_dataset(dataset_name, config_name, cache_dir=cache_dir)
else:
    print("Downloading the dataset.")
    # Download the dataset and specify the cache directory
    dataset = load_dataset(dataset_name, config_name, cache_dir=cache_dir)

# Here, you should adjust the loading of subsets to avoid redundant downloads or loading.
# Load 50k rows of the train dataset
train_dataset = dataset["train"].select(range(100020))
# train_dataset = dataset["train"].select(range(600))

# Keep the full valid and test datasets
valid_dataset = dataset["validation"]
test_dataset = dataset["test"]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [73]:
from transformers import AutoTokenizer, MarianMTModel, AutoModelForSeq2SeqLM

# checkpoint = "google-t5/t5-small"
src = "en"  # source language
trg = "fr"  # target language

checkpoint_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
# checkpoint = MarianMTModel.from_pretrained(checkpoint_name)
checkpoint = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [99]:
test_dataset[:10]['translation']

[{'en': 'Spectacular Wingsuit Jump Over Bogota',
  'fr': 'Spectaculaire saut en "wingsuit" au-dessus de Bogota'},
 {'en': 'Sportsman Jhonathan Florez jumped from a helicopter above Bogota, the capital of Colombia, on Thursday.',
  'fr': "Le sportif Jhonathan Florez a sauté jeudi d'un hélicoptère au-dessus de Bogota, la capitale colombienne."},
 {'en': 'Wearing a wingsuit, he flew past over the famous Monserrate Sanctuary at 160km/h. The sanctuary is located at an altitude of over 3000 meters and numerous spectators had gathered there to watch his exploit.',
  'fr': "Equipé d'un wingsuit (une combinaison munie d'ailes), il est passé à 160 km/h au-dessus du célèbre sanctuaire Monserrate, situé à plus de 3 000 mètres d'altitude, où de nombreux badauds s'étaient rassemblés pour observer son exploit."},
 {'en': 'A black box in your car?',
  'fr': 'Une boîte noire dans votre voiture\xa0?'},
 {'en': "As America's road planners struggle to find the cash to mend a crumbling highway system, many

In [93]:

def preprocess_old(examples):

    prefix = ">>fr<<"
    src = "en"  # source language
    trg = "fr"  # target language

    tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
    en = [prefix + example[src] for example in examples['translation']]
    fr = [example[trg] for example in examples['translation']]
    print("en", en)

    # Tokenize the text data
    # model_inputs = tokenizer(en, text_target=fr, max_length=128, truncation=True)
    model_inputs = tokenizer(en, text_target=fr, padding='max_length', max_length=128, truncation=True)

    return model_inputs
    

In [121]:
def preprocess(examples, tokenizer):
    prefix = ">>fr<<"
    src = "en"  # source language
    trg = "fr"  # target language

    en_texts = [prefix + example[src] for example in examples['translation']]
    fr_texts = [example[trg] for example in examples['translation']]

    # Tokenize the English texts
    model_inputs = tokenizer(en_texts, padding='max_length', max_length=128, truncation=True)

    # Tokenize the French texts separately to create the labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(fr_texts, padding='max_length', max_length=128, truncation=True)["input_ids"]

    # Set labels in model_inputs
    model_inputs["labels"] = labels
    return model_inputs


In [122]:
tokenized_train_datasets = train_dataset.map(preprocess, batched=True, fn_kwargs={'tokenizer': tokenizer})
tokenized_valid_datasets = valid_dataset.map(preprocess, batched=True, fn_kwargs={'tokenizer': tokenizer})
tokenized_test_datasets = test_dataset.map(preprocess, batched=True, fn_kwargs={'tokenizer': tokenizer})

In [123]:
tokenized_train_datasets

Dataset({
    features: ['translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 600
})

In [124]:
type(tokenized_train_datasets)

datasets.arrow_dataset.Dataset

In [None]:
tokenized_train_datasets[0]['labels']

In [129]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)

In [131]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [132]:
import os
getpwd = os.getcwd()
src = "en"  # source language
trg = "fr"  # target language

checkpoint_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"

# model_output_name = checkpoint_name + "_wmt14_En_Fr_1million_20epochs"
model_output_name = checkpoint_name + "_wmt14_En_Fr_600sents_2epc"
output_dir = os.path.join(getpwd, "checkpoints", model_output_name)

In [133]:
# model_output_name
output_dir

'/home/paperspace/google_drive_v4/Research_Thesis/2024/Adversarial_NMT_th/checkpoints/Helsinki-NLP/opus-mt-en-fr_wmt14_En_Fr_600sents_2epc'

In [134]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint_name, max_length=128)

In [135]:
import evaluate

metric = evaluate.load("sacrebleu")

In [136]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [137]:
training_args = Seq2SeqTrainingArguments(
    # output_dir="/workspace/2024/Adversarial_NMT_th/checkpoints/T5_wmt14_En_Fr_1million",
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=15,
    per_device_eval_batch_size=15,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_valid_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.




Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,1.404811,0.0049,511.0


KeyboardInterrupt: 

In [None]:
trainer.save_model(output_dir)

# OLD

In [21]:
source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [22]:
tokenized_train_datasets = train_dataset.map(preprocess_function, batched=True)
tokenized_valid_datasets = valid_dataset.map(preprocess_function, batched=True)
tokenized_test_datasets = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/100020 [00:00<?, ? examples/s]

In [23]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, max_length=128)

In [24]:
import evaluate

metric = evaluate.load("sacrebleu")

In [25]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [26]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [27]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
# import transformers

# print(transformers.__version__)

In [44]:
import os
getpwd = os.getcwd()
model_output_name = "T5_wmt14_En_Fr_1million"
output_dir = os.path.join(getpwd, "checkpoints", model_output_name)

In [39]:
# os.path.join(getpwd + "/checkpoints/" + model_output_name)

'/workspace/2024/Adversarial_NMT_th/checkpoints/T5_wmt14_En_Fr_1million'

In [45]:
output_dir

'/workspace/2024/Adversarial_NMT_th/checkpoints/T5_wmt14_En_Fr_1million'

In [29]:
training_args = Seq2SeqTrainingArguments(
    # output_dir="/workspace/2024/Adversarial_NMT_th/checkpoints/T5_wmt14_En_Fr_1million",
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=15,
    per_device_eval_batch_size=15,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_valid_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model("/workspace/2024/Adversarial_NMT_th/checkpoints/T5_wmt14_En_Fr_1million")

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.0796,1.187222,9.2959,18.0253
2,1.01,1.202858,9.1594,18.0187
3,0.9686,1.211359,9.2836,18.0123
4,0.9366,1.226113,9.18,17.995
5,0.8999,1.23188,9.2754,17.9793
6,0.8769,1.241295,9.1705,18.026
7,0.8536,1.250221,9.036,17.9987
8,0.8273,1.263332,9.2003,18.006
9,0.8125,1.274042,9.0991,18.009
10,0.7905,1.283455,8.9005,18.007




pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]