In [39]:
import datasets
from datasets import load_dataset

In [40]:
# Get user's home directory
import os
home = os.path.expanduser("~")

# Define the path of the cache directory
cache_dir = os.path.join(home, ".cache", "huggingface", "datasets")

# Define the name and configuration of the dataset
dataset_name = "wmt14"
config_name = "fr-en"

# Build the path for the specific dataset configuration
dataset_config_path = os.path.join(cache_dir, dataset_name, config_name)

print(f"Checking cache at: {dataset_config_path}")

# Check if the dataset configuration is already cached
if os.path.exists(dataset_config_path) and len(os.listdir(dataset_config_path)) > 0:
    print("Dataset already downloaded, loading from cache.")
    # If the dataset is already downloaded, load it from the cache directory
    dataset = load_dataset(dataset_name, config_name, cache_dir=cache_dir)
else:
    print("Downloading the dataset.")
    # Download the dataset and specify the cache directory
    dataset = load_dataset(dataset_name, config_name, cache_dir=cache_dir)

# Here, you should adjust the loading of subsets to avoid redundant downloads or loading.
# Load 50k rows of the train dataset
train_dataset = dataset["train"].select(range(100020))
# train_dataset = dataset["train"].select(range(600))

# Keep the full valid and test datasets
valid_dataset = dataset["validation"]
test_dataset = dataset["test"]

Checking cache at: /root/.cache/huggingface/datasets/wmt14/fr-en
Dataset already downloaded, loading from cache.


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [41]:
from transformers import AutoTokenizer, MarianMTModel, AutoModelForSeq2SeqLM

# checkpoint = "google-t5/t5-small"
src = "en"  # source language
trg = "fr"  # target language

checkpoint_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
# checkpoint = MarianMTModel.from_pretrained(checkpoint_name)
checkpoint = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [42]:
checkpoint_name

'Helsinki-NLP/opus-mt-en-fr'

In [43]:
def preprocess(examples, tokenizer):
    # prefix = ">>fr<<"
    prefix = ""
    src = "en"  # source language
    trg = "fr"  # target language

    en_texts = [prefix + example[src] for example in examples['translation']]
    # en_texts = [example[src] for example in examples['translation']]
    fr_texts = [example[trg] for example in examples['translation']]

    # Tokenize the English texts
    # model_inputs = tokenizer(en_texts, padding='max_length', max_length=128, truncation=True)
    model_inputs = tokenizer(en_texts, max_length=128, truncation=True)

    # Tokenize the French texts separately to create the labels
    with tokenizer.as_target_tokenizer():
        # labels = tokenizer(fr_texts, padding='max_length', max_length=128, truncation=True)["input_ids"]
        labels = tokenizer(fr_texts, max_length=128, truncation=True)["input_ids"]

    # Set labels in model_inputs
    model_inputs["labels"] = labels
    return model_inputs


In [44]:
tokenized_train_datasets = train_dataset.map(preprocess, batched=True, fn_kwargs={'tokenizer': tokenizer})
tokenized_valid_datasets = valid_dataset.map(preprocess, batched=True, fn_kwargs={'tokenizer': tokenizer})
tokenized_test_datasets = test_dataset.map(preprocess, batched=True, fn_kwargs={'tokenizer': tokenizer})

Map:   0%|          | 0/100020 [00:00<?, ? examples/s]



Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3003 [00:00<?, ? examples/s]

In [45]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint_name, max_length=128)

In [46]:
import evaluate

metric = evaluate.load("sacrebleu")

In [47]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [48]:
checkpoint_name

'Helsinki-NLP/opus-mt-en-fr'

In [49]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)

In [10]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [50]:
import os
getpwd = os.getcwd()
# src = "en"  # source language
# trg = "fr"  # target language

# checkpoint_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"

model_output_name = checkpoint_name + "_wmt14_En_Fr_1million_20epochs_v2"
# model_output_name = checkpoint_name + "_wmt14_En_Fr_600sents_2epc"
output_dir = os.path.join(getpwd, "checkpoints", model_output_name)

In [51]:
# model_output_name
output_dir

'/workspace/2024/Adversarial_NMT_th/checkpoints/Helsinki-NLP/opus-mt-en-fr_wmt14_En_Fr_1million_20epochs_v2'

In [54]:
training_args = Seq2SeqTrainingArguments(
    # output_dir="/workspace/2024/Adversarial_NMT_th/checkpoints/T5_wmt14_En_Fr_1million",
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=15,
    per_device_eval_batch_size=15,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    # fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_valid_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,6.3966,10.409004,0.0,511.0




TrainOutput(global_step=1667, training_loss=6.4152334816239875, metrics={'train_runtime': 1409.7605, 'train_samples_per_second': 70.948, 'train_steps_per_second': 1.182, 'total_flos': 2357339966668800.0, 'train_loss': 6.4152334816239875, 'epoch': 1.0})

In [14]:
trainer.save_model(output_dir)

pytorch_model.bin:   0%|          | 0.00/299M [00:00<?, ?B/s]

# Colab FineTuning

In [None]:
import os
os.environ["WANDB_DISABLED"]="true"

In [55]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"

In [56]:
from datasets import load_dataset, load_metric
# raw_datasets = load_dataset("wmt16", "ro-en")
metric = load_metric("sacrebleu")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [57]:
# Get user's home directory
import os
home = os.path.expanduser("~")

# Define the path of the cache directory
cache_dir = os.path.join(home, ".cache", "huggingface", "datasets")

# Define the name and configuration of the dataset
dataset_name = "wmt14"
config_name = "fr-en"

# Build the path for the specific dataset configuration
dataset_config_path = os.path.join(cache_dir, dataset_name, config_name)

print(f"Checking cache at: {dataset_config_path}")

# Check if the dataset configuration is already cached
if os.path.exists(dataset_config_path) and len(os.listdir(dataset_config_path)) > 0:
    print("Dataset already downloaded, loading from cache.")
    # If the dataset is already downloaded, load it from the cache directory
    dataset = load_dataset(dataset_name, config_name, cache_dir=cache_dir)
else:
    print("Downloading the dataset.")
    # Download the dataset and specify the cache directory
    dataset = load_dataset(dataset_name, config_name, cache_dir=cache_dir)

# Here, you should adjust the loading of subsets to avoid redundant downloads or loading.
# Load 50k rows of the train dataset
train_dataset = dataset["train"].select(range(100020))
# train_dataset = dataset["train"].select(range(600))

# Keep the full valid and test datasets
valid_dataset = dataset["validation"]
test_dataset = dataset["test"]

Checking cache at: /root/.cache/huggingface/datasets/wmt14/fr-en
Dataset already downloaded, loading from cache.


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [58]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML
def show_random_elements(dataset, num_examples=5):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))
show_random_elements(dataset["train"])

Unnamed: 0,translation
0,"{'en': 'Committee for Programme and Coordination', 'fr': 'Comité du programme et de la coordination'}"
1,"{'en': 'The problem is to have broader questions with alternatives that can give us at least a broad idea of what everybody thinks.', 'fr': 'Il s'agit, au contraire, d'avoir des questions plus vastes avec des alternatives susceptibles de nous donner une idée générale de ce que chacun pense.'}"
2,"{'en': 'Link to latest Business Outlook on the Web.', 'fr': 'Cliquer ici pour l'Analyse des perspectives du marché.'}"
3,"{'en': 'Advances to the working capital reserve were to be made in accordance with the regular budget rates of assessment applicable for 2007 in the scale of assessments for the period 2007-2009.', 'fr': 'Les avances à la réserve opérationnelle seraient versées selon les taux qui avaient été fixés pour les contributions au budget ordinaire en 2007 dans le barème des quotes-parts en vigueur pour la période 2007-2009.'}"
4,"{'en': 'CEDAW/C/SR.803 (A) Meeting held on 1 August 2007, 10 a.m. [A C E F R S]', 'fr': 'CEDAW/C/SR.803 (A) Séance tenue le 1er août 2007, 10 heures [A A C E F R]'}"


In [59]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "en"
target_lang = "fr"
def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [60]:
train_tokenized_datasets = train_dataset.map(preprocess_function, batched=True)
valid_tokenized_datasets = valid_dataset.map(preprocess_function, batched=True)

In [61]:
model_checkpoint

'Helsinki-NLP/opus-mt-en-fr'

In [62]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [63]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [64]:
batch_size = 16
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True,
    push_to_hub=True
)

In [65]:
import numpy as np
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [66]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=valid_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


training_args = Seq2SeqTrainingArguments(
    # output_dir="/workspace/2024/Adversarial_NMT_th/checkpoints/T5_wmt14_En_Fr_1million",
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=15,
    per_device_eval_batch_size=15,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_valid_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [25]:
trainer.train()



Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.1728,1.259459,34.1057,29.345




TrainOutput(global_step=1563, training_loss=1.1848399478963607, metrics={'train_runtime': 473.186, 'train_samples_per_second': 211.376, 'train_steps_per_second': 3.303, 'total_flos': 2380757380890624.0, 'train_loss': 1.1848399478963607, 'epoch': 1.0})

# OLD

In [21]:
source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [22]:
tokenized_train_datasets = train_dataset.map(preprocess_function, batched=True)
tokenized_valid_datasets = valid_dataset.map(preprocess_function, batched=True)
tokenized_test_datasets = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/100020 [00:00<?, ? examples/s]

In [23]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, max_length=128)

In [24]:
import evaluate

metric = evaluate.load("sacrebleu")

In [25]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [26]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [27]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
# import transformers

# print(transformers.__version__)

In [44]:
import os
getpwd = os.getcwd()
model_output_name = "T5_wmt14_En_Fr_1million"
output_dir = os.path.join(getpwd, "checkpoints", model_output_name)

In [39]:
# os.path.join(getpwd + "/checkpoints/" + model_output_name)

'/workspace/2024/Adversarial_NMT_th/checkpoints/T5_wmt14_En_Fr_1million'

In [45]:
output_dir

'/workspace/2024/Adversarial_NMT_th/checkpoints/T5_wmt14_En_Fr_1million'

In [29]:
training_args = Seq2SeqTrainingArguments(
    # output_dir="/workspace/2024/Adversarial_NMT_th/checkpoints/T5_wmt14_En_Fr_1million",
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=15,
    per_device_eval_batch_size=15,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_valid_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model("/workspace/2024/Adversarial_NMT_th/checkpoints/T5_wmt14_En_Fr_1million")

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.0796,1.187222,9.2959,18.0253
2,1.01,1.202858,9.1594,18.0187
3,0.9686,1.211359,9.2836,18.0123
4,0.9366,1.226113,9.18,17.995
5,0.8999,1.23188,9.2754,17.9793
6,0.8769,1.241295,9.1705,18.026
7,0.8536,1.250221,9.036,17.9987
8,0.8273,1.263332,9.2003,18.006
9,0.8125,1.274042,9.0991,18.009
10,0.7905,1.283455,8.9005,18.007




pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]