In [1]:
import pandas as pd
import torch
from datasets import load_dataset, load_metric
from transformers import (
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
)

# Ask PyTorch's caching allocator to release unused cached GPU memory before the
# model is loaded (helps when the kernel is reused after an earlier run).
torch.cuda.empty_cache()

In [2]:
# Load the parallel corpus and trim stray whitespace from both text columns.
shake_df = pd.read_csv('data/shakespeare/train_minimal.csv')
shake_df['original_text_cleaned'] = shake_df.original_text_cleaned.str.strip()
shake_df['modern_text_cleaned'] = shake_df.modern_text_cleaned.str.strip()

# Show one random (modern -> original) pair as a sanity check.
# `tgt.index` holds index *labels*, so the matching source row must be looked up
# with the label-based `.loc`; the positional `.iloc` only happens to agree while
# the frame still has its default 0..n-1 index.
tgt = shake_df.original_text_cleaned.sample()
tgt_str = tgt.values.astype(str)[0]
src = shake_df.modern_text_cleaned.loc[tgt.index]
src_str = src.values.astype(str)[0]
print(f'source example: {src_str}')
print(f'target example: {tgt_str}')

source example: i ll listen to brutus
target example: i will hear brutus speak


In [3]:
# Snippet for getting the text column as a plain list of strings (first five shown).
shake_df.modern_text_cleaned.head(5).tolist()

['is that enough of an answer',
 'all right ursula as beatrice arrives we ll be walking up and down this alley and speaking about nothing but benedick',
 'no no i m as ugly as a bear since animals that see me run away in terror',
 'you agree now that we re not imagining this don t you',
 'okay but when']

In [4]:
# Tokenizer and model are loaded from the same pretrained checkpoint; source and
# target language codes are both English because the task is English -> English.
checkpoint = "facebook/mbart-large-50"
tokenizer = MBart50TokenizerFast.from_pretrained(checkpoint, src_lang="en_XX", tgt_lang="en_XX")
model = MBartForConditionalGeneration.from_pretrained(checkpoint)

#### Training?

In [5]:
# Keep only the two text columns, under the generic names the rest of the
# notebook uses, and write them out for `load_dataset`.
train_renames = {'original_text_cleaned': 'target', 'modern_text_cleaned': 'source'}
shake_df = shake_df.rename(columns=train_renames)[['source', 'target']]
shake_df.to_csv('data/shakespeare/train_minimal_renamed.csv', index=False)

In [6]:
# Build the evaluation CSV with the same two-column (source, target) layout
# as the training CSV.
unused_eval_columns = [
    'play_line_id', 'title', 'line', 'original_text', 'modern_text', 'combined_text_clean',
]
eval_renames = {'original_text_cleaned': 'target', 'modern_text_cleaned': 'source'}
shake_eval = (
    pd.read_csv('data/shakespeare/dev.csv')
    .drop(columns=unused_eval_columns)
    .rename(columns=eval_renames)
    .loc[:, ['source', 'target']]
)
shake_eval.to_csv('data/shakespeare/dev_minimal_renamed.csv', index=False)

In [7]:
# Map each split name to its CSV and let `datasets` build a DatasetDict from them.
data_files = {
    'train': 'data/shakespeare/train_minimal_renamed.csv',
    'eval': 'data/shakespeare/dev_minimal_renamed.csv',
}
#test_dataset = load_dataset('csv', data_files={'test': 'data/shakespeare/test_minimal_renamed.csv'})
dataset = load_dataset("csv", data_files=data_files)
dataset

Using custom data configuration default-167304bfc1a3f1b0


Downloading and preparing dataset csv/default to /home/sryanlee/.cache/huggingface/datasets/csv/default-167304bfc1a3f1b0/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/sryanlee/.cache/huggingface/datasets/csv/default-167304bfc1a3f1b0/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 16863
    })
    eval: Dataset({
        features: ['source', 'target'],
        num_rows: 2108
    })
})

In [8]:
# Spot-check a single training source sentence (row first, then column).
dataset['train'][50]['source']

'it s nice to see children playing well together'

#### Tokenizing Inputs and Targets

[FROM HUGGINGFACE TRANSLATION TUTORIAL](https://huggingface.co/course/chapter7/4?fw=tf#using-the-finetuned-model)

The context manager `as_target_tokenizer()` sets the tokenizer to the output language (French in the tutorial) before the indented block is executed, then sets it back to the input language (English in the tutorial). In this notebook both the source and the target language are `en_XX`.

So, preprocessing one sample looks like this:


```python
en_sentence = split_datasets["train"][1]["translation"]["en"]
fr_sentence = split_datasets["train"][1]["translation"]["fr"]

inputs = tokenizer(en_sentence)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(fr_sentence)
```

If we forget to tokenize the targets inside the context manager, they will be tokenized by the input tokenizer, which in the case of a Marian model is not going to go well at all:

```python
wrong_targets = tokenizer(fr_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))
```

In [9]:
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        examples: a batch from `dataset.map(..., batched=True)`; a mapping with
            a "source" and a "target" entry, each a list of strings.

    Returns:
        The tokenizer output for the sources (`input_ids`, `attention_mask`)
        with an added "labels" entry holding the target token ids. Everything
        is padded/truncated to the fixed lengths above. Padding positions in
        "labels" are set to -100 so the loss skips them; code that decodes
        labels must map -100 back to the pad token id first.
    """
    model_inputs = tokenizer(
        list(examples["source"]),
        padding='max_length',
        max_length=max_input_length,
        truncation=True,
    )

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            list(examples["target"]),
            padding='max_length',
            max_length=max_target_length,
            truncation=True,
        )

    # The model's cross-entropy loss only ignores label positions equal to -100.
    # Leaving <pad> ids in place would score the padding tail of every target
    # (most of the 128 positions for short lines) as real tokens to predict.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

In [10]:
# Tokenize every split in batches; the raw text columns are dropped so only
# the model-ready fields remain.
raw_columns = dataset["train"].column_names
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=raw_columns)
tokenized_datasets

  0%|          | 0/17 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 16863
    })
    eval: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2108
    })
})

In [11]:
# Decode the first tokenized training example back to text to check the
# language-code prefix, the end-of-sentence token and the padding.
# Indexing the row first avoids materialising a whole column for one value.
first_example = tokenized_datasets['train'][0]
tokenizer.decode(first_example['input_ids'])

'en_XX is that enough of an answer</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [19]:
from transformers import DataCollatorForSeq2Seq

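# Is the collator only needed for the TensorFlow format, and unnecessary for the PyTorch format?
# According to the tutorial docs, the collator is needed so that label padding is -100 and is ignored in loss computations.
# According to the PreTrainedTokenizer docs, any <pad> token is ignored in loss computations.
# NOTE: that reading looks wrong -- the model's loss only skips label positions equal to -100,
# so <pad> ids left in the labels are still scored unless they are replaced with -100.
# The mBART-50 tokenizer inherits from PreTrainedTokenizer.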

#data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

In [12]:
# Have the datasets return PyTorch tensors for the fields the model consumes.
tensor_columns = ['input_ids', 'attention_mask', 'labels']
tokenized_datasets.set_format(type='torch', columns=tensor_columns)

In [13]:
# Confirm that rows now come back as tensors.
first_row = tokenized_datasets['train'][0]
print(first_row)

{'input_ids': tensor([250004,     83,    450,  20174,    111,    142,  35166,      2,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,   

##### Metrics

One weakness with BLEU is that it expects the text to already be tokenized, which makes it difficult to compare scores between models that use different tokenizers. So instead, the most commonly used metric for benchmarking translation models today is SacreBLEU, which addresses this weakness (and others) by standardizing the tokenization step. To use this metric, we first need to install the SacreBLEU library. (Note: the cell below currently loads plain `bleu`; the `sacrebleu` line is commented out.)

In [14]:
metric = load_metric("bleu")
# metric = load_metric("sacrebleu")


def _predictions_to_token_ids(predictions):
    """Reduce trainer predictions to a 2-D tensor of token ids.

    Accepts generated token ids as-is. If logits are passed instead (possibly
    as the first element of a tuple of model outputs), the arg-max token is
    taken at each position.
    """
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    token_ids = torch.as_tensor(predictions)
    if token_ids.dim() == 3:
        token_ids = token_ids.argmax(dim=-1)
    return token_ids


def _decode_token_ids(token_ids):
    """Decode a batch of token ids into stripped strings.

    Negative ids (the -100 "ignore" marker used for labels and for padding
    between batches) are replaced by the pad token first, because the
    tokenizer cannot decode them.
    """
    token_ids = torch.as_tensor(token_ids)
    token_ids = token_ids.masked_fill(token_ids < 0, tokenizer.pad_token_id)
    texts = tokenizer.batch_decode(token_ids.tolist(), skip_special_tokens=True)
    return [text.strip() for text in texts]


def _corpus_bleu(pred_texts, label_texts):
    """Score predictions against one reference each with the loaded BLEU metric."""
    # The "bleu" metric expects pre-tokenized input: a token list per prediction
    # and a list of reference token lists per prediction. The corpus text is
    # already lower-cased and cleaned, so whitespace splitting is used.
    result = metric.compute(
        predictions=[text.split() for text in pred_texts],
        references=[[text.split()] for text in label_texts],
    )
    # "bleu" is this metric's score key ("score" belongs to sacrebleu).
    return {"bleu": result["bleu"]}


def compute_metrics(eval_pred=None):
    """Compute corpus BLEU for the model.

    Args:
        eval_pred: the object a `Trainer` passes to `compute_metrics`, exposing
            `.predictions` and `.label_ids`. When omitted, the function instead
            draws up to 100 random rows from `tokenized_datasets["eval"]`,
            generates translations with the global `model`, and scores those.

    Returns:
        `{"bleu": <score>}`.
    """
    if eval_pred is not None:
        pred_texts = _decode_token_ids(_predictions_to_token_ids(eval_pred.predictions))
        label_texts = _decode_token_ids(eval_pred.label_ids)
        return _corpus_bleu(pred_texts, label_texts)

    eval_split = tokenized_datasets["eval"]
    sample_size = min(100, len(eval_split))
    sampled_dataset = eval_split.shuffle().select(range(sample_size))
    sampled_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    dataloader = torch.utils.data.DataLoader(sampled_dataset, batch_size=8, shuffle=False)

    # Batches come off the DataLoader on the CPU; generation needs them on the
    # same device as the model.
    device = model.device
    pred_texts, label_texts = [], []
    for batch in dataloader:
        generated = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
        # extend (not append) so both lists stay flat: one string per example.
        pred_texts.extend(_decode_token_ids(generated))
        label_texts.extend(_decode_token_ids(batch["labels"]))

    # Score once, after all batches have been collected.
    return _corpus_bleu(pred_texts, label_texts)

In [15]:
training_args = TrainingArguments(
    output_dir = 'MBART_training',
    num_train_epochs = 3,
    evaluation_strategy = 'steps',
    eval_steps = 500,
    learning_rate=1e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    seed =0,
    load_best_model_at_end = True
#     do_train = True,
#     do_eval = True,
#     logging_strategy = 'epoch',
#     metric_for_best_model = 'eval_loss',
#     warmup_steps = 250,
#     weight_decay = 0.01,
)

In [16]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    compute_metrics=compute_metrics,
    #callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

In [17]:
# Run fine-tuning and display the returned training summary.
train_output = trainer.train()
train_output

***** Running training *****
  Num examples = 16863
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6324
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33ms-ryanlee[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 2108
  Batch size = 8


RuntimeError: CUDA out of memory. Tried to allocate 2.86 GiB (GPU 0; 15.78 GiB total capacity; 11.98 GiB already allocated; 2.54 GiB free; 12.03 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Persist the fine-tuned model to a local directory.
model_dir = 'en-shake-mbart50-model'
trainer.save_model(model_dir)

In [None]:
# Evaluate on the trainer's evaluation dataset and display the metrics.
eval_metrics = trainer.evaluate()
eval_metrics

In [None]:
# outputs = trainer.predict(tokenized_test_dataset)
# pred = outputs.predictions