Finetune with MBart 


Tryout from preprocessing custom dataset for huggingface!!!!

In [25]:
from datasets import load_dataset, concatenate_datasets
import transformers
from transformers import MBartModel, MBartTokenizer, MBartConfig, pipeline, Trainer, TrainingArguments, MBartForConditionalGeneration, DataCollatorForSeq2Seq
import evaluate
import numpy as np


Dataload using Huggingface 

In [26]:
file_path = "/home/sumire/main/discourse_context_mt/data/BSD-master/"
data_files = {"train": f"{file_path}train.json", "validation": f"{file_path}dev.json", "test": f"{file_path}test.json"}
dataset = load_dataset("json", data_files=data_files)
dataset

Using custom data configuration default-7afdfa849fc27a17
Found cached dataset json (/home/sumire/.cache/huggingface/datasets/json/default-7afdfa849fc27a17/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tag', 'title', 'original_language', 'conversation'],
        num_rows: 670
    })
    validation: Dataset({
        features: ['id', 'tag', 'title', 'original_language', 'conversation'],
        num_rows: 69
    })
    test: Dataset({
        features: ['id', 'tag', 'title', 'original_language', 'conversation'],
        num_rows: 69
    })
})

In [27]:
# define train inputs and targets
inputs = [sent['en_sentence'] for doc in dataset["train"]["conversation"] for sent in doc]
targets = [sent['ja_sentence'] for doc in dataset["train"]["conversation"] for sent in doc]

Preprocess Using Huggingface

In [28]:
model_checkpoint = "facebook/mbart-large-cc25"
configuration = MBartConfig()
tokenizer = MBartTokenizer.from_pretrained(model_checkpoint, src_lang="en_XX", tgt_lang="ja_XX")
model = MBartForConditionalGeneration(configuration).from_pretrained(model_checkpoint)

max_length = 128

def preprocess_function(data): # data should be splitted into train / dev / test internally
    inputs = [sent['en_sentence'] for doc in data["conversation"] for sent in doc]
    targets = [sent['ja_sentence'] for doc in data["conversation"] for sent in doc]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

loading file sentencepiece.bpe.model from cache at /home/sumire/.cache/huggingface/hub/models--facebook--mbart-large-cc25/snapshots/2df0e6dd8a0e7f6df056fe4d0d95941a04b64e4f/sentencepiece.bpe.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /home/sumire/.cache/huggingface/hub/models--facebook--mbart-large-cc25/snapshots/2df0e6dd8a0e7f6df056fe4d0d95941a04b64e4f/config.json
Model config MBartConfig {
  "_name_or_path": "facebook/mbart-large-cc25",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "MBartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 

In [29]:
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Loading cached processed dataset at /home/sumire/.cache/huggingface/datasets/json/default-7afdfa849fc27a17/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-facd42d09ea03dc0.arrow
Loading cached processed dataset at /home/sumire/.cache/huggingface/datasets/json/default-7afdfa849fc27a17/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-072cc31a57c50689.arrow
Loading cached processed dataset at /home/sumire/.cache/huggingface/datasets/json/default-7afdfa849fc27a17/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-3ee830733a911a47.arrow


In [30]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2051
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2120
    })
})

DataCollator

In [31]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt") # not tf

Define Batch using Datacollator
###To Chryssa, decoder_input_ids now exists!####

In [32]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

# 'decoder_input_ids' is missing

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [33]:
batch["labels"]

tensor([[   572,   6808,    154,   1395,   2128,    610,  34606,   5182,     30,
              2, 250012],
        [     6,  38566, 160517,  14362,  55826,     30,      2, 250012,   -100,
           -100,   -100]])

In [34]:
batch["decoder_input_ids"]

tensor([[250012,    572,   6808,    154,   1395,   2128,    610,  34606,   5182,
             30,      2],
        [250012,      6,  38566, 160517,  14362,  55826,     30,      2, 250012,
              1,      1]])

In [35]:
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[572, 6808, 154, 1395, 2128, 610, 34606, 5182, 30, 2, 250012]
[6, 38566, 160517, 14362, 55826, 30, 2, 250012]


Fine-tune using Trainer

In [36]:
tokenized_datasets["validation"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2051
})

In [37]:
CUDA_LAUNCH_BLOCKING=1

In [38]:
training_args = TrainingArguments(
    output_dir='./results',          
    logging_dir='./logs',            
        num_train_epochs=1, #3             
    per_device_train_batch_size=1, #16  
    per_device_eval_batch_size=1,  #64 
    warmup_steps=500,                
    weight_decay=0.01,
    report_to="all",
    gradient_accumulation_steps=1000,
    half_precision_backend="apex"
)

trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=tokenized_datasets["train"],        
    eval_dataset=tokenized_datasets["validation"],            
    data_collator=data_collator,
    tokenizer=tokenizer
)

PyTorch: setting up devices


In [39]:
from torch import nn

def freeze_params(model: nn.Module):
    """Set requires_grad=False for each of model.parameters()"""
    for par in model.parameters():
        par.requires_grad = False

#model = AutoModel.from_pretrained("facebook/bart-large")
enc_layers = model.get_encoder().layers
freeze_params(enc_layers)  # freeze layer 0
dropout = enc_layers[0].dropout   # return dropout value for layer 0
enc_layers[0].dropout = 0.5  # set dropout value for layer 0

In [40]:
trainer.train()

***** Running training *****
  Num examples = 20000
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4000
  Gradient Accumulation steps = 1000
  Total optimization steps = 5
  Number of trainable parameters = 459697152


RuntimeError: CUDA out of memory. Tried to allocate 26.00 MiB (GPU 0; 11.91 GiB total capacity; 11.26 GiB already allocated; 26.94 MiB free; 11.30 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Training without using Trainer
##Doesn't work###

inputs = [sent['en_sentence'] for doc in dataset["train"]["conversation"] for sent in doc]
targets = [sent['ja_sentence'] for doc in dataset["train"]["conversation"] for sent in doc]

inputs = tokenizer(inputs, text_target=targets, return_tensors="pt", padding=True)
print (inputs)

model(**inputs)

{'input_ids': tensor([[  2673,    903,     83,  ...,      1,      1,      1],
        [  2646,   9351,     83,  ...,      1,      1,      1],
        [ 25689,    398,    237,  ...,      1,      1,      1],
        ...,
        [    87,     25,   1181,  ...,      1,      1,      1],
        [240095,    764,   1221,  ...,      1,      1,      1],
        [    87,  15673,    221,  ...,      1,      1,      1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[     6, 135855,     37,  ...,      1,      1,      1],
        [   572,   6808,    154,  ...,      1,      1,      1],
        [     6,  38566, 160517,  ...,      1,      1,      1],
        ...,
        [     6,  81766, 102535,  ...,      1,      1,      1],
        [     6,  47181,    281,  ...,      1,      1,      1],
       

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)

BLEU metrics

In [None]:
import evaluate 


In [None]:
metric = evaluate.load("sacrebleu")

In [None]:
metric

EvaluationModule(name: "sacrebleu", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Produces BLEU scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens.
    references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length).
    smooth_method (`str`): The smoothing method to use, defaults to `'e

In [None]:
def postprocess_text(preds, labels):
    preds = [preds.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

In [None]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, lables, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds)