Finetune with MBart 


Tryout from preprocessing custom dataset for huggingface!!!!

In [19]:
from datasets import load_dataset, concatenate_datasets
import transformers
from transformers import AutoModel, AutoTokenizer, pipeline, Trainer, TrainingArguments, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import evaluate


Dataload using Huggingface 

In [20]:
file_path = "/home/sumire/main/discourse_context_mt/data/BSD-master/"
data_files = {"train": f"{file_path}train.json", "validation": f"{file_path}dev.json", "test": f"{file_path}test.json"}
dataset = load_dataset("json", data_files=data_files)
dataset

Using custom data configuration default-7afdfa849fc27a17
Found cached dataset json (/home/sumire/.cache/huggingface/datasets/json/default-7afdfa849fc27a17/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tag', 'title', 'original_language', 'conversation'],
        num_rows: 670
    })
    validation: Dataset({
        features: ['id', 'tag', 'title', 'original_language', 'conversation'],
        num_rows: 69
    })
    test: Dataset({
        features: ['id', 'tag', 'title', 'original_language', 'conversation'],
        num_rows: 69
    })
})

In [21]:
# define train inputs and targets
inputs = [sent['en_sentence'] for doc in dataset["train"]["conversation"] for sent in doc]
targets = [sent['ja_sentence'] for doc in dataset["train"]["conversation"] for sent in doc]

Preprocess Using Huggingface

In [22]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-jap"
#configuration = MBartConfig()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,  return_tensors="pt")
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

max_length = 128

def preprocess_function(data): # data should be splitted into train / dev / test internally
    inputs = [sent['en_sentence'] for doc in data["conversation"] for sent in doc]
    targets = [sent['ja_sentence'] for doc in data["conversation"] for sent in doc]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

loading configuration file config.json from cache at /home/sumire/.cache/huggingface/hub/models--Helsinki-NLP--opus-mt-en-jap/snapshots/e2beebe7a6e44c1ba8751a68b98f38d10dfbd1db/config.json
Model config MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-en-jap",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      46275
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 46275,
  "decoder_vocab_size": 46276,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "id2label": {
  

In [23]:
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Loading cached processed dataset at /home/sumire/.cache/huggingface/datasets/json/default-7afdfa849fc27a17/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-0c0a6337b2e69d86.arrow
Loading cached processed dataset at /home/sumire/.cache/huggingface/datasets/json/default-7afdfa849fc27a17/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-c247db31d5483dbe.arrow
Loading cached processed dataset at /home/sumire/.cache/huggingface/datasets/json/default-7afdfa849fc27a17/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-72958d014d3a0ec5.arrow


In [24]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2051
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2120
    })
})

DataCollator

In [25]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt") # not tf

Define Batch using Datacollator
###To Chryssa, decoder_input_ids now exists!####

In [26]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

# 'decoder_input_ids' is missing

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [27]:
batch["labels"]

tensor([[    6,     1,    92, 10214, 18949,    88,  4196,  5991,     1,     0],
        [    6,  3862,   861, 15749,  2894, 10312,     1,     0,  -100,  -100]])

In [28]:
batch["decoder_input_ids"]

tensor([[46275,     6,     1,    92, 10214, 18949,    88,  4196,  5991,     1],
        [46275,     6,  3862,   861, 15749,  2894, 10312,     1,     0, 46275]])

In [29]:
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[6, 1, 92, 10214, 18949, 88, 4196, 5991, 1, 0]
[6, 3862, 861, 15749, 2894, 10312, 1, 0]


Fine-tune using Trainer

In [30]:
tokenized_datasets["validation"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2051
})

In [31]:
training_args = TrainingArguments(
    output_dir='./results',          
    logging_dir='./logs',            
    num_train_epochs=1, #3             
    per_device_train_batch_size=1, #16  
    per_device_eval_batch_size=1,  #64 
    warmup_steps=500,                
    weight_decay=0.01,
    report_to="all",
)

trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=tokenized_datasets["train"],        
    eval_dataset=tokenized_datasets["validation"],            
    data_collator=data_collator,
    tokenizer=tokenizer
)

PyTorch: setting up devices


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
trainer.train()

***** Running training *****
  Num examples = 20000
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
  Number of trainable parameters = 610851840


RuntimeError: CUDA out of memory. Tried to allocate 978.00 MiB (GPU 0; 11.91 GiB total capacity; 10.88 GiB already allocated; 318.94 MiB free; 11.02 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Training without using Trainer
##Doesn't work###

inputs = [sent['en_sentence'] for doc in data["train"]["conversation"] for sent in doc]
targets = [sent['ja_sentence'] for doc in data["train"]["conversation"] for sent in doc]

inputs = tokenizer(inputs, text_target=targets, return_tensors="pt", padding=True)
print (inputs)

model(**inputs)

{'input_ids': tensor([[  2673,    903,     83,  ...,      1,      1,      1],
        [  2646,   9351,     83,  ...,      1,      1,      1],
        [ 25689,    398,    237,  ...,      1,      1,      1],
        ...,
        [    87,     25,   1181,  ...,      1,      1,      1],
        [240095,    764,   1221,  ...,      1,      1,      1],
        [    87,  15673,    221,  ...,      1,      1,      1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[     6, 135855,     37,  ...,      1,      1,      1],
        [   572,   6808,    154,  ...,      1,      1,      1],
        [     6,  38566, 160517,  ...,      1,      1,      1],
        ...,
        [     6,  81766, 102535,  ...,      1,      1,      1],
        [     6,  47181,    281,  ...,      1,      1,      1],
       

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)

BLEU metrics

In [None]:
import evaluate 


In [None]:
metric = evaluate.load("sacrebleu")

In [None]:
metric

EvaluationModule(name: "sacrebleu", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Produces BLEU scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens.
    references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length).
    smooth_method (`str`): The smoothing method to use, defaults to `'e