<a href="https://colab.research.google.com/github/rocabrera/language-uncertainty/blob/master/finetunning_custom_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet datasets transformers sentencepiece

In [2]:
!nvidia-smi

Thu Jul 14 16:53:42 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import ast
from collections import Counter

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, concatenate_datasets, Dataset, DatasetDict
from transformers import (T5Tokenizer, 
                          T5ForConditionalGeneration,
                          Seq2SeqTrainer, 
                          Seq2SeqTrainingArguments)

In [4]:
# from google.colab import drive
# drive.mount('/content/drive/Introdução_ao_Aprendizado_Profundo/ProjetoV2/data')

# No futuro seria legal fazer um mount

In [5]:
def squad_tokenizer_prompt(tokenizer, sample: dict):
    """
    Encode the question/context pair of a SQuAD-style sample (or batch).

    The question is prefixed with 'question: ' and the context with
    'context: ', and both are passed to `tokenizer` as a text pair.
    `sample["question"]` selects the mode: a list means a batch (both columns
    are iterated element by element), anything else is a single example.

    Note: tensors must not be requested from the tokenizer here if this
    function is to be used in batched mode, so no `return_tensors` is passed
    and the tokenizer's result is returned unchanged.
    """
    question = sample["question"]

    if isinstance(question, list):
        first_segment = ['question: ' + q for q in question]
        second_segment = ['context: ' + c for c in sample["context"]]
    else:
        first_segment = 'question: ' + question
        second_segment = 'context: ' + sample["context"]

    tokenizer_options = dict(
        max_length=396,
        padding="max_length",
        # Believed to truncate only the context (second segment); this can be a
        # problem depending on where the answer sits inside the context.
        truncation="only_second",
        return_attention_mask=True,
        add_special_tokens=True,
    )
    return tokenizer(first_segment, second_segment, **tokenizer_options)


def squad_tokenizer_answer(tokenizer, sample: dict):
    """
    Tokenize the first reference answer of a SQuAD-style sample (or batch).

    `sample["answers"]` is either a list (batched call, one entry per row) or
    a single entry. An entry may be a dict holding a "text" list, or the string
    form of such a dict (the column is read back from a CSV as text).

    Returns a dict with:
      - "label": answer token ids padded/truncated to 32 positions, with every
        pad position replaced by -100. One id list per row for a batch, a
        single id list for one sample.
      - "first_answer": the text of the first answer (a list for a batch, a
        string for one sample).

    Previously the single-sample path raised NameError because `texts` was only
    assigned in the batched branch; both paths now share the same code.
    """

    def first_answer_text(entry):
        # The dict arrives as a string when it was read from the CSV. It used
        # to be parsed with eval(); ast.literal_eval accepts the same literal
        # without executing arbitrary expressions found in the data file.
        if isinstance(entry, str):
            entry = ast.literal_eval(entry)
        return entry["text"][0]  # only the first answer is tokenized

    answers = sample["answers"]
    is_batch = isinstance(answers, list)
    if is_batch:
        texts = [first_answer_text(entry) for entry in answers]
    else:
        texts = [first_answer_text(answers)]

    # Always tokenize as a batch and keep plain Python lists (no return_tensors),
    # so the result can be returned directly from a batched map.
    answer_encoding = tokenizer(
        texts,
        max_length=32,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
    )

    # -100 marks positions that must not contribute to the loss, see
    # https://huggingface.co/docs/transformers/model_doc/t5
    pad_id = tokenizer.pad_token_id
    labels_ids = [
        [-100 if token_id == pad_id else token_id for token_id in row]
        for row in answer_encoding["input_ids"]
    ]

    if is_batch:
        return {"label": labels_ids, "first_answer": texts}
    # Single sample: drop the batch dimension so the label has the same shape
    # as one row of the batched result.
    return {"label": labels_ids[0], "first_answer": texts[0]}

In [6]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# **Loading data**

In [7]:
# new_wiki = load_dataset("squadshifts", 'new_wiki')["test"]
# nyt = load_dataset("squadshifts", 'nyt')["test"]
# reddit = load_dataset("squadshifts", 'reddit')["test"]
# amazon = load_dataset("squadshifts", 'amazon')["test"]

# data = concatenate_datasets([new_wiki, nyt, reddit, amazon]).remove_columns(["id", "title"])

"""
Para fazer os batchs provavlemente vamos precisar usar: https://huggingface.co/docs/transformers/main_classes/data_collator
'Data collators are objects that will form a batch by using a list of dataset elements as input. 
 These elements are of the same type as the elements of train_dataset or eval_dataset.'
"""

# Read the local CSV file; its rows are accessed later through data["train"].
data = load_dataset('csv', data_files='df_paraphrase.csv')

Using custom data configuration default-f5bb923064df9206
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-f5bb923064df9206/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


  0%|          | 0/1 [00:00<?, ?it/s]

# **Load Model**

In [9]:
# One can use T5ForConditionalGeneration (or the Tensorflow/Flax variant), which includes the language modeling head on top of the decoder.
# Name of the base checkpoint; it is also used further down to build the output directory name.
MODEL_NAME = "t5-base"

# Tokenizer and model are both loaded from the same checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


# **Preprocessing**

In [10]:
def split_dataset(data: Dataset, holdout_size: float = 0.15, seed=None):
  """
  Split `data` into train / test / valid partitions.

  `holdout_size` is the fraction of rows removed from the training set
  (default 0.15, the value that used to be hard-coded). The held-out rows are
  then cut in half: one half becomes "test", the other "valid".

  `seed` is forwarded to both `train_test_split` calls. The default `None`
  leaves the shuffle unseeded, as before; pass an integer to get the same
  partitions on every run.

  Returns a DatasetDict with the keys "train", "test" and "valid".
  """
  # First cut: hold out `holdout_size` of the rows for evaluation.
  train_holdout = data.train_test_split(test_size=holdout_size, seed=seed)
  # Second cut: share the held-out rows equally between test and valid.
  test_valid = train_holdout['test'].train_test_split(test_size=0.5, seed=seed)
  # Gather everything in a single DatasetDict.
  return DatasetDict({
      'train': train_holdout['train'],
      'test': test_valid['test'],
      'valid': test_valid['train']})

# Build the three partitions from the "train" split of the loaded data and display the result.
new_dataset = split_dataset(data["train"]) ; new_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 216
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 20
    })
    valid: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 19
    })
})

In [11]:
def preprocessing(data: Dataset):
  """
  Tokenize one split for seq2seq training.

  Two batched `map` passes (num_proc=8 each) are chained: the first adds the
  output of `squad_tokenizer_prompt`, the second the output of
  `squad_tokenizer_answer`. The raw "answers" column is removed afterwards.
  Both passes use the module-level `tokenizer`.
  """

  def encode_prompt(batch):
    return squad_tokenizer_prompt(tokenizer, batch)

  def encode_answer(batch):
    return squad_tokenizer_answer(tokenizer, batch)

  with_prompt = data.map(encode_prompt, batched=True, num_proc=8)
  with_answer = with_prompt.map(encode_answer, batched=True, num_proc=8)
  return with_answer.remove_columns("answers")

# Run the same tokenization pipeline on each of the three partitions.
train = preprocessing(new_dataset["train"])
valid = preprocessing(new_dataset["valid"])
test = preprocessing(new_dataset["test"])



          

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

           

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

          

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

          

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

           

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

           

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

# **Training Model**

In [12]:
# Ref1: https://qa.fastforwardlabs.com/pytorch/hugging%20face/wikipedia/bert/transformers/2020/05/19/Getting_Started_with_QA.html
# Ref2: https://qa.fastforwardlabs.com/no%20answer/null%20threshold/bert/distilbert/exact%20match/f1/robust%20predictions/2020/06/09/Evaluating_BERT_on_SQuAD.html#:~:text=There%20are%20two%20dominant%20metrics,possible%20correct%20answers%20is%20computed.
def compute_exact_match(predict_text: str, label_text:str):
  """Return 1 when the prediction equals the reference exactly, otherwise 0."""
  if predict_text == label_text:
    return 1
  return 0


def compute_f1(predict_text: str, label_text:str):
    """
    Token-level F1 between a predicted answer and a reference answer.

    Both strings are split on whitespace. The overlap is counted as a
    multiset: a token counts as many times as it appears on both sides. The
    previous set-based overlap under-counted repeated tokens, so two identical
    strings containing a repeated word scored below 1.

    Returns:
      - 1 or 0 (int) when either side has no tokens: 1 only if both are empty;
      - 0 when the two sides share no token;
      - otherwise the harmonic mean of precision and recall (float).
    """
    pred_tokens = predict_text.split()
    truth_tokens = label_text.split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Counter & Counter keeps, per token, the smaller of the two counts.
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_common = sum(overlap.values())

    # if there are no common tokens then f1 = 0
    if num_common == 0:
        return 0

    prec = num_common / len(pred_tokens)
    rec = num_common / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

def compute_metrics(eval_preds):
  """
  Compute exact match and F1 for one evaluation pass.

  `eval_preds` unpacks into (predicted token ids, label token ids). Both are
  decoded with the module-level `tokenizer`, then every prediction is scored
  against its label text with `compute_exact_match` and `compute_f1`.

  Returns {"EM": mean exact match, "F1": mean F1}.
  """
  token_ids, labels_ids = eval_preds
  pad_id = tokenizer.pad_token_id

  # -100 is not a valid token id and cannot be decoded. The labels carry it by
  # construction (pad positions are replaced by -100 during preprocessing).
  # The predictions get the same guard because gathered predictions may be
  # padded with -100 as well -- TODO confirm for the installed transformers
  # version. np.where builds new arrays, so the inputs are left untouched.
  token_ids = np.asarray(token_ids)
  labels_ids = np.asarray(labels_ids)
  clean_token_ids = np.where(token_ids != -100, token_ids, pad_id)
  clean_label_ids = np.where(labels_ids != -100, labels_ids, pad_id)

  decoded_texts = tokenizer.batch_decode(
                clean_token_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False)
  label_texts = tokenizer.batch_decode(
                clean_label_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False)

  # Two plain lists instead of zip(*...): unpacking an empty zip raised
  # ValueError when the evaluation set had no rows.
  pairs = list(zip(decoded_texts, label_texts))
  f1 = [compute_f1(decoded_text, label_text) for decoded_text, label_text in pairs]
  em = [compute_exact_match(decoded_text, label_text) for decoded_text, label_text in pairs]
  # bleu = sacrebleu.corpus_bleu(decoded_texts, label_texts)

  return {"EM": np.mean(em), "F1": np.mean(f1)}

In [13]:
# The same batch size is used for training and for evaluation.
batch_size = 16
# logging_steps = train.num_rows // batch_size
# The output directory name is derived from the base checkpoint name.
model_name = f"{MODEL_NAME}-finetuned-qa"
training_args = Seq2SeqTrainingArguments(output_dir=model_name,
                                  num_train_epochs=10,
                                  learning_rate=2e-5, # There are best practices for this number in the specific case of T5
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=10,
                                  log_level="error",
                                  # With this flag compute_metrics receives generated token ids
                                  # (see the note right below this statement).
                                  predict_with_generate=True)



"""
Obs: predict_with_generate é importante para não nos preocuparmos com o decode do output do T5

Quando usamos esse parâmetro setado para true o argumento eval_preds na funcao compute_metrics 
recebe os tokens já... 
"""

'\nObs: predict_with_generate é importante para não nos preocuparmos com o decode do output do T5\n\nQuando usamos esse parâmetro setado para true o argumento eval_preds na funcao compute_metrics \nrecebe os tokens já... \n'

In [14]:
# Bind the model, the training arguments, the metric function and the tokenizer;
# the train partition is used for training and the valid partition for evaluation.
trainer = Seq2SeqTrainer(model=model, 
                         args=training_args,
                         compute_metrics=compute_metrics,
                         train_dataset=train,
                         eval_dataset=valid,
                         tokenizer=tokenizer)

# # Aqui podemos provavelmente fazer algo similar para calcular nossa loss de forma custom
# # Ref: https://huggingface.co/docs/transformers/main_classes/trainer

# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.get("labels")
#         # forward pass
#         outputs = model(**inputs)
#         logits = outputs.get("logits")
#         # compute custom loss (suppose one has 3 labels with different weights)
#         loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0]))
#         loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
#         return (loss, outputs) if return_outputs else loss
# """

In [15]:
def evaluate(trainer: Seq2SeqTrainer, dataset: Dataset, device):
  """
  Generate an answer for every sample of `dataset` and score it.

  Each sample is run through `trainer.model.generate` on its own (batch of
  one), decoded with the module-level `tokenizer`, and compared with the
  sample's "first_answer" text.

  The model is switched to eval mode for the duration of the call and put
  back in its previous mode afterwards, so the scores do not depend on the
  mode the model happened to be in. When a sample carries an
  "attention_mask" it is passed to `generate` together with the input ids,
  since the inputs are padded to a fixed length.

  Returns (ems, f1s): two lists with one exact-match value and one F1 value
  per sample, in dataset order.
  """
  model = trainer.model
  was_training = model.training
  model.eval()

  ems = []
  f1s = []

  try:
    for sample in dataset:
      generate_kwargs = {
          "input_ids": torch.as_tensor(sample["input_ids"]).reshape(1,-1).to(device)
      }
      mask = sample["attention_mask"] if "attention_mask" in sample else None
      if mask is not None:
        generate_kwargs["attention_mask"] = torch.as_tensor(mask).reshape(1,-1).to(device)

      token_ids = model.generate(**generate_kwargs)

      predict_text = tokenizer.batch_decode(token_ids,
                                            skip_special_tokens=True,
                                            clean_up_tokenization_spaces=False)
      f1s.append(compute_f1(predict_text[0], sample["first_answer"]))
      ems.append(compute_exact_match(predict_text[0], sample["first_answer"]))
  finally:
    # Restore whatever mode the model was in before this call.
    model.train(was_training)

  return ems, f1s

In [16]:
# Baseline: score the model on the test partition before trainer.train() is called.
ems, f1s = evaluate(trainer, test, device)
print("ANTES DO FINETUNNING")
print("Exact Match:")
print(np.mean(ems))
print("F1:")
print(np.mean(f1s))

ANTES DO FINETUNNING
Exact Match:
0.25
F1:
0.4947103653625393


In [17]:
# Run the fine-tuning; the trailing semicolon keeps the returned value out of the cell output.
trainer.train();



Epoch,Training Loss,Validation Loss,Em,F1
1,0.8838,0.811771,0.157895,0.465657
2,0.7302,0.703284,0.157895,0.516406
3,0.5203,0.630817,0.210526,0.53395
4,0.5183,0.581759,0.157895,0.557286
5,0.4891,0.540105,0.210526,0.57483
6,0.4046,0.511095,0.210526,0.578801
7,0.5024,0.48996,0.210526,0.578801
8,0.4107,0.475436,0.210526,0.578801
9,0.3201,0.468546,0.210526,0.578801
10,0.3912,0.465708,0.210526,0.578801


In [18]:
# Same measurement as the baseline, repeated on the test partition after fine-tuning.
ems, f1s = evaluate(trainer, test, device)
print("DEPOIS DO FINETUNNING")
print("Exact Match:")
print(np.mean(ems))
print("F1:")
print(np.mean(f1s))

DEPOIS DO FINETUNNING
Exact Match:
0.4
F1:
0.7021489455255028
