Modelli : BART, T5, Marian, mBART 

In [None]:
!nvidia-smi
!export HF_TOKEN='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'
!git clone "https://github.com/IBM/KPA_2021_shared_task"

!pip install datasets -q
!pip install transformers -q
!pip install sentencepiece -q
!pip install rouge_score -q
!pip install transformers[torch]

In [None]:
!unzip libs.zip

In [None]:
import logging

from libs.generate.kpa_functions import load_kpm_data

logging.basicConfig(level=logging.DEBUG, format="%(levelname)s:%(message)s")

# Data locations inside the cloned KPA_2021_shared_task repository.
dataset_directory = "KPA_2021_shared_task/kpm_data"  # source of the "train" and "dev" subsets
testset_directory = "KPA_2021_shared_task/test_data"  # source of the "test" subset

# load_kpm_data returns four values; only the first one is kept for each subset.
tr_data, _, _, _ = load_kpm_data(subset="train", gold_data_dir=dataset_directory)
vl_data, _, _, _ = load_kpm_data(subset="dev", gold_data_dir=dataset_directory)
ts_data, _, _, _ = load_kpm_data(subset="test", gold_data_dir=testset_directory)

# Log the shape of every split as a quick sanity check.
logging.debug(
    {
        "tr_shape": tr_data.shape,
        "vl_shape": vl_data.shape,
        "ts_shape": ts_data.shape,
    }
)


In [None]:
from datasets import Dataset

# Wrap each split in a Hugging Face `Dataset`.
# NOTE(review): assumes every `*_data` object exposes `to_arrow()` and that the
# result is accepted by the `Dataset` constructor — confirm against load_kpm_data.
tr_dataset, vl_dataset, ts_dataset = (
    Dataset(split.to_arrow()) for split in (tr_data, vl_data, ts_data)
)

# Plain dict that groups the three splits under their conventional names.
data = dict(train=tr_dataset, validation=vl_dataset, test=ts_dataset)
logging.debug(data)


In [None]:
import nltk
from nltk.util import ngrams
from collections import Counter
import numpy as np

def compute_rouge_n(reference, hypothesis, n):
    """Return the ROUGE-N recall of `hypothesis` against `reference`.

    Both strings are lower-cased and tokenized with `nltk.word_tokenize`.
    Recall is the clipped number of shared n-grams divided by the number of
    n-grams in the reference.

    Args:
        reference: Reference text.
        hypothesis: Candidate text.
        n: N-gram order.

    Returns:
        The recall as a float; 0.0 when the reference yields no n-grams.
    """
    # One multiset of n-grams per text: reference first, hypothesis second.
    ref_counts, hyp_counts = (
        Counter(ngrams(nltk.word_tokenize(text.lower()), n))
        for text in (reference, hypothesis)
    )

    total_in_reference = sum(ref_counts.values())
    if total_in_reference == 0:
        return 0.0

    # Counter intersection keeps the minimum count per n-gram (clipped matches).
    matched = sum((ref_counts & hyp_counts).values())
    return matched / total_in_reference

def _rouge_tokens(text_or_tokens):
    """Return lower-cased tokens for a raw string or an already tokenized sequence."""
    if isinstance(text_or_tokens, str):
        return nltk.word_tokenize(text_or_tokens.lower())
    return [str(token).lower() for token in text_or_tokens]


def _lcs_length(first, second):
    """Return the longest-common-subsequence length of two token sequences."""
    # Classic dynamic programme, keeping only the previous row in memory.
    previous = [0] * (len(second) + 1)
    for left in first:
        current = [0]
        for column, right in enumerate(second, start=1):
            if left == right:
                current.append(previous[column - 1] + 1)
            else:
                current.append(max(previous[column], current[column - 1]))
        previous = current
    return previous[-1]


def compute_rouge_l(reference_tokens, hypothesis_tokens):
    """Return the ROUGE-L recall of the hypothesis against the reference.

    The previous body read the undefined names `reference` / `hypothesis`
    instead of its own parameters, and counted shared unique tokens rather
    than the longest common subsequence (LCS) that ROUGE-L is based on.

    Args:
        reference_tokens: Reference as a raw string (lower-cased and tokenized
            with `nltk.word_tokenize`) or as a sequence of tokens.
        hypothesis_tokens: Candidate, in either of the same two forms.

    Returns:
        LCS length divided by the number of reference tokens; 0.0 when the
        reference is empty.
    """
    reference_tokens = _rouge_tokens(reference_tokens)
    hypothesis_tokens = _rouge_tokens(hypothesis_tokens)

    if not reference_tokens:
        return 0.0

    return _lcs_length(reference_tokens, hypothesis_tokens) / len(reference_tokens)

def compute_rouge(reference, hypothesis, n=1):
    """Return the ROUGE recall of `hypothesis` against `reference`.

    Every integer order, including n=1, is delegated to `compute_rouge_n`.
    Previously n=1 was routed to `compute_rouge_l`, so the value reported as
    ROUGE-1 was not a unigram recall.

    Args:
        reference: Reference text.
        hypothesis: Candidate text.
        n: N-gram order, or the string "L" (case-insensitive) to select
            `compute_rouge_l` explicitly.

    Returns:
        The recall value produced by the selected scorer.

    Raises:
        ValueError: If `n` is a string other than "L".
    """
    if isinstance(n, str):
        if n.upper() == "L":
            return compute_rouge_l(reference, hypothesis)
        raise ValueError(f"Unsupported ROUGE variant: {n!r}")

    return compute_rouge_n(reference, hypothesis, n)

def compute_rouge_scores(reference, hypothesis, max_n=4):
    """Score one reference/hypothesis pair for every order from 1 to `max_n`.

    Args:
        reference: Reference text.
        hypothesis: Candidate text.
        max_n: Highest order to evaluate (inclusive).

    Returns:
        Dict mapping "ROUGE-<n>" to the value returned by `compute_rouge`,
        in increasing order of n; empty when `max_n` is below 1.
    """
    return {
        f'ROUGE-{order}': compute_rouge(reference, hypothesis, order)
        for order in range(1, max_n + 1)
    }

In [24]:
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM
import nltk
import numpy as np

# Pre-trained mT5 tokenizer; `use_fast=False` requests the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    "google/mt5-small",
    use_fast=False,
)

# Pre-trained mT5 sequence-to-sequence model from the same checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

# Tokenizer data needed by the nltk tokenization calls used in the metric code.
nltk.download("punkt")

def preprocess_function(data_set,max_input_length=300,max_target_length=60,padding="max_length"):
    """Tokenize a batch of arguments and key points for seq2seq training.

    Uses the module-level `tokenizer`. No task prefix is added to the inputs
    (the "summarize: " prefix is intentionally left disabled).

    Args:
        data_set: Batch with an 'argument' column (model inputs) and a
            'keypoint' column (targets), as passed by `Dataset.map(batched=True)`.
        max_input_length: Truncation / padding length for the inputs.
        max_target_length: Truncation / padding length for the targets.
        padding: Padding strategy forwarded to the tokenizer.

    Returns:
        The tokenized inputs, extended with a "labels" entry holding the
        target token ids where every padding id is replaced by -100.
    """
    inputs = data_set['argument']   # input column
    targets = data_set['keypoint']  # target column

    # Tokenize the inputs.
    model_inputs = tokenizer(inputs,
                             max_length = max_input_length,
                             padding = padding,
                             truncation = True)

    # Tokenize the targets. `text_target=` replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target = targets,
                       max_length = max_target_length,
                       padding = padding,
                       truncation = True)

    # -100 marks label positions that the loss must ignore (the padding).
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs


def postprocess_text(preds, labels):
    """Strip every text and put each of its sentences on a separate line.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A `(preds, labels)` tuple of lists in which each string has been
        stripped, split with `nltk.sent_tokenize` and re-joined with "\n".
    """
    def _one_sentence_per_line(texts):
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

def compute_metrics(eval_preds):
    """Compute averaged ROUGE scores and the mean generated length.

    Uses the module-level `tokenizer`, `postprocess_text` and
    `compute_rouge_scores`.

    Args:
        eval_preds: `(predictions, labels)` pair of token-id arrays; when the
            predictions are a tuple, only its first element is used.

    Returns:
        Dict with the mean of every ROUGE metric over all examples plus
        "gen_len" (mean number of non-padding tokens per prediction), each
        rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a placeholder that cannot be decoded. It was only removed from the
    # labels before, yet predictions can carry it too (padding added when
    # batches are gathered), which breaks decoding and inflates gen_len.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode generated and reference summaries into text.
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-processing: one sentence per line.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Score every (reference, prediction) pair, then collect values per metric.
    scores_by_metric = {}
    for pred, label in zip(decoded_preds, decoded_labels):
        for metric, score in compute_rouge_scores(label, pred).items():
            scores_by_metric.setdefault(metric, []).append(score)

    results = {metric: np.mean(scores) for metric, scores in scores_by_metric.items()}

    # Mean (not median) number of non-padding tokens per generated summary.
    results["gen_len"] = np.mean([np.count_nonzero(pred != pad_id) for pred in preds])

    # Round for readability; plain floats keep the result easy to serialize.
    return {name: round(float(value), 4) for name, value in results.items()}



All model checkpoint weights were used when initializing MT5ForConditionalGeneration.

All the weights of MT5ForConditionalGeneration were initialized from the model checkpoint at google/mt5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MT5ForConditionalGeneration for predictions without further training.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/riccardoamadio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
from transformers import default_data_collator,Seq2SeqTrainingArguments,Seq2SeqTrainer
import torch
# Tokenize the training, validation and test splits in batches.
train_dataset = tr_dataset.map(preprocess_function, batched=True)
eval_dataset = vl_dataset.map(preprocess_function, batched=True)
test_dataset = ts_dataset.map(preprocess_function, batched=True)

# preprocess_function pads every example to a fixed length by default,
# so the default collator is enough to build batches.
data_collator = default_data_collator
max_target_length = 60  # same value as the preprocess_function default

# Training configuration.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir    = 'output/',
    learning_rate = 1e-5,
    evaluation_strategy = "epoch",
    num_train_epochs    = 3,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size  = 16,
    warmup_steps = 500,
    weight_decay = 0.01,
    predict_with_generate = True   # metrics are computed on generated sequences
)

# Trainer that ties together the model, the data and the metric function.
trainer = Seq2SeqTrainer(
    model = model,
    args  = training_args,
    train_dataset = train_dataset,
    eval_dataset  = eval_dataset,
    tokenizer     = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

# No explicit `model.cuda()` here: the Trainer moves the model to the device it
# selected, and an unconditional `.cuda()` raises on machines without CUDA.
checkpoint = ''   # placeholder; training below always starts from scratch

# Fine-tune and save the fine-tuned model to `output_dir`.
train_result = trainer.train(resume_from_checkpoint = None)
trainer.save_model()

# Generate key points for the test set with beam search and report the metrics.
test_results = trainer.predict(
      test_dataset = test_dataset,
      metric_key_prefix = "test",
      max_length = max_target_length,
      num_beams = 6)
print(test_results)

  0%|          | 0/25 [00:00<?, ?ba/s]



  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: topic, arg_id, stance, arg_topic, argument, key_point_id, keypoint. If topic, arg_id, stance, arg_topic, argument, key_point_id, keypoint are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 24454
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4587
  Number of trainable parameters = 300176768


  0%|          | 0/4587 [00:00<?, ?it/s]

KeyboardInterrupt: 