In [None]:
!pip install datasets evaluate transformers rouge-score nltk

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; needed because training is
# configured to push the model to the Hub.
notebook_login()

In [None]:
! apt install git-lfs

In [None]:
import transformers
# Record the installed transformers version for reproducibility.
print(transformers.__version__)

In [None]:
from transformers.utils import send_example_telemetry
# Report usage of this example notebook (name and framework) via the
# transformers telemetry helper.
send_example_telemetry('summarization_notebook', framework = 'pytorch')

In [None]:
# Pretrained checkpoint used for the tokenizer, the model, and the name of
# the fine-tuned output directory.
model_checkpoint = "t5-small"

## Loading the dataset

In [None]:
from datasets import load_dataset
from evaluate import load

# Load the "xsum" dataset and the "rouge" metric used later for evaluation.
raw_datasets = load_dataset("xsum")
metric = load("rouge")

# Print the loaded dataset object for a quick overview.
print(raw_datasets)

In [None]:
# Access the first example of the training set
raw_datasets["train"][0]

In [None]:
# Look at another training example (index 10).
raw_datasets["train"][10]

In [None]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples = 5):
  """Render `num_examples` distinct, randomly chosen rows of `dataset` as one HTML table.

  Args:
    dataset: Object supporting `len()`, indexing with a list of integer
      positions (returning a column -> values mapping), and a `features`
      mapping of column name -> feature type.
    num_examples: How many distinct rows to show.

  Raises:
    AssertionError: If `num_examples` exceeds the number of rows in `dataset`.
  """
  assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset"

  # random.sample draws without replacement, so every index is unique and no
  # retry loop is needed.
  picks = random.sample(range(len(dataset)), num_examples)

  # Build and display the table once, after all indices have been chosen.
  df = pd.DataFrame(dataset[picks])
  for column, typ in dataset.features.items():
    # ClassLabel columns store integer ids; show the readable class names instead.
    if isinstance(typ, datasets.ClassLabel):
      df[column] = df[column].transform(lambda i: typ.names[i])
  display(HTML(df.to_html()))


In [None]:
# Preview a few random training examples.
show_random_elements(raw_datasets["train"])

In [None]:
# Display the loaded metric object.
metric

In [None]:
# Smoke-test the ROUGE metric: predictions and references are identical here,
# so the scores should presumably be maximal.
fake_preds = ["hello there", "general kenobi"]
fake_labels = ["hello there", "general kenobi"]
metric.compute(predictions=fake_preds, references=fake_labels)

## Preprocessing the data

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Sanity check: tokenize a single sentence.
tokenizer("Hello this is one sentence")

In [None]:
# The tokenizer also accepts a list of sentences.
tokenizer(["Hello, this one sentence!", "This is another sentence"])

In [None]:
# Same sentences passed via text_target, the argument used for the summaries
# (labels) in preprocess_function.
print(tokenizer(text_target= ["Hello, this one sentence!", "This is another sentence"]))

In [None]:
# The listed T5 checkpoints get a "summarize: " prefix prepended to every
# input document; any other checkpoint gets no prefix.
prefix = (
    "summarize: "
    if model_checkpoint in {"t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"}
    else ""
)

In [None]:
# Truncation limits (in tokens) for source documents and target summaries.
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
  """Tokenize a batch of examples for seq2seq training.

  Each entry of `examples["document"]` is prefixed with the global `prefix`
  and tokenized (truncated to `max_input_length`). Each entry of
  `examples["summary"]` is tokenized as a target (truncated to
  `max_target_length`), and its input ids are stored under "labels".

  Returns:
    The tokenizer output for the documents, extended with a "labels" entry.
  """
  prefixed_documents = [prefix + document for document in examples["document"]]
  model_inputs = tokenizer(
      prefixed_documents,
      max_length=max_input_length,
      truncation=True,
  )

  target_encoding = tokenizer(
      text_target=examples["summary"],
      max_length=max_target_length,
      truncation=True,
  )
  model_inputs["labels"] = target_encoding["input_ids"]

  return model_inputs


In [None]:
# Try the preprocessing on the first two training examples.
preprocess_function(raw_datasets['train'][:2])

In [None]:
# Apply the preprocessing to the whole dataset; batched=True hands
# preprocess_function batches of examples instead of single rows.
tokenized_datasets = raw_datasets.map(preprocess_function, batched= True)

## Fine-tuning the model

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence model for the chosen checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Per-device batch size used for both training and evaluation.
batch_size = 16
# Drop any "organisation/" part so the output directory is named after the model.
model_name = model_checkpoint.split('/')[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-xsum",  # output directory
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay = 0.01,
    save_total_limit = 3,
    num_train_epochs=1,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    # The string "none" disables every logging integration. Passing None only
    # means "use the library default", which does not turn reporting off.
    report_to="none",
)

In [None]:
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
  """Compute ROUGE scores and the mean generated length for an evaluation run.

  Args:
    eval_pred: A `(predictions, labels)` pair of token-id arrays, where
      `-100` marks positions that must be ignored.

  Returns:
    dict mapping every key returned by the ROUGE metric to its score scaled
    to 0-100, plus "gen_lens" (mean number of non-pad tokens per
    prediction); all values are rounded to 4 decimal places.
  """
  # eval_pred is a tuple: eval_pred = (predictions, labels)
  predictions, labels = eval_pred

  # -100 is not a valid token id, so map it to the pad token before decoding.
  # Labels use it for ignored positions; predictions are cleaned the same way
  # defensively in case they were padded with -100 as well.
  predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

  # skip_special_tokens drops pad/eos markers from the decoded text.
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Put each sentence on its own line before scoring.
  # NOTE(review): sent_tokenize relies on NLTK tokenizer data being available
  # in the runtime - confirm it has been downloaded.
  decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
  decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

  result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
  # Express the scores as percentages.
  result = {key: value * 100 for key, value in result.items()}

  # Length of each generated prediction, ignoring padding tokens.
  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
  result['gen_lens'] = np.mean(prediction_lens)

  # Round every value to 4 decimal places.
  return {k: round(v, 4) for k, v in result.items()}


The dictionary returned by `compute_metrics` will look something like this (the values are illustrative):

{
  "rouge1": 45.6789,
  "rouge2": 30.1234,
  "rougeL": 50.5678,
  "gen_lens": 25.4
}

In [None]:
# Bundle the model, training arguments, tokenized splits, collator, tokenizer
# and metric function into a single trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
import os
# Ask for Weights & Biases logging to be turned off via an environment variable.
# NOTE(review): this cell runs after the training arguments and the trainer
# have already been created - confirm the variable is still honoured at this
# point, or set it before they are built.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
# The first positional argument of Trainer.push_to_hub is the commit message,
# not a repository id, so passing a repo name there only changes the commit
# text. The destination repository comes from the training arguments: by
# default the output directory name, or `hub_model_id` when it is set
# (e.g. hub_model_id="diosilva/my-summarization-model").
trainer.push_to_hub()