Install dependecies

In [1]:
!pip install transformers[torch] datasets==2.10.0 evaluate rouge_score -q
!pip install git+https://github.com/google-research/bleurt.git -q
!pip install bert_score -q
!pip install demoji -q

  Preparing metadata (setup.py) ... [?25l[?25hdone


Imports

In [2]:
from datasets import load_dataset
import json
from transformers import DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np
import torch
import demoji
import nltk

Load and preprocess data. Remove emojis, file embeddings and \r, \n.

In [13]:
path = "drive/MyDrive/NLP/PR/"
dataset = load_dataset("json", data_files={"train":[path+"samsum/train.json"], "val":[path+"samsum/val.json"], "test":[path+"samsum/test.json"]})



  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
def preprocess(examples):
  processed = []
  out = {}
  # print(examples)
  for k, v in examples.items():
    if k!="id":
      for x in v:
        processed.append(demoji.replace(x.replace("\r","").replace("\n"," "), ''))
      out[k] = processed
      processed=[]
  return out

dataset = dataset.map(preprocess, batched=True)



Load model and tokenizer

In [5]:
# checkpoint = "sshleifer/distilbart-cnn-6-6"
checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

tokenize and create batches of data 

In [6]:
def batchify(examples):
  dialogues = [x for x in examples["dialogue"]]
  data = tokenizer(dialogues, max_length=256, truncation=True, padding="max_length")
  labels = tokenizer(text_target=examples["summary"], max_length=64, truncation=True, padding="max_length")
  data["labels"] = labels["input_ids"]
  data["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in labels["input_ids"]
    ]
  return data

In [7]:
dataset = dataset.map(batchify, batched=True, remove_columns=["id","dialogue","summary"])
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, pad_to_multiple_of=8)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Define evaluation metrics

In [8]:
rouge = evaluate.load("rouge")
bleu = evaluate.load('bleu')
bleurt = evaluate.load('bleurt', module_type='metric', checkpoint='bleurt-base-512')
bert = evaluate.load('bertscore', lang="en", idf=True)



Utility function to compute the desired metrics

In [9]:
nltk.download("punkt", quiet=True)

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(preds):
    predictions, labels = preds
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    print(decoded_preds, decoded_labels)

    rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    bleu_score = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    bleurt_score = bleurt.compute(predictions=decoded_preds, references=decoded_labels)
    bert_score = bert.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    result = {
        "gen_len": np.mean(prediction_lens),
        "rouge_score": rouge_score,
        "bleu_score": bleu_score,
        "bleurt_score":np.median(bleurt_score["scores"]),
        "bert_score":[np.mean(np.array(bert_score[k], dtype=np.float32)) for k in ["precision","recall","f1"]]
    }

    return result

Training loop

In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"
training_args =Seq2SeqTrainingArguments(
    output_dir="bart_sum_samsum",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    # eval_steps=1,
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_strategy="no",
    logging_strategy="epoch",
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    disable_tqdm=False,
    hub_strategy="end",
    push_to_hub_token="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"
)

model = model.to(device)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

trainer.train()

/content/bart_sum_samsum is already a clone of https://huggingface.co/sentientconch/bart_sum_samsum. Make sure you pull the latest changes with `repo.git_pull()`.
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Gen Len,Rouge Score,Bleu Score,Bleurt Score,Bert Score
1,1.9517,1.865338,59.837408,"{'rouge1': 0.38519198024299967, 'rouge2': 0.18637611248242514, 'rougeL': 0.29114807190727665, 'rougeLsum': 0.35950287045215523}","{'bleu': 0.10947202918144075, 'precisions': [0.2891732184886574, 0.1408997955010225, 0.07921257375593964, 0.04449898623412656], 'brevity_penalty': 1.0, 'length_ratio': 2.1406442622072146, 'translation_length': 39938, 'reference_length': 18657}",-0.557416,"[0.881794273853302, 0.914982795715332, 0.897921621799469]"
2,1.4162,2.167304,60.673594,"{'rouge1': 0.3824027985681461, 'rouge2': 0.17720440481192257, 'rougeL': 0.27951993033831063, 'rougeLsum': 0.3523751309023303}","{'bleu': 0.10292900287115767, 'precisions': [0.29144708090182264, 0.13358367689924108, 0.07251160668759896, 0.03975854026615448], 'brevity_penalty': 1.0, 'length_ratio': 2.084954708688428, 'translation_length': 38899, 'reference_length': 18657}",-0.756705,"[0.873441755771637, 0.9113098978996277, 0.8918185234069824]"
3,0.9763,1.885421,59.885086,"{'rouge1': 0.3925367542901428, 'rouge2': 0.19030742072418566, 'rougeL': 0.29557020575264703, 'rougeLsum': 0.36302164503856826}","{'bleu': 0.11050318220968344, 'precisions': [0.29364664926022627, 0.14059446150722135, 0.0786956634438425, 0.04589391170784672], 'brevity_penalty': 1.0, 'length_ratio': 2.1554912365332046, 'translation_length': 40215, 'reference_length': 18657}",-0.528027,"[0.880211353302002, 0.9188302755355835, 0.8989349007606506]"
4,0.5749,2.120929,59.831296,"{'rouge1': 0.39413787163188574, 'rouge2': 0.18797763014604468, 'rougeL': 0.29824353058090336, 'rougeLsum': 0.36387927887558746}","{'bleu': 0.10944201950995913, 'precisions': [0.2954957640803955, 0.1391474146019831, 0.07730156674867279, 0.045135857343175385], 'brevity_penalty': 1.0, 'length_ratio': 2.1574208072037306, 'translation_length': 40251, 'reference_length': 18657}",-0.507507,"[0.8815322518348694, 0.9193716049194336, 0.89988774061203]"
5,0.2765,2.531983,59.924205,"{'rouge1': 0.3935658688306535, 'rouge2': 0.18713851540657486, 'rougeL': 0.29574644161280017, 'rougeLsum': 0.3606436542704101}","{'bleu': 0.10800411600387674, 'precisions': [0.2944046763926386, 0.13710024017191252, 0.07618039600382064, 0.044252221841293286], 'brevity_penalty': 1.0, 'length_ratio': 2.163959907809401, 'translation_length': 40373, 'reference_length': 18657}",-0.499814,"[0.8805868625640869, 0.9189654588699341, 0.899208664894104]"


["A wants to get a puppy for her son.\nB is going to the animal shelter tomorrow.\nA dog is a fan of Motorhead's dead hamster.\nA took him to the shelter last Monday.\nHe wants to take it home right away.\nHe will name it Lemmy.", "Emma wants to buy a Christmas present for her kids.\nRob used to get one every year as a child.\nLauren wants to ask her sister if she can do something nice for someone else.\nShe is going to do something for the kids.\nEmma's sister will ask them questions about Christmas.", "Madison is pregnant, but she doesn't want to talk about it.\nJackie and Iggy are talking about the same person, Patricia Stevens.\nIggy thinks it's a coincidence that she's pregnant.\nThey're going to talk to her friend about it as well.", "Marla found a pair of her sister's underwear under her bed.\nThey are doing a total CSI investigation on one pair of Marla's old pair of boxers.\nTamara and Kiki think Marla left her underwear in her room as a dare.", "Robert is looking for a music 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Trainer is attempting to log a value of "{'rouge1': 0.38519198024299967, 'rouge2': 0.18637611248242514, 'rougeL': 0.29114807190727665, 'rougeLsum': 0.35950287045215523}" of type <class 'dict'> for key "eval/rouge_score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'bleu': 0.10947202918144075, 'precisions': [0.2891732184886574, 0.1408997955010225, 0.07921257375593964, 0.04449898623412656], 'brevity_penalty': 1.0, 'length_ratio': 2.1406442622072146, 'translation_length': 39938, 'reference_length': 18657}" of type <class 'dict'> for key "eval/bleu_score" as a scalar. This invocation of Tensorboard'

['A to get a puppy for her son.\nB will go to the animal shelter with her.\nA took him there last Monday.\nHe showed him one that he really liked.\nA will get one of the little dogs named Lemmy.\nB is a great fan of Motorhead.', "Em to buy an advent calendar for her kids.\nRob used to get one every year when he was a child.\nLauren's children get excited every time they get one.\nRob's sister adds notes for her children to do something nice for someone else.\nRob and Emma are going to spend Christmas together.", "Madison is pregnant, but she doesn't want to tell Iggy about it.\nIggy and Jackie think she's taking care of it seriously.\nggy's friend was pregnant once and she wasn't happy about it, so Iggy didn't expect her to do it again.", "Mar someone put Marla's sister's underwear under her bed when she was gone.\nTamara and Kiki are conducting a total CSI investigation on one pair of boxers that Marla just found.\nMarla will put them in the other one's room as a dare.", 'Fred about t

Trainer is attempting to log a value of "{'rouge1': 0.3824027985681461, 'rouge2': 0.17720440481192257, 'rougeL': 0.27951993033831063, 'rougeLsum': 0.3523751309023303}" of type <class 'dict'> for key "eval/rouge_score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'bleu': 0.10292900287115767, 'precisions': [0.29144708090182264, 0.13358367689924108, 0.07251160668759896, 0.03975854026615448], 'brevity_penalty': 1.0, 'length_ratio': 2.084954708688428, 'translation_length': 38899, 'reference_length': 18657}" of type <class 'dict'> for key "eval/bleu_score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.873441755771637, 0.9113098978996277, 0.8918185234069824]" of type <class 'list'> for key "eval/bert_score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrec

['A wants to get a puppy for her son.\nHe wants to name it after his dead hamster.\nB will go to the animal shelter with A.\nA will get him one of the dogs he likes.\nHe took him there last Monday and he liked one that he liked.', "Emma wants to get an advent calendar for her kids.\nRob used to get one every year as a child.\nLauren's sister adds notes asking her children to do something for someone else at Christmas.\nRob and Lauren agree that it prepares kids for Christmas.\nEmma agrees.", "Madison is pregnant.\nShe doesn't want to talk about it.\nJackie is worried about it, because people get excited about it and think it's a big deal.\nIggy's friend was pregnant too.\nJackie felt similar at Patricia Stevens's wedding.\nJackie and Iggy think the same thing happened.", "Marla found some underwear under her bed.\nShe suspects it's her sister's idea.\nShe doesn't have underwear that Marla can't find.\nThey're doing a total CSI investigation on one pair of boxers in her room as a dare.\

Trainer is attempting to log a value of "{'rouge1': 0.3925367542901428, 'rouge2': 0.19030742072418566, 'rougeL': 0.29557020575264703, 'rougeLsum': 0.36302164503856826}" of type <class 'dict'> for key "eval/rouge_score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'bleu': 0.11050318220968344, 'precisions': [0.29364664926022627, 0.14059446150722135, 0.0786956634438425, 0.04589391170784672], 'brevity_penalty': 1.0, 'length_ratio': 2.1554912365332046, 'translation_length': 40215, 'reference_length': 18657}" of type <class 'dict'> for key "eval/bleu_score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.880211353302002, 0.9188302755355835, 0.8989349007606506]" of type <class 'list'> for key "eval/bert_score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorre

["A wants to get a puppy for her son.\nB agrees to go to the animal shelter with her.\nA took him there last Monday and got a dog that he liked.\nHe wanted to take it home and name it Lemmy.\nHe's a great motorhead fan.", "Emma loves Rob's kids' advent calendar.\nRob used to get one every year as a child.\nLauren also likes Rob's sister who makes little notes for her children and asks them to do something nice for someone else.\nEmma wants to make sure her kids prepare themselves for Christmas.", "Madison is pregnant but she doesn't want to talk about it.\nJackie is preparing Iggy for the pregnancy.\nOnce Iggy's friend was pregnant and was not happy about it, but she thought it was all about a coincidence.\nJackie and Iggy are gossiping about the same person.", "Marla found a dirty pair of boxers under her bed.\nShe suspects they were probably from her sister.\nShe doesn't have underwear that isn't pink.\nShe's doing a total CSI investigation on one of her exes' boxers.\nTamara and Kik

Trainer is attempting to log a value of "{'rouge1': 0.39413787163188574, 'rouge2': 0.18797763014604468, 'rougeL': 0.29824353058090336, 'rougeLsum': 0.36387927887558746}" of type <class 'dict'> for key "eval/rouge_score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'bleu': 0.10944201950995913, 'precisions': [0.2954957640803955, 0.1391474146019831, 0.07730156674867279, 0.045135857343175385], 'brevity_penalty': 1.0, 'length_ratio': 2.1574208072037306, 'translation_length': 40251, 'reference_length': 18657}" of type <class 'dict'> for key "eval/bleu_score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.8815322518348694, 0.9193716049194336, 0.89988774061203]" of type <class 'list'> for key "eval/bert_score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorre

['A wants to get a puppy for her son.\nB wants to go to the animal shelter with her.\nA took him there last Monday.\nHe liked the dog and wanted to take it home.\nB suggests that the dog should be named after his dead hamster.', 'Emma loves the advent calendar she wants to buy for her kids.\nRob used to get one every year as a child.\nLauren also likes to make little notes for her children who are expected to do some creative things for the Christmas tree, and asks their parents to do something for them.', "Madison is pregnant but she doesn't want to talk about it.\nJackie is preparing Iggy for this.\nOnce Iggy's friend was pregnant and she was disappointed.\nJackie felt similar way at Patricia Stevens wedding.\nIggy and Jackie think the same person would be the next mother.", "Marla found dirty underwear under her bed.\nKiki suspects it's a prank.\nMarla's sister, 13, put the underwear in her room as a dare.\nThe sister of Marla doesn't have underwear that isn't pink.\nThe police are 

Trainer is attempting to log a value of "{'rouge1': 0.3935658688306535, 'rouge2': 0.18713851540657486, 'rougeL': 0.29574644161280017, 'rougeLsum': 0.3606436542704101}" of type <class 'dict'> for key "eval/rouge_score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'bleu': 0.10800411600387674, 'precisions': [0.2944046763926386, 0.13710024017191252, 0.07618039600382064, 0.044252221841293286], 'brevity_penalty': 1.0, 'length_ratio': 2.163959907809401, 'translation_length': 40373, 'reference_length': 18657}" of type <class 'dict'> for key "eval/bleu_score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.8805868625640869, 0.9189654588699341, 0.899208664894104]" of type <class 'list'> for key "eval/bert_score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrec

TrainOutput(global_step=4605, training_loss=1.0390993397864923, metrics={'train_runtime': 4225.2278, 'train_samples_per_second': 17.433, 'train_steps_per_second': 1.09, 'total_flos': 3.990723124789248e+16, 'train_loss': 1.0390993397864923, 'epoch': 5.0})

In [11]:
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/1.51G [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/4.06k [00:00<?, ?B/s]

Upload file runs/Jul30_13-43-33_47d8db8eba6a/events.out.tfevents.1690724620.47d8db8eba6a.6965.0:   0%|        …

Upload file runs/Jul30_13-24-42_47d8db8eba6a/events.out.tfevents.1690723492.47d8db8eba6a.1985.0:   0%|        …

Upload file runs/Jul30_13-52-37_47d8db8eba6a/events.out.tfevents.1690725163.47d8db8eba6a.9974.0:   0%|        …

To https://huggingface.co/sentientconch/bart_sum_samsum
   eda5929..007684c  main -> main

   eda5929..007684c  main -> main

To https://huggingface.co/sentientconch/bart_sum_samsum
   007684c..cf37456  main -> main

   007684c..cf37456  main -> main



'https://huggingface.co/sentientconch/bart_sum_samsum/commit/007684c95fde27b28285ec0e9b8d2f72526e68d2'

Example inference

In [14]:
from transformers import pipeline

summarizer = pipeline("summarization", model=model.to('cpu'), tokenizer=tokenizer)
summarizer(dataset['test'][1]['dialogue'])

[{'summary_text': "Eric and Rob are watching a stand-up by a Russian comedian on youtube. They are poking fun at it and agree it's hilarious.    his performance as a comedian should be considered to be the reason he's on youtube, but Rob and Eric are sceptical about it."}]

In [15]:
dataset['test'][1]

{'id': '13729565',
 'summary': 'Eric and Rob are going to watch a stand-up on youtube.',
 'dialogue': "Eric: MACHINE!\r\nRob: That's so gr8!\r\nEric: I know! And shows how Americans see Russian ;)\r\nRob: And it's really funny!\r\nEric: I know! I especially like the train part!\r\nRob: Hahaha! No one talks to the machine like that!\r\nEric: Is this his only stand-up?\r\nRob: Idk. I'll check.\r\nEric: Sure.\r\nRob: Turns out no! There are some of his stand-ups on youtube.\r\nEric: Gr8! I'll watch them now!\r\nRob: Me too!\r\nEric: MACHINE!\r\nRob: MACHINE!\r\nEric: TTYL?\r\nRob: Sure :)"}