In [None]:
!pip install transformers datasets evaluate transformers[torch]

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_6

## Full Fine-Tuning For Summarization

### Load Model & Tokenizer

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

"""
BART HAS 400M PARAMS: https://github.com/facebookresearch/fairseq/tree/main/examples/bart
"""

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

### Load Dataset

In [None]:
!pip install py7zr #need to install for samsum dataset



In [None]:
from datasets import load_dataset

dataset = load_dataset("samsum")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [None]:
sample = dataset['test'][0]['dialogue']
label = dataset['test'][0]['summary']

def generate_summary(input, llm):
  input_prompt = f"""
                  Summarize the following conversation.

                  {input}

                  Summary:
                  """

  input_ids = tokenizer(sample, return_tensors='pt')
  tokenized_output = llm.generate(input_ids['input_ids'], min_length=30, max_length=200)
  output = tokenizer.decode(tokenized_output[0], skip_special_tokens=True)

  return output

output = generate_summary(sample, llm=model)
print("Sample")
print(sample)
print("-------------------")
print("Model Generated Summary:")
print(output)
print("Correct Summary:")
print(label)

Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
-------------------
Model Generated Summary:
Hannah: Hey, do you have Betty's number? Amanda: Lemme check. Hannah: Ask Larry. Amanda: He called her last time we were at the park together.
Correct Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


### Prepare Our Dataset

In [None]:
def tokenize_inputs(example):
  start_prompt = "Summarize the following conversation.\n\n"
  end_prompt = "\n\nSummary: "
  prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
  example['input_ids'] = tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt').input_ids
  example['labels'] = tokenizer(example['summary'], padding='max_length', truncation=True, return_tensors='pt').input_ids

  return example

tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = dataset.map(tokenize_inputs, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'dialogue', 'summary'])
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14732 [00:00<?, ? examples/s]

Filter:   0%|          | 0/819 [00:00<?, ? examples/s]

Filter:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
print(tokenized_datasets['train'].shape)
print(tokenized_datasets['validation'].shape)
print(tokenized_datasets['test'].shape)

(148, 2)
(9, 2)
(9, 2)


In [None]:
tokenized_datasets['train'][0].keys()

dict_keys(['input_ids', 'labels'])

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./bart-cnn-samsum-finetuned",  # local directory
    hub_model_id="ingeniumacademy/bart-cnn-samsum-finetuned",  # identifier on the Hub
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    auto_find_batch_size=True,
    evaluation_strategy='epoch',
    logging_steps=10
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.113,0.249339


TrainOutput(global_step=148, training_loss=0.10309620966782441, metrics={'train_runtime': 178.9197, 'train_samples_per_second': 0.827, 'train_steps_per_second': 0.827, 'total_flos': 320731481112576.0, 'train_loss': 0.10309620966782441, 'epoch': 1.0})

### Push our model to the hub

In [None]:
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

'https://huggingface.co/ingeniumacademy/bart-cnn-samsum-finetuned/tree/main/'

### Re-Test Our Model

In [None]:
loaded_model = AutoModelForSeq2SeqLM.from_pretrained("ingeniumacademy/bart-cnn-samsum-finetuned")

output = generate_summary(sample, llm=loaded_model)

print("Sample")
print(sample)
print("-------------------")
print("Summary:")
print(output)
print("Ground Truth Summary:")
print(label)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/358 [00:00<?, ?B/s]

Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
-------------------
Summary:
Hannah asks Amanda if she has Betty's number. Amanda can't find it, so she asks Larry to call her. Amanda is going to text Larry to ask him to call Betty.
Ground Truth Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
