In [1]:
!pip install --upgrade pyarrow>=21.0.0 "protobuf<6.0.0,>=3.20.3"
!pip install transformers evaluate transformers[torch]
!pip install -U datasets



## Full Fine-Tuning For Summarization

### Load Model & Tokenizer

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

### Load Dataset

In [4]:
!pip install py7zr #need to install for samsum dataset

from datasets import load_dataset

dataset = load_dataset("ingeniumacademy/samsum")

dataset



DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'sentences', 'sentence_id', 'dialog_id'],
        num_rows: 14732
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'sentences', 'sentence_id', 'dialog_id'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'sentences', 'sentence_id', 'dialog_id'],
        num_rows: 819
    })
})

In [6]:
sample = dataset['test'][0]['dialogue']
label = dataset['test'][0]['summary']

print(sample)
print(label)

Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
A needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [7]:
def generate_summary(input, llm):
  input_prompt = f"""
                  Summarize the following conversation.

                  {input}

                  Summary:
                  """

  input_ids = tokenizer(sample, return_tensors='pt')
  tokenized_output = llm.generate(input_ids['input_ids'], min_length=30, max_length=200)
  output = tokenizer.decode(tokenized_output[0], skip_special_tokens=True)

  return output

output = generate_summary(sample, llm=model)
print("Sample")
print(sample)
print("-------------------")
print("Model Generated Summary:")
print(output)
print("-------------------")
print("Label:")
print(label)

Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
-------------------
Model Generated Summary:
Hannah: Hey, do you have Betty's number? Amanda: Lemme check. Hannah: Ask Larry. Amanda: He called her last time we were at the park together.
-------------------
Label:
A needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


### Prepare Our Dataset

In [8]:
def tokenize_inputs(example):
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary: "
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    tokenized_prompt = tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt', max_length=512)
    tokenized_summary = tokenizer(example['summary'], padding='max_length', truncation=True, return_tensors='pt', max_length=512)

    example['input_ids'] = tokenized_prompt['input_ids']
    example['attention_mask'] = tokenized_prompt['attention_mask']
    example['labels'] = tokenized_summary['input_ids']

    return example

tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = dataset.map(tokenize_inputs, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'dialogue', 'summary'])
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

print(tokenized_datasets['train'].shape)
print(tokenized_datasets['validation'].shape)
print(tokenized_datasets['test'].shape)

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Filter:   0%|          | 0/818 [00:00<?, ? examples/s]

(148, 6)
(9, 6)
(9, 6)


In [9]:
len(tokenized_datasets['train']['labels'][0])

512

In [10]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./bart-cnn-samsum-finetuned",  # local directory
    hub_model_id="rebirthmonkey/bart-cnn-samsum-finetuned",  # identifier on the Hub
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    auto_find_batch_size=True,
    eval_strategy='epoch',
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

  trainer = Trainer(


In [12]:
import os
os.environ["WANDB_DISABLED"] = "true"

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.6826,0.138158




TrainOutput(global_step=19, training_loss=0.4167239540501645, metrics={'train_runtime': 41.7406, 'train_samples_per_second': 3.546, 'train_steps_per_second': 0.455, 'total_flos': 160365740556288.0, 'train_loss': 0.4167239540501645, 'epoch': 1.0})

### Push our model to the hub

In [13]:
trainer.push_to_hub()

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...-samsum-finetuned/model.safetensors:   0%|          |  174kB / 1.63GB            

  ...-samsum-finetuned/training_args.bin:  24%|##4       | 1.31kB / 5.37kB            

CommitInfo(commit_url='https://huggingface.co/rebirthmonkey/bart-cnn-samsum-finetuned/commit/d5f8527b2e8255e5e44a04140a5859c31c71a710', commit_message='End of training', commit_description='', oid='d5f8527b2e8255e5e44a04140a5859c31c71a710', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rebirthmonkey/bart-cnn-samsum-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='rebirthmonkey/bart-cnn-samsum-finetuned'), pr_revision=None, pr_num=None)

### Re-Test Our Model

In [15]:
loaded_model = AutoModelForSeq2SeqLM.from_pretrained("rebirthmonkey/bart-cnn-samsum-finetuned")

output = generate_summary(sample, llm=loaded_model)

print("Sample")
print(sample)
print("-------------------")
print("Generated Summary:")
print(output)
print("Label:")
print(label)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/358 [00:00<?, ?B/s]

Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
-------------------
Generated Summary:
. A is looking for Betty's number. She asked Larry to text her. He called her last time they were at the park together.
Label:
A needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
