In [1]:
!pip install transformers evaluate transformers[torch]
!pip install py7zr #need to install for samsum dataset
!pip install -U datasets
!pip install peft



### Load In Model & Dataset

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("rebirthmonkey/bart-cnn-samsum-finetuned")
model = AutoModelForSeq2SeqLM.from_pretrained("rebirthmonkey/bart-cnn-samsum-finetuned")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/278 [00:00<?, ?B/s]



In [3]:
from datasets import load_dataset

dataset = load_dataset("ingeniumacademy/samsum")

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'sentences', 'sentence_id', 'dialog_id'],
        num_rows: 14732
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'sentences', 'sentence_id', 'dialog_id'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'sentences', 'sentence_id', 'dialog_id'],
        num_rows: 819
    })
})

### Prepare Dataset

In [5]:
def tokenize_inputs(example):
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary: "
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    tokenized_prompt = tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt', max_length=512)
    tokenized_summary = tokenizer(example['summary'], padding='max_length', truncation=True, return_tensors='pt', max_length=512)

    example['input_ids'] = tokenized_prompt['input_ids']
    example['attention_mask'] = tokenized_prompt['attention_mask']
    example['labels'] = tokenized_summary['input_ids']

    return example

tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = dataset.map(tokenize_inputs, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'dialogue', 'summary'])
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

print(tokenized_datasets['train'].shape)
print(tokenized_datasets['validation'].shape)
print(tokenized_datasets['test'].shape)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14732 [00:00<?, ? examples/s]

Filter:   0%|          | 0/818 [00:00<?, ? examples/s]

Filter:   0%|          | 0/819 [00:00<?, ? examples/s]

(148, 6)
(9, 6)
(9, 6)


### Create PEFT Model using LoRA

In [6]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # 8, 16, 32
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    task_type=TaskType.SEQ_2_SEQ_LM
)

peft_model = get_peft_model(model, peft_config=lora_config)

In [7]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Train PEFT Model

In [14]:
from transformers import TrainingArguments, Trainer

peft_training_args = TrainingArguments(
    output_dir="./bart-cnn-samsum-peft",  # local directory
    hub_model_id="rebirthmonkey/bart-cnn-samsum-peft",  # identifier on the Hub
    learning_rate=1e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    auto_find_batch_size=True,
    eval_strategy='epoch',
    logging_steps=10,
    report_to="none"
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

peft_model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 411,009,024 || trainable%: 1.1481


In [15]:
peft_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1164,0.135857


Epoch,Training Loss,Validation Loss
1,0.0899,0.135808
2,0.1063,0.135637
3,0.1088,0.135395
4,0.0838,0.135256
5,0.0971,0.135248




TrainOutput(global_step=185, training_loss=0.09916154081757005, metrics={'train_runtime': 98.6022, 'train_samples_per_second': 7.505, 'train_steps_per_second': 1.876, 'total_flos': 818045622681600.0, 'train_loss': 0.09916154081757005, 'epoch': 5.0})

### Save PEFT Adapter

In [16]:
peft_trainer.push_to_hub()



Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...msum-peft/adapter_model.safetensors:   3%|2         |  561kB / 18.9MB            

  ...t-cnn-samsum-peft/training_args.bin:   3%|2         |   159B / 5.37kB            

CommitInfo(commit_url='https://huggingface.co/rebirthmonkey/bart-cnn-samsum-peft/commit/45cf092aa2182682ef460d4e440462e8d61cd9ab', commit_message='End of training', commit_description='', oid='45cf092aa2182682ef460d4e440462e8d61cd9ab', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rebirthmonkey/bart-cnn-samsum-peft', endpoint='https://huggingface.co', repo_type='model', repo_id='rebirthmonkey/bart-cnn-samsum-peft'), pr_revision=None, pr_num=None)

In [17]:
def generate_summary(input, llm):
  input_prompt = f"""
                  Summarize the following conversation.

                  {sample}

                  Summary:
                  """

  input_ids = tokenizer(sample, return_tensors='pt')
  tokenized_output = llm.generate(input_ids=input_ids['input_ids'], min_length=30, max_length=200)
  output = tokenizer.decode(tokenized_output[0], skip_special_tokens=True)

  return output

### Reload & Test

In [18]:
from peft import PeftModel, PeftConfig
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("rebirthmonkey/bart-cnn-samsum-finetuned")
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("rebirthmonkey/bart-cnn-samsum-finetuned")

loaded_peft_model = PeftModel.from_pretrained(
    peft_model_base,
    "rebirthmonkey/bart-cnn-samsum-peft",
    is_trainable=False
)

adapter_config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/18.9M [00:00<?, ?B/s]

In [19]:
sample = dataset['test'][0]['dialogue']
label = dataset['test'][0]['summary']

output = generate_summary(sample, llm=loaded_peft_model)

print("Sample")
print(sample)
print("-------------------")
print("Generated Summary:")
print(output)
print("-------------------")
print("Label:")
print(label)



Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
-------------------
Generated Summary:
. A is looking for Betty's number. A is going to text Larry. A is going to text Larry. A is going to text Larry.
-------------------
Label:
A needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
