In [2]:
%pip install --upgrade pip
%pip install torch torchdata

%pip install transformers datasets --quiet

Note: you may need to restart the kernel to use updated packages.
Collecting torchdata
  Downloading torchdata-0.8.0-cp311-cp311-manylinux1_x86_64.whl.metadata (5.4 kB)
Downloading torchdata-0.8.0-cp311-cp311-manylinux1_x86_64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchdata
Successfully installed torchdata-0.8.0
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install -U datasets

Collecting datasets
  Using cached datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Using cached datasets-3.0.0-py3-none-any.whl (474 kB)
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 2.11.0
    Uninstalling datasets-2.11.0:
      Successfully uninstalled datasets-2.11.0
Successfully installed datasets-3.0.0
Note: you may need to restart the kernel to use updated packages.


In [27]:
%pip install evaluate rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting rouge_score
  Using cached rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting absl-py (from rouge_score)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting nltk (from rouge_score)
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk->rouge_score)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk->rouge_score)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m12.4 MB/s[

In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!nvidia-smi

Mon Sep 23 13:50:04 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.27                 Driver Version: 560.70         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  |   00000000:01:00.0  On |                  Off |
|  0%   38C    P8             34W /  450W |     733MiB /  24564MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Load Model & Data

In [3]:
huggingface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(huggingface_dataset_name)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [4]:
model_name='google/flan-t5-base'
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
original_model.to('cuda')
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

  return torch.load(checkpoint_file, map_location="cpu")


In [7]:
def trainable_model_parameters_summary(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return {"trainable_paramater_count": trainable_model_params, "parameter_count": all_model_params, "percent_trainable": trainable_model_params/all_model_params*100}

print(trainable_model_parameters_summary(original_model))

{'trainable_paramater_count': 247577856, 'parameter_count': 247577856, 'percent_trainable': 100.0}


## Zero Shot

In [46]:
example_indices = [0, 1]
dash_line = '-'.join('' for x in range(100))

def run_inference(model, example):
    dialogue = example['dialogue']
    summary = example['summary']

    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
    """
     # Input constructed prompt instead of the dialogue.
    inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"],
            generation_config=GenerationConfig(max_new_tokens=200, num_beams=1)
        )[0],
        skip_special_tokens=True
    )
    return output
    

for i, index in enumerate(example_indices):
    dialogue = dataset['test'][index]['dialogue']
    summary = dataset['test'][index]['summary']

    output = run_inference(original_model, dataset['test'][index])

    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print(f'INPUT PROMPT:\n{prompt}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}')
    print(dash_line)
    print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Pers

## Full fine tuning

In [10]:
def build_prompt(example):
  start_prompt = 'Summarize the following conversation.\n\n'
  end_prompt = '\n\nSummary: '
  prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
  return prompt

def tokenize_function(example):
    prompt = build_prompt(example)
    mapped = {}
    mapped['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    mapped['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    return mapped

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])

Map: 100%|█████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 2914.09 examples/s]


In [11]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

Shapes of the datasets:
Training: (12460, 2)
Validation: (500, 2)
Test: (1500, 2)


In [30]:
output_dir = f'./logs/dialogue-summary-training'

training_args = TrainingArguments(
    output_dir= output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=200
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test']
)

trainer.train()

Step,Training Loss
1,38.2971
2,38.844
3,36.8322
4,36.7103
5,35.763
6,35.3485
7,34.6134
8,33.6804
9,31.8146
10,32.2093


TrainOutput(global_step=200, training_loss=9.482090774774552, metrics={'train_runtime': 517.4064, 'train_samples_per_second': 3.092, 'train_steps_per_second': 0.387, 'total_flos': 1095611763916800.0, 'train_loss': 9.482090774774552, 'epoch': 0.13})

In [31]:
instruct_model_name='truocpham/flan-dialogue-summary-checkpoint'
instruct_model = AutoModelForSeq2SeqLM.from_pretrained( instruct_model_name, torch_dtype=torch.bfloat16).to('cuda')

## Evaluate

### Examples

In [19]:
index = 200
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
#Person2#: I'm not sure what I'm doing, but I'm not sure what exactly I would need. #Person2#: I'm thinking about upgrading my system. #Person2#: I'm not sure what exactly I would need. #Person2#: I'd also like to add a computer program. #Person1#: I'm not sure what exactly I would need. I'd probably need a computer with a computer program.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
#Person1# suggests #Person2# upgrading #Person2#'s system, hardware, and CD-ROM drive. #Person2# thinks it's great.


### Metrics

In [21]:
import evaluate
rouge = evaluate.load('rouge')

In [28]:
human_baseline_summaries = dataset['test']['summary']

instruct_model_summaries = []
for example in dataset['test']:
    instruct_model_summaries.append(run_inference(instruct_model, example))                                    

Token indices sequence length is longer than the specified maximum sequence length for this model (1028 > 512). Running this sequence through the model will result in indexing errors


In [32]:
original_model_summaries = []

for example in dataset['test']:
    original_model_summaries.append(run_inference(original_model, example))  

In [33]:
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)

ORIGINAL MODEL:
{'rouge1': np.float64(0.18307571781465698), 'rouge2': np.float64(0.052240285729029785), 'rougeL': np.float64(0.1573074359870929), 'rougeLsum': np.float64(0.15727591039961691)}
INSTRUCT MODEL:
{'rouge1': np.float64(0.4244340283204921), 'rouge2': np.float64(0.18100921534077902), 'rougeL': np.float64(0.34008803784015185), 'rougeLsum': np.float64(0.33988949139041635)}


In [35]:
full_model_path="./full-dialogue-summary-checkpoint-local"

original_model.save_pretrained(full_model_path)
tokenizer.save_pretrained(full_model_path)


('./full-dialogue-summary-checkpoint-local/tokenizer_config.json',
 './full-dialogue-summary-checkpoint-local/special_tokens_map.json',
 './full-dialogue-summary-checkpoint-local/tokenizer.json')

# PEFT / Lora

In [37]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [38]:
peft_model = get_peft_model(original_model, 
                            lora_config)
print(trainable_model_parameters_summary(peft_model))

{'trainable_paramater_count': 3538944, 'parameter_count': 251116800, 'percent_trainable': 1.4092820552029972}


In [40]:
output_dir = f'./logs/peft-dialogue-summary-training'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=1,
    max_steps=100    
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

In [41]:
peft_trainer.train()



Step,Training Loss
1,3.3973
2,3.384
3,3.3324
4,5.9199
5,3.1067
6,5.5331
7,5.3021
8,4.8309
9,2.673
10,2.8062


TrainOutput(global_step=100, training_loss=1.0176673962175846, metrics={'train_runtime': 32.1206, 'train_samples_per_second': 24.906, 'train_steps_per_second': 3.113, 'total_flos': 556503190732800.0, 'train_loss': 1.0176673962175846, 'epoch': 0.06})

### Evaluate

In [51]:
#peft_model = peft_model.merge_and_unload() 

peft_model_name='truocpham/peft-dialogue-summary-checkpoint'
peft_model = AutoModelForSeq2SeqLM.from_pretrained( instruct_model_name, torch_dtype=torch.bfloat16).to('cuda')

  return torch.load(checkpoint_file, map_location="cpu")


In [52]:
index = 200
baseline_human_summary = dataset['test'][index]['summary']

original_model_text_output = run_inference(original_model, dataset['test'][index])
instruct_model_text_output = run_inference(instruct_model, dataset['test'][index])
peft_model_text_output = run_inference(peft_model, dataset['test'][index])

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
#Person2#
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
#Person1# suggests #Person2# upgrading #Person2#'s system, hardware, and CD-ROM drive. #Person2# thinks it's great.
---------------------------------------------------------------------------------------------------
PEFT MODEL: #Person1# suggests #Person2# upgrading #Person2#'s system, hardware, and CD-ROM drive. #Person2# thinks it's great.


### Metrics

In [53]:
peft_model_summaries = []

for example in dataset['test']:
    peft_model_summaries.append(run_inference(peft_model, example))  

In [54]:
peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': np.float64(0.18307571781465698), 'rouge2': np.float64(0.052240285729029785), 'rougeL': np.float64(0.1573074359870929), 'rougeLsum': np.float64(0.15727591039961691)}
INSTRUCT MODEL:
{'rouge1': np.float64(0.4244340283204921), 'rouge2': np.float64(0.18100921534077902), 'rougeL': np.float64(0.34008803784015185), 'rougeLsum': np.float64(0.33988949139041635)}
PEFT MODEL:
{'rouge1': np.float64(0.4244340283204921), 'rouge2': np.float64(0.18100921534077902), 'rougeL': np.float64(0.34008803784015185), 'rougeLsum': np.float64(0.33988949139041635)}
