In [None]:
!pip install transformers qwen

In [None]:
%pip install -U datasets==2.17.0

%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

In [3]:
from datasets import load_dataset
from transformers import Trainer, TrainingArguments

huggingface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(huggingface_dataset_name)

In [4]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
# import evaluate
import pandas as pd
import numpy as np

In [36]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# model_name = "microsoft/Phi-3-mini-4k-instruct"
# model_name = "Qwen/Qwen2.5-0.5B"
model_name = "Qwen/Qwen2.5-0.5B-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code = False)
original_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code = False, torch_dtype=torch.bfloat16)

print(original_model.config.torch_dtype)
print(original_model.dtype)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
original_model.to(device)

print("cuda available:", torch.cuda.is_available())
print("device:", device)

print(original_model.name_or_path)

torch.bfloat16
torch.bfloat16
cuda available: True
device: cuda
Qwen/Qwen2.5-0.5B-instruct


In [None]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
original_model.to(device)

In [None]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
original_model.to(device)

In [None]:
print(tokenizer("hello there", padding = "max_length", truncation = True, max_length = 1024))

In [None]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

In [44]:
index = 200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt').to(device)
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

In [None]:
print(dataset["train"])

In [37]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

    # If use flan-t5, works fine. But if use Qwen0.5B, out of memory when training
    # example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    # example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    # example.remove_columns(['id', 'topic', 'dialogue', 'summary',])
    # del example["id"]
    # del example["topic"]
    # del example["dialogue"]
    # del example["summary"]
    # print("example:", prompt)
    # print("---")
    
    # return example

    # TODO the following code is probably wrong!!! even flan-t5 instructed model behaves bad
    # TODO: for decoder-only model, labels should be inputs+labels
    is_decoder_model = True
    if is_decoder_model:
        # print("examplesummary", example["dialogue"])
        input_texts = []
        for i in range(len(prompt)):
            summary = example["summary"][i]
            input_text = prompt[i] + summary            
            input_texts.append(input_text)
        # prompt = prompt + example["summary"]
        
        inputs = tokenizer(input_texts, padding=True, truncation=True, max_length=1024)
        
        # input_ids = inputs["input_ids"].squeeze()
        # attention_mask = inputs["attention_mask"].squeeze()

        # Create labels: the labels are the same as input, but shifted
        # labels = input_ids.clone()
        # # Shift labels by one token (this is how language models are trained)
        # labels[:-1] = input_ids[1:]

        
        inputs['labels'] = [ids[1:] + [tokenizer.pad_token_id] for ids in inputs['input_ids']] # labels are shifted version of inputs

        inputs = {k: torch.tensor(v) for k, v in inputs.items()} # convert to pyTorch tensor 

        # inputs: {"input_ids": tensor([[xxx, xxx, ...], [xxx, xxx, ...], ...]), 
        #           "attention_mask": tensor([[1, 1, 1, ..., 0, 0], [1, 1, 1, ...0, 0], ...]),
        #           "labels": tensor([[xxx, xxx, ...], [xxx, xxx, ...], ...])}
        # print(inputs)

        return inputs
    else:
        labels = example["summary"]
    
        # try to use this method to train flan-t5(see if produce irrelevant results), and test Qwen0.5B(not out of memory, see if produce irrelevant results)
        inputs = tokenizer(prompt, padding=True, truncation=True, max_length=1024, return_tensors = "pt")
        # print(inputs)
        targets = tokenizer(labels, padding=True, truncation=True, max_length=1024, return_tensors = "pt")
        inputs["labels"] = targets["input_ids"]  # Set the target tokens as the 'labels'
        del inputs["attention_mask"]

        # inputs: {"input_ids":tensor([[xxx, xxx, ...], [xxx, xxx, ...], ...]), "labels":tensor([[xxx, xxx, ...], [xxx, xxx, ...], ...])}
        return inputs

    # print(inputs)

    # print("hahaha", labels)
    
    # # TODO the following code is probably wrong!!! even flan-t5 instructed model behaves bad
    # # TODO: for decoder-only model, labels should be inputs+labels
    # is_decoder_model = False
    # if is_decoder_model:
    #     labels = []
    #     for i in range(len(prompt)):
    #         summary = example["summary"][i]
    #         label = prompt[i] + summary
    #         labels.append(label)
    # else:
    #     labels = example["summary"]
    
    # # try to use this method to train flan-t5(see if produce irrelevant results), and test Qwen0.5B(not out of memory, see if produce irrelevant results)
    # inputs = tokenizer(prompt, padding="max_length", truncation=True, max_length=1024, return_tensors = "pt")
    # targets = tokenizer(labels, padding="max_length", truncation=True, max_length=1024, return_tensors = "pt")
    # inputs["labels"] = targets["input_ids"]  # Set the target tokens as the 'labels'

    



# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
print("aa")
first_N_data = dataset["train"].select(range(0, 50)) # TODO
# print("first_N_data", first_N_data[0])
tokenized_datasets = first_N_data.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])

aa


Map: 100%|██████████| 50/50 [00:00<00:00, 981.49 examples/s]


In [None]:
print(tokenized_datasets)

In [None]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

In [None]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

In [38]:
# output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'
output_dir = f'./dialogue-summary-training'

training_args = TrainingArguments(
    output_dir=output_dir,
    save_strategy = "no", # TODO, now don't save checkpoints #"epoch",
    learning_rate=1e-4,
    num_train_epochs=3,
    # weight_decay=0.01,
    logging_steps=1,
    per_device_train_batch_size = 4, # As specified in the paper: batch_size: 32
    # max_steps=1
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets#['train'],
    # eval_dataset=tokenized_datasets['validation']
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
ERROR: ld.so: object '/usr/lib64/libstdc++.so.6' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.


In [39]:
trainer.train()

Step,Training Loss
1,14.7846
2,5.7037
3,3.95
4,2.4858
5,2.6646
6,1.565
7,1.5967
8,1.8317
9,1.722
10,2.398


TrainOutput(global_step=39, training_loss=1.9716713917561066, metrics={'train_runtime': 2.8781, 'train_samples_per_second': 52.118, 'train_steps_per_second': 13.551, 'total_flos': 144626627059200.0, 'train_loss': 1.9716713917561066, 'epoch': 3.0})

In [40]:

original_model.to(torch.bfloat16)  # Convert to bfloat16
original_model.save_pretrained(output_dir, 
                        # save_function=torch.save,  # Use standard PyTorch save
                        # state_dict=model.state_dict(),  # Only save the model weights
                        # safe_serialization=True,  # More efficient serializationsave_optimizer_state=False
                     )
tokenizer.save_pretrained(output_dir,
                         # legacy_format=False  # Use newer, more efficient format
                     )

('./dialogue-summary-training/tokenizer_config.json',
 './dialogue-summary-training/special_tokens_map.json',
 './dialogue-summary-training/vocab.json',
 './dialogue-summary-training/merges.txt',
 './dialogue-summary-training/added_tokens.json',
 './dialogue-summary-training/tokenizer.json')

In [None]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained('./dialogue-summary-training', torch_dtype=torch.bfloat16)

In [42]:
instruct_model = AutoModelForCausalLM.from_pretrained('./dialogue-summary-training', torch_dtype=torch.bfloat16)

In [None]:
model_name = './dialogue-summary-training/checkpoint-500/'

# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code = False)
instruct_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code = False)

print(instruct_model.config.torch_dtype)
print(instruct_model.dtype)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
instruct_model.to(device)

In [None]:
print(original_model.name_or_path)
print(instruct_model.name_or_path)

In [48]:
# index = 200
# dialogue = dataset['test'][index]['dialogue']
# human_baseline_summary = dataset['test'][index]['summary']

# data = dataset['train'][3]
data = dataset["test"][200]
dialogue = data['dialogue']
human_baseline_summary = data['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

print(prompt)

original_model.to(device)
instruct_model.to(device)

inputs = tokenizer(prompt, return_tensors="pt").to(device)
input_ids = inputs.input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1), 
        attention_mask=inputs["attention_mask"],  # Use attention mask
        pad_token_id=tokenizer.eos_token_id  # Set pad token ID
        )
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

print(original_model.name_or_path)
print(instruct_model.name_or_path)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')


Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

Qwen/Qwen2.5-0.5B-instruct
./dialogue-summary-training
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#P