Resources:
- [Medium blog post](https://medium.com/@tejpal.abhyuday/optimizing-language-model-fine-tuning-with-peft-qlora-integration-and-training-time-reduction-04df39dca72b)

In [None]:
import os
import time
import torch
from torch.utils.data import DataLoader, Dataset
import transformers
from utils import json_to_dataframe, json_to_string_list

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)

Load the dataset

In [None]:
# Location of the source JSON file, relative to this notebook's directory.
filepath = '../../data/vector_veterinary_imaging_2.json'

# Load the same file twice through the project-local `utils` helpers:
# once as a DataFrame and once as a list of strings (judging by the helper
# names; presumably one report per list entry — TODO confirm in utils.py).
df = json_to_dataframe(filepath) 
rad_strings = json_to_string_list(filepath)

In [None]:
# class GPTDatasetV1(Dataset):
#     def __init__(self, articles, tokenizer, max_length, stride):
#         self.input_ids = []
#         self.target_ids = []

#         # Get the token ID for <|endoftext|>
#         # endoftext_token = tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})[0]

#         endoftext_token = tokenizer.eos_token_id
#         if endoftext_token is None:
#             print('No end of text token included, creating one')
#             # Handle the case where the model might not use "<|endoftext|>"
#             endoftext_token = tokenizer.encode("<|endoftext|>", add_special_tokens=False)[0]

#         # Tokenize all articles with end-of-text token
#         all_tokens = []
#         for article in articles:
#             article_tokens = tokenizer.encode(article, allowed_special={"<|endoftext|>"})
#             all_tokens.extend(article_tokens + [endoftext_token])

#         # Use a sliding window to chunk the tokens into overlapping sequences of max_length
#         for i in range(0, len(all_tokens) - max_length, stride):
#             input_chunk = all_tokens[i:i + max_length]
#             target_chunk = all_tokens[i + 1: i + max_length + 1]
#             self.input_ids.append(torch.tensor(input_chunk))
#             self.target_ids.append(torch.tensor(target_chunk))

#     def __len__(self):
#         return len(self.input_ids)

#     def __getitem__(self, idx):
#         return self.input_ids[idx], self.target_ids[idx]

In [None]:
class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from a list of articles.

    Every article is tokenized without special tokens and followed by the
    end-of-text token id; the results are concatenated into one token stream.
    Windows of ``max_length`` tokens are cut every ``stride`` tokens, and each
    window is paired with the same window shifted right by one token.

    Args:
        articles: iterable of strings to tokenize.
        tokenizer: object exposing ``eos_token_id`` and
            ``encode(text, add_special_tokens=False)`` returning a list of ids.
        max_length: number of tokens per window.
        stride: step between the starts of consecutive windows.
    """

    def __init__(self, articles, tokenizer, max_length, stride):
        # Resolve the separator id; fall back to encoding the literal marker
        # when the tokenizer does not define an EOS token.
        eos_id = tokenizer.eos_token_id
        if eos_id is None:
            print('No end of text token included, creating one')
            eos_id = tokenizer.encode("<|endoftext|>", add_special_tokens=False)[0]

        # One flat stream: article tokens, separator, article tokens, separator, ...
        token_stream = []
        for article in articles:
            token_stream += tokenizer.encode(article, add_special_tokens=False) + [eos_id]

        # The upper bound leaves room for the one-token-shifted target window.
        window_starts = range(0, len(token_stream) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_stream[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_stream[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


In [None]:
class GPTDatasetV2(Dataset):
    """Fixed-length causal-LM dataset yielding dict examples.

    All articles are tokenized without special tokens, each followed by the
    end-of-text token id, and concatenated into one token stream. The stream
    is cut into windows of ``max_length`` tokens taken every ``stride`` tokens.

    Args:
        articles: sequence of strings to tokenize.
        tokenizer: object exposing ``eos_token_id`` and
            ``encode(text, add_special_tokens=False)`` returning a list of ids.
        max_length: number of tokens per example.
        stride: step between the starts of consecutive examples.
    """

    def __init__(self, articles, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride

        self.examples = []
        self._prepare_examples(articles)

        print(f"Dataset initialized with {len(self.examples)} examples.")

    def _resolve_eos_token_id(self):
        """Return the separator token id, with the same fallback GPTDatasetV1 uses."""
        eos_token_id = self.tokenizer.eos_token_id
        if eos_token_id is None:
            # Without this fallback a None would end up inside the id list and
            # torch.tensor() would fail on it.
            print('No end of text token included, creating one')
            eos_token_id = self.tokenizer.encode("<|endoftext|>", add_special_tokens=False)[0]
        return eos_token_id

    def _prepare_examples(self, articles):
        """Tokenize ``articles`` and fill ``self.examples`` with fixed-length windows."""
        print(f"Preparing examples from {len(articles)} articles.")

        eos_token_id = self._resolve_eos_token_id()

        # Concatenate all articles with end-of-text token
        all_token_ids = []
        for article in articles:
            article_tokens = self.tokenizer.encode(article, add_special_tokens=False)
            all_token_ids.extend(article_tokens + [eos_token_id])

        # Explicit dtype keeps an empty stream integer-typed as well.
        all_token_ids = torch.tensor(all_token_ids, dtype=torch.long)

        # Create chunks of max_length with stride
        for i in range(0, len(all_token_ids) - self.max_length + 1, self.stride):
            chunk = all_token_ids[i:i + self.max_length]
            self.examples.append(chunk)

        print(f"Created {len(self.examples)} examples.")
        print(f"max_length: {self.max_length}, stride: {self.stride}")
        print(f"Total concatenated length: {len(all_token_ids)}")

        if not self.examples:
            # An empty dataset otherwise only surfaces much later, inside the trainer.
            print(
                f"Warning: no examples were created because the token stream "
                f"({len(all_token_ids)} tokens) is shorter than max_length ({self.max_length})."
            )

    def __len__(self):
        """Number of examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return ``input_ids``, an all-ones ``attention_mask`` and ``labels`` (a copy of the inputs)."""
        if idx >= len(self.examples):
            raise IndexError(f"Index {idx} out of range for dataset with {len(self.examples)} examples.")

        input_ids = self.examples[idx]
        attention_mask = torch.ones_like(input_ids)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": input_ids.clone()
        }

In [None]:
def create_dataset_v2(articles, tokenizer, max_length, stride):
    """Build a GPTDatasetV2 over `articles` with the given tokenizer and window settings."""
    dataset = GPTDatasetV2(articles, tokenizer, max_length, stride)
    return dataset

def create_dataloader_v2(dataset, batch_size=4, shuffle=True, drop_last=False, num_workers=0,
                         collate_tokenizer=None):
    """Wrap `dataset` in a DataLoader that collates with a causal-LM data collator.

    Args:
        dataset: dataset to iterate over (e.g. a GPTDatasetV2).
        batch_size: number of examples per batch.
        shuffle: whether to reshuffle the data every epoch.
        drop_last: whether to drop the final incomplete batch.
        num_workers: number of DataLoader worker processes.
        collate_tokenizer: tokenizer handed to the collator. When omitted, the
            tokenizer stored on the dataset (``dataset.tokenizer``) is used, and
            only if the dataset has none does the notebook-level ``tokenizer``
            serve as a last resort.

    Returns:
        A ``torch.utils.data.DataLoader`` over `dataset`.
    """
    # Prefer an explicit or dataset-bound tokenizer so this function does not
    # depend on a global that is defined much later in the notebook.
    if collate_tokenizer is None:
        collate_tokenizer = getattr(dataset, "tokenizer", None)
    if collate_tokenizer is None:
        collate_tokenizer = tokenizer  # notebook-level fallback (previous behaviour)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
        collate_fn=DataCollatorForLanguageModeling(collate_tokenizer, mlm=False)
    )

In [None]:
# Code below is to help debug any issues with the dataset or data loader

# import torch
# from torch.utils.data import Dataset, DataLoader
# from transformers import DataCollatorForLanguageModeling

# class GPTDatasetV4(Dataset):
#     def __init__(self, articles, tokenizer, max_length, stride):
#         self.tokenizer = tokenizer
#         self.max_length = max_length
#         self.stride = stride

#         self.examples = []
#         self._prepare_examples(articles)

#     def _prepare_examples(self, articles):
#         for article in articles:
#             encodings = self.tokenizer(article, add_special_tokens=False, return_tensors="pt")
#             input_ids = encodings['input_ids'].squeeze()
            
#             # Use a sliding window to chunk the tokens into overlapping sequences
#             for i in range(0, len(input_ids) - self.max_length + 1, self.stride):
#                 chunk = input_ids[i:i + self.max_length]
#                 self.examples.append(chunk)

#         if not self.examples:
#             raise ValueError("No valid examples were generated. Check your input data and parameters.")

#     def __len__(self):
#         return len(self.examples)

#     def __getitem__(self, idx):
#         input_ids = self.examples[idx]
#         attention_mask = torch.ones_like(input_ids)
        
#         return {
#             "input_ids": input_ids,
#             "attention_mask": attention_mask,
#             "labels": input_ids.clone()
#         }

# def create_dataset_v4(articles, tokenizer, max_length, stride):
#     try:
#         return GPTDatasetV4(articles, tokenizer, max_length, stride)
#     except ValueError as e:
#         print(f"Error creating dataset: {e}")
#         return None

# def create_dataloader_v4(dataset, batch_size=4, shuffle=True, drop_last=False, num_workers=0):
#     if dataset is None or len(dataset) == 0:
#         print("Dataset is empty or None. Cannot create DataLoader.")
#         return None

#     return DataLoader(
#         dataset,
#         batch_size=batch_size,
#         shuffle=shuffle,
#         drop_last=drop_last,
#         num_workers=num_workers,
#         collate_fn=DataCollatorForLanguageModeling(tokenizer, mlm=False)
#     )

# # Usage:
# train_dataset = create_dataset_v4(
#     articles=train_data,
#     tokenizer=tokenizer,
#     max_length=max_length,
#     stride=stride
# )

# val_dataset = create_dataset_v4(
#     articles=val_data,
#     tokenizer=tokenizer,
#     max_length=max_length,
#     stride=stride
# )

# if train_dataset:
#     train_loader = create_dataloader_v4(
#         dataset=train_dataset,
#         batch_size=training_batch_size,
#         shuffle=True,
#         drop_last=True,
#         num_workers=0
#     )
# else:
#     print("Failed to create train_loader due to empty dataset.")

# if val_dataset:
#     val_loader = create_dataloader_v4(
#         dataset=val_dataset,
#         batch_size=training_batch_size,
#         shuffle=False,
#         drop_last=False,
#         num_workers=0
#     )
# else:
#     print("Failed to create val_loader due to empty dataset.")

# # For use with PEFT trainer (only if datasets are not None):
# if train_dataset and val_dataset:
#     peft_trainer = transformers.Trainer(
#         model=peft_model,
#         train_dataset=train_dataset,
#         eval_dataset=val_dataset,
#         args=peft_training_args,
#         data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
#     )

#     peft_trainer.train()
# else:
#     print("Cannot create PEFT trainer due to empty dataset(s).")


# print(f"Number of training articles: {len(train_data)}")
# print(f"Number of validation articles: {len(val_data)}")
# print(f"Length of shortest training article: {min(len(article) for article in train_data)}")
# print(f"Length of shortest validation article: {min(len(article) for article in val_data)}")
# print(f"max_length: {max_length}")
# print(f"stride: {stride}")

## Configure quantization

In [None]:
# 4-bit NF4 quantization settings; passed as `quantization_config` when the
# model is loaded. Matrix computations run in float16, double quantization is off.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=False,
)

## Load the model

In [None]:
# Pick CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Device map handed to `from_pretrained`:
#   - with CUDA: "auto", so the model is distributed across the available GPUs
#   - without:   everything is placed on the detected (CPU) device
device_map = "auto" if torch.cuda.is_available() else {"": device}

In [None]:
# Huggingface login (if required)

from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Hub id of the base checkpoint to fine-tune. Exactly one assignment should be
# active; the value is used below to build the local cache path and to load
# the tokenizers.
# model_name = 'microsoft/phi-2'
# model_name = 'microsoft/phi-1_5'
# model_name = 'microsoft/Phi-3.5-mini-instruct'
model_name = 'google/gemma-2-9b'

In [None]:
# Local cache directory for the (quantized) model, derived from the hub id.
local_model_path = os.path.join('../..', 'models', model_name.replace('/', '-'))

# Decide once where the weights come from: the local copy if it exists,
# otherwise the hub.
model_is_cached = os.path.exists(local_model_path)
if model_is_cached:
    model_source = local_model_path
    print(f"Loading model from local path: {local_model_path}")
else:
    model_source = model_name
    print(f"Downloading model from {model_name}")

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally — confirm it is actually required for the selected model.
original_model = AutoModelForCausalLM.from_pretrained(
    model_source,
    device_map=device_map,
    quantization_config=bnb_config,
    trust_remote_code=True
)

# After a fresh download, persist the model so later runs load it from disk.
if not model_is_cached:
    original_model.save_pretrained(local_model_path)
    print(f"Model saved to {local_model_path}")

Load the tokenizer

In [None]:
# Load the tokenizer
# NOTE(review): the tokenizer is always fetched by hub id (`model_name`), even
# when the model weights were loaded from `local_model_path`.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="left",
    # NOTE(review): presumably makes every encode() call append EOS, including
    # the generation prompts below — confirm this is intended.
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False
)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

## Test zero-shot model

In [None]:
%%time
# Fix the random seed so the sampled generations below (do_sample=True) are repeatable.
from transformers import set_seed
seed = 42
set_seed(seed)

In [None]:
def create_dataloader_v1(articles, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over a GPTDatasetV1 created from `articles`.

    The dataset is tokenized with the notebook-level `tokenizer`. Batches use
    the default collation of the (input, target) tensor pairs the dataset yields.

    Args:
        articles: strings to tokenize and window.
        batch_size: number of windows per batch.
        max_length: tokens per window.
        stride: step between the starts of consecutive windows.
        shuffle: whether to reshuffle the data every epoch.
        drop_last: whether to drop the final incomplete batch.
        num_workers: number of DataLoader worker processes.
    """
    return DataLoader(
        GPTDatasetV1(articles, tokenizer, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [None]:
index = 0

In [None]:
rad_strings[index]

In [None]:
def generate_text(model, tokenizer, prompt, max_new_tokens=50, num_return_sequences=1, temperature=0.7, top_k=50, top_p=0.95):
    """Sample continuations of `prompt` from `model`.

    Args:
        model: model exposing `.device` and `.generate(...)`.
        tokenizer: tokenizer exposing `encode`, `decode` and `eos_token_id`.
        prompt: text to continue.
        max_new_tokens: maximum number of tokens generated per sequence.
        num_return_sequences: number of sampled continuations to return.
        temperature: sampling temperature.
        top_k: top-k sampling cutoff.
        top_p: nucleus-sampling cutoff.

    Returns:
        List of decoded strings (prompt plus continuation), special tokens removed.
    """
    # Encode the input prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # The notebook's tokenizers are created with add_eos_token=True, so the
    # encoded prompt can end in EOS. Generating after an end-of-sequence marker
    # defeats the purpose of the prompt, so drop a trailing EOS if present.
    eos_token_id = tokenizer.eos_token_id
    if (
        eos_token_id is not None
        and input_ids.shape[-1] > 1
        and input_ids[0, -1].item() == eos_token_id
    ):
        input_ids = input_ids[:, :-1]

    # Single unpadded prompt: every position is real. Passing the mask
    # explicitly avoids ambiguity, since the pad token equals the EOS token.
    attention_mask = torch.ones_like(input_ids)

    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            num_return_sequences=num_return_sequences,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode and return the generated text
    generated_texts = [tokenizer.decode(seq, skip_special_tokens=True) for seq in output]
    return generated_texts

In [None]:
# Prompt the base (not yet fine-tuned) model is asked to continue
prompt = "Findings: Orthogonal pelvis and orthogonal right shoulder and lateral left shoulder images"

# Sample up to 50 new tokens from the base model
generated_texts = generate_text(original_model, tokenizer, prompt, max_new_tokens=50)

# Show each sample under a numbered header, followed by a blank line
for i, text in enumerate(generated_texts):
    print(f"Generated text {i+1}:", text, "", sep="\n")

Snippet from actual text

In [None]:
rad_strings[index][:300]

## Create dataset

In [None]:
# Read the maximum number of positions from the loaded model's config;
# it is used below as the window size for the training datasets.
context_length = original_model.config.max_position_embeddings
print(f"Context length: {context_length}")

In [None]:
# Sequential (unshuffled) split: the first 90% of the strings are used for
# training, the remaining 10% for validation.
train_ratio = 0.90
split_idx = int(train_ratio * len(rad_strings))
train_data = rad_strings[:split_idx]
val_data = rad_strings[split_idx:]

In [None]:
# Windowing and batching settings shared by the dataset/dataloader cells below.
training_batch_size = 8
# Each example spans the model's full context length.
# NOTE(review): the dataset classes above produce no examples for a split whose
# total token count is below max_length — check that the validation split is
# large enough.
max_length = context_length
# Step between the starts of consecutive windows.
stride = 128

In [None]:
# V1 dataloaders yielding (input, target) tensor pairs.
# NOTE(review): the Trainer cells further down are given `train_dataset` /
# `val_dataset`, not these loaders — confirm whether these are still needed.
train_loader = create_dataloader_v1(
    train_data,
    batch_size=training_batch_size,
    max_length=max_length,
    stride=stride,
    drop_last=True,
    shuffle=True,
    num_workers=0
)

# Validation keeps every batch and a fixed order.
val_loader = create_dataloader_v1(
    val_data,
    batch_size=training_batch_size,
    max_length=max_length,
    stride=stride,
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [None]:
len(train_data)

In [None]:
len(val_data)

In [None]:
# V2 datasets (dict examples with input_ids / attention_mask / labels);
# these are the datasets handed to the Trainer below.
train_dataset = create_dataset_v2(
    articles=train_data,
    tokenizer=tokenizer,
    max_length=max_length,
    stride=stride
)

val_dataset = create_dataset_v2(
    articles=val_data,
    tokenizer=tokenizer,
    max_length=max_length,
    stride=stride
)

# Optional V2 dataloaders, kept disabled: when enabled they overwrite the
# `train_loader` / `val_loader` created by the V1 cell above.
# train_loader = create_dataloader_v2(
#     dataset=train_dataset,
#     batch_size=training_batch_size,
#     shuffle=True,
#     drop_last=True,
#     num_workers=0
# )

# val_loader = create_dataloader_v2(
#     dataset=val_dataset,
#     batch_size=training_batch_size,
#     shuffle=False,
#     drop_last=False,
#     num_workers=0
# )

## Preparing the model for QLoRA

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [None]:
# 2 - Using the prepare_model_for_kbit_training method from PEFT
# Preparing the Model for QLoRA
# NOTE(review): this rebinds `original_model` to the prepared model, so
# re-running this cell applies the preparation to an already prepared model;
# run it once per kernel session.
original_model = prepare_model_for_kbit_training(original_model)

### Setup PEFT for Fine-Tuning

# TODO clean up below and `peft_training_args`

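- Goal: selecting a model should determine a single, matching set of config values (LoRA config and training arguments), instead of keeping one hand-maintained cell per model.
- If Gemma 2 9B still doesn't work with the reduced parameters, try the 2B version instead.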

In [None]:
# For phi model

# Only build the Phi adapter when a Phi checkpoint is actually selected.
# Running this cell unconditionally (e.g. via "Run All" with another model)
# would inject adapters into `original_model` before the model-specific cell
# below wraps the same model a second time.
if "phi" in model_name.lower():
    config = LoraConfig(
        r=32, #Rank
        lora_alpha=32,
        # Attention projections to adapt.
        # NOTE(review): 'dense' is presumably the attention output projection
        # in Phi checkpoints — confirm against the module names of the selected model.
        target_modules=[
            'q_proj',
            'k_proj',
            'v_proj',
            'dense'
        ],
        bias="none",
        lora_dropout=0.05,  # Conventional
        task_type="CAUSAL_LM",
    )
    # 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
    original_model.gradient_checkpointing_enable()
    peft_model = get_peft_model(original_model, config)

In [None]:
# For gemma 9b

# Skip this cell for Phi checkpoints, which have their own LoRA cell; wrapping
# the same `original_model` with get_peft_model twice is not intended.
if "phi" not in model_name.lower():
    config = LoraConfig(
        r=16,  # Reduced rank
        lora_alpha=16,
        # Gemma names its attention output projection 'o_proj'; it has no
        # 'dense' module, so the previous 'dense' entry matched nothing and the
        # output projection was silently left without an adapter.
        target_modules=[
            'q_proj',
            'k_proj',
            'v_proj',
            'o_proj'
        ],
        bias="none",
        lora_dropout=0.05,
        task_type="CAUSAL_LM",
    )
    # 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
    original_model.gradient_checkpointing_enable()
    peft_model = get_peft_model(original_model, config)

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of `model`'s parameters are trainable.

    Despite its name, this function does not print; it returns the summary so
    the caller can print it.

    Args:
        model: object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).

    Returns:
        A three-line string with the trainable count, the total count, and the
        trainable percentage. The percentage is reported as 0.0 for a model
        without parameters instead of raising ZeroDivisionError.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    if all_model_params:
        trainable_percentage = (trainable_model_params / all_model_params) * 100
    else:
        trainable_percentage = 0.0

    # Adjacent f-strings are used instead of backslash continuations inside one
    # literal, which leaked the source indentation into the returned text.
    return (
        f'trainable model parameters: {trainable_model_params}\n'
        f'all model parameters: {all_model_params}\n'
        f'percentage of trainable model parameters: {trainable_percentage} %'
    )

In [None]:
print(print_number_of_trainable_model_parameters(peft_model))

## Train PEFT Adapter
Define training arguments and create Trainer instance.

In [None]:
# Timestamped output directory (whole seconds since the epoch), so every run
# writes its checkpoints to a fresh folder.
run_timestamp = int(time.time())
output_dir = f'./peft-radiology-training-{run_timestamp}'

In [None]:
# For phi model

# Training configuration: effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 4 per device;
# the model is logged, checkpointed and evaluated every 25 steps.
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=1000,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none",
    # Fixed: this is a boolean flag; it was previously the string 'True'.
    overwrite_output_dir=True,
    group_by_length=True,
)
# The generation cache is switched off for training.
peft_model.config.use_cache = False
# NOTE(review): pad_token is set to eos_token in this notebook, and this
# collator builds labels from input_ids with pad positions ignored; presumably
# that also hides the EOS separators from the loss — confirm.
peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
# For gemma 9b

# NOTE(review): these arguments are currently identical to the Phi cell;
# whichever of the two cells runs last defines `peft_training_args` and
# `peft_trainer`.
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4 per device
    max_steps=1000,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none",
    # Fixed: this is a boolean flag; it was previously the string 'True'.
    overwrite_output_dir=True,
    group_by_length=True,
)
# The generation cache is switched off for training.
peft_model.config.use_cache = False
peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
# Below is an alternative set of training parameters

# peft_training_args = TrainingArguments(
#     output_dir=output_dir,
#     warmup_steps=100,  # Increased from 1
#     per_device_train_batch_size=4,  # Increased from 1
#     gradient_accumulation_steps=4,
#     max_steps=2000,  # Increased from 1000
#     learning_rate=3e-4,  # Slightly increased
#     optim="paged_adamw_8bit",
#     logging_steps=50,  # Adjusted
#     logging_dir="./logs",
#     save_strategy="steps",
#     save_steps=50,  # Adjusted
#     evaluation_strategy="steps",
#     eval_steps=50,  # Adjusted
#     do_eval=True,
#     gradient_checkpointing=True,
#     report_to="none",
#     overwrite_output_dir='True',
#     group_by_length=True,
#     fp16=True,  # Added for mixed precision training
#     weight_decay=0.01,  # Added for regularization
#     lr_scheduler_type="cosine",  # Added for better learning rate scheduling
# )

In [None]:
peft_trainer.train()

## Load the PEFT model

In [None]:
from peft import PeftModel

In [None]:
# Reload a fresh quantized copy of the base model from the local cache; the
# trained adapter is attached to this copy below.
# NOTE(review): `original_model` / `peft_model` are still referenced at this
# point, so this holds a second copy of the model in memory — confirm it fits.
base_model = AutoModelForCausalLM.from_pretrained(local_model_path, 
                                                  device_map=device_map,
                                                  quantization_config=bnb_config,
                                                  trust_remote_code=True
                                                 )

In [None]:
# Tokenizer for evaluation, with EOS reused as the padding token.
# NOTE(review): `eval_tokenizer` is assigned again in the next cell, which
# replaces this instance — keep only one of the two definitions.
eval_tokenizer = AutoTokenizer.from_pretrained(model_name, add_bos_token=True, trust_remote_code=True, use_fast=False)
eval_tokenizer.pad_token = eval_tokenizer.eos_token

In [None]:
# Load the tokenizer used for evaluation / generation
eval_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_bos_token=True,
    trust_remote_code=True,
    # NOTE(review): presumably makes encode() append EOS to generation prompts
    # as well — confirm this is wanted for evaluation.
    add_eos_token=True,
    use_fast=False
)
# Take the pad token from this tokenizer itself; the previous version read it
# from the separate training `tokenizer`, which made this cell depend on the
# training section having been run.
eval_tokenizer.pad_token = eval_tokenizer.eos_token

In [None]:
# Attach a saved adapter checkpoint to the freshly loaded base model, for inference only.
# NOTE(review): the checkpoint path is hardcoded to an earlier run's directory;
# it does not follow `output_dir`, which gets a new timestamp on every run —
# update the path (or derive it from `output_dir`) before re-running.
ft_model = PeftModel.from_pretrained(base_model, "./peft-radiology-training-1725673449/checkpoint-225",torch_dtype=torch.float16,is_trainable=False)


In [None]:
# Same prompt as in the zero-shot test, now continued by the fine-tuned model
prompt = "Findings: Orthogonal pelvis and orthogonal right shoulder and lateral left shoulder images"

# Sample up to 50 new tokens from the adapter-augmented model
generated_texts = generate_text(ft_model, eval_tokenizer, prompt, max_new_tokens=50)

# Show each sample under a numbered header, followed by a blank line
for i, text in enumerate(generated_texts):
    print(f"Generated text {i+1}:\n{text}\n")

Snippet from actual text

In [None]:
rad_strings[index][:300]