# Fine tune with full scale dataset

## Import and utilities

In [22]:
import os
import sys

import torch
import wandb

from datasets import Dataset, load_dataset
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from tqdm.auto import tqdm
from transformers import (AutoTokenizer, BitsAndBytesConfig,
                          DataCollatorForLanguageModeling, GPTNeoXForCausalLM,
                          Trainer, TrainingArguments, TrainerCallback)

In [3]:
current_dir = os.path.abspath(os.getcwd())

# Check if 'idl-project' is in the path
if 'idl-project' not in current_dir:
    raise Exception("Current directory '{current_dir}' is not within 'idl-project'")

print(f"✓ Working in '{current_dir}'")
print(f"✓ Directory contains 'idl-project'")

OUTPUT_DIR = "output/"

✓ Working in '/ocean/projects/cis250068p/iwiryadi/idl-project'
✓ Directory contains 'idl-project'


In [4]:
!git pull

Already up to date.


In [5]:
# Append paths for the src folder
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'idl-project')))

# Additional imports 
from src.model import load_fo_model
from src.data import load_flan_dataset, load_summarization_datasets
from src.utils import *

Using device: cuda


## Config and Variables

In [6]:
# Check CUDA
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
Tesla V100-SXM2-32GB


In [None]:
RUN_NAME = 'lora-tuning-1'

# Data parameters
# SUBSET_FRAC                 = 1
# FLAN_SUBSET                 = ['cot_zsopt_data']
SUBSET_FRAC                 = 1
FLAN_SUBSET                 = ['niv2_zsopt_data', 'cot_zsopt_data', 'dialog_zsopt_data']

# Training parameters
BATCH_SIZE                  = 8
GRADIENT_ACCUMULATION_STEPS = 8
LEARNING_RATE               = 1e-5
NUM_EPOCHS                  = 3
MAX_SEQ_LENGTH              = 1024
WARMUP_STEPS                = 1000
LOGGING_STEPS               = 100
SAVE_STEPS                  = 1000
EVAL_STEPS                  = 1000
SAVE_TOTAL_LIMIT            = 3
WEIGHT_DECAY                = 0.01
FP16                        = True

TEST_SIZE                   = 0.05

# BUFFER_SIZE                 = 25000  # For dataset processing


# LoRA configuration
LORA_RANK                   = 8
LORA_ALPHA                  = 32
LORA_DROPOUT                = 0.05
TARGET_MODULES              = ["query_key_value"]  # Target specific attention modules

In [35]:
# Define test prompts
test_prompts = [
    "Who is Barack Obama?",
    "What is Carnegie Mellon University?",
    "Classify this restaurant review sentiment: 'The food was absolutely delicious but the service was extremely slow and the waiter seemed uninterested in helping us.'",
    "Compare and contrast Carnegie Mellon University's Computer Science and Information Systems programs in terms of research focus and career outcomes.",
    "Summarize in one sentence: Dr. Sarah Chen, lead scientist on the mission, called it 'the most significant discovery in the history of space exploration.' The finding suggests that Mars once had a much more hospitable environment with liquid water and possibly a thicker atmosphere. The agency plans to send a sample return mission within the next five years to bring these fossils back to Earth for more detailed analysis. This discovery has profound implications for our understanding of how life might develop throughout the universe."
]

## Dataset Loading and Preparation

In [36]:
# dataset = load_summarization_datasets(subset_names=['niv2_zsopt_data', 'cot_zsopt_data', 'dialog_zsopt_data'])
dataset = load_summarization_datasets(subset_names=FLAN_SUBSET, subset_frac=SUBSET_FRAC)

  0%|          | 0/1 [00:00<?, ?it/s]

Processing cot_zsopt_data...
Dataset loaded successfully: cot_zsopt_data
<class 'datasets.arrow_dataset.Dataset'>
<class 'datasets.arrow_dataset.Dataset'>
Found summarization examples in cot_zsopt_data
Combined dataset with 248 summarization examples


##  LORA

### Configure BitsAndBytes for 4-bit quantization

In [37]:
# Configure BitsAndBytes for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",           # Use 4-bit NormalFloat quantization
    bnb_4bit_use_double_quant=True,      # Use double quantization for additional memory savings
    bnb_4bit_compute_dtype=torch.float32  # Compute in float32 (can also use torch.bfloat16 if available)
)


In [38]:
MODEL_ID       = "EleutherAI/pythia-160m-deduped"
MODEL_REVISION = "step143000"

In [39]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    revision=MODEL_REVISION,
    cache_dir=CACHE_DIR
)

# Ensure the tokenizer has padding token set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model with quantization
model = GPTNeoXForCausalLM.from_pretrained(
    MODEL_ID,
    revision=MODEL_REVISION,
    cache_dir=CACHE_DIR,
    quantization_config=bnb_config,
    device_map="auto"  # Automatically distribute layers across available GPUs
)

# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

In [40]:
# Define the LoRA configuration
# For Pythia models, the target module is "query_key_value" for attention layers
lora_config = LoraConfig(
    r=LORA_RANK,                    # Rank dimension
    lora_alpha=LORA_ALPHA,          # LoRA scaling factor
    target_modules=TARGET_MODULES,  # Target specific attention modules
    lora_dropout=LORA_DROPOUT,      # Dropout probability for LoRA layers
    bias="none",            # Don't apply LoRA to bias terms
    task_type="CAUSAL_LM"   # Task type for causal language modeling
)

# Apply LoRA to the model
peft_model = get_peft_model(model, lora_config)

In [41]:
# Print trainable parameters information
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

print_trainable_parameters(peft_model)

trainable params: 294912 || all params: 120150528 || trainable%: 0.25


##  Dataset Formatting and Tokenization

In [42]:
# Function to format the dataset for causal language modeling
def format_for_clm(examples):
    # Format as: "Instruction: {instruction} Input: {input} Output: {output}"
    # Adjust this format based on your specific dataset structure
    if 'inputs' in examples and 'targets' in examples:
        texts = [
            f"{inp}\n{target}{tokenizer.eos_token}"
            for inp, target in zip(examples['inputs'], examples['targets'])
        ]
    else:
        # Fallback for other dataset structures
        texts = examples['text'] if 'text' in examples else []
    
    return {"text": texts}

# Apply formatting 
if isinstance(dataset, Dataset):
    # For non-streaming datasets
    if 'inputs' in dataset.column_names and 'targets' in dataset.column_names:
        dataset = dataset.map(format_for_clm, batched=True, num_proc=4)
else:
    # For streaming datasets, we need to format each example as it comes
    dataset = dataset.map(lambda example: {
        'text': f"Instruction: {example['inputs']}\nOutput: {example['targets']}" 
        if 'inputs' in example and 'targets' in example 
        else example.get('text', '')
    })

Map (num_proc=4):   0%|          | 0/248 [00:00<?, ? examples/s]

In [43]:

# Tokenize function for the dataset
def tokenize_function(example):
    # Handle single examples for streaming datasets
    text = example["text"] if "text" in example else ""
    
    # Tokenize with padding and truncation
    outputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=1024,  # Adjust based on your needs and GPU memory
        return_tensors="pt"
    )
    
    # Remove the batch dimension for single examples
    for key in outputs:
        if isinstance(outputs[key], torch.Tensor) and outputs[key].ndim > 1:
            outputs[key] = outputs[key].squeeze(0)
    
    # Set labels equal to input_ids for causal language modeling
    outputs["labels"] = outputs["input_ids"].clone()
    
    return outputs

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_function)

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [44]:
split_dataset = tokenized_dataset.train_test_split(test_size=TEST_SIZE)
train_dataset = split_dataset['train']
val_dataset   = split_dataset['test']  # Note: called 'test' by default

In [45]:
# # Since we're using streaming datasets, convert to list for training
# # This is needed because Trainer expects a non-streaming dataset
# # We'll create a buffer of examples for training
# buffer_size = 25000  # Adjust based on your memory constraints
# tokenized_examples = []
# for example in tqdm(tokenized_dataset, total=buffer_size):
#     tokenized_examples.append(example)
#     if len(tokenized_examples) >= buffer_size:
#         break

# print(f"Collected {len(tokenized_examples)} examples for training")

In [46]:
# # Convert to Dataset object for training
# from datasets import Dataset as HFDataset
# train_dataset = HFDataset.from_list(tokenized_examples)

# print(f"Training dataset created with columns: {train_dataset.column_names}")
# print(f"Number of examples: {len(train_dataset)}")

In [None]:
# Generation test callback
class GenerationTestCallback(TrainerCallback):
    """
    Callback to generate text samples at evaluation steps.
    """
    def __init__(self, tokenizer, test_prompts, max_length=100, do_sample=True, 
                 num_beams=2, temperature=0.1, device="cuda"):
        """
        Initialize with tokenizer and test prompts.
        """
        self.tokenizer    = tokenizer
        self.test_prompts = test_prompts
        self.device       = device
        self.max_length   = max_length
        self.do_sample    = do_sample
        self.num_beams    = num_beams
        self.temperature  = temperature
            
            
    def on_evaluate(self, args, state, control, model, **kwargs):
        """
        Run after each evaluation to generate two types of samples:
        1. Free-form completion with sampling
        2. Greedy decoding for deterministic output
        """
        print("GenerationTestCallback")
        model.eval()  # Set model to evaluation mode
        
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        print("\n" + "="*50)
        print(f"Generating samples at step {state.global_step}:")
        print("="*50)
        
        with torch.no_grad():
            for prompt in self.test_prompts:
                inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
                
                # 1. Generate with sampling
                sample_output_ids = model.generate(
                    inputs["input_ids"],
                    attention_mask    = inputs["attention_mask"],
                    max_length        = self.max_length,
                    do_sample         = True,
                    temperature       = self.temperature,
                    pad_token_id      = self.tokenizer.eos_token_id,
                    # eos_token_id      = self.tokenizer.eos_token_id
                )
                
                # 2. Generate with greedy decoding (deterministic) and no sampling
                greedy_output_ids = model.generate(
                    inputs["input_ids"],
                    attention_mask    = inputs["attention_mask"],
                    max_length        = self.max_length,
                    do_sample         = True,
                    pad_token_id      = self.tokenizer.eos_token_id,
                    # pad_token_id      = self.tokenizer.pad_token_id,
                    # eos_token_id      = self.tokenizer.eos_token_id
                )
                
                # Decode both outputs
                sampled_text = self.tokenizer.decode(sample_output_ids[0], skip_special_tokens=True)
                greedy_text  = self.tokenizer.decode(greedy_output_ids[0], skip_special_tokens=True)
                
                # Print the results
                print(f"\nPrompt: {prompt}")
                print(f"Greedy: {greedy_text}")
                print("-"*50)
                
                # Log to W&B if you're using it
                if args.report_to == "wandb":
                    import wandb
                    wandb.log({
                        f"generation/{prompt}/sampled": wandb.Html(
                            f"<b>Step {state.global_step}</b><br>"
                            f"<p><b>Prompt:</b> {prompt}</p>"
                            f"<p><b>Sampled:</b> {sampled_text}</p>"
                        ),
                        f"generation/{prompt}/greedy": wandb.Html(
                            f"<b>Step {state.global_step}</b><br>"
                            f"<p><b>Prompt:</b> {prompt}</p>"
                            f"<p><b>Greedy:</b> {greedy_text}</p>"
                        )
                    }, step=state.global_step)
        
        return control

# Create generation callback
generation_callback = GenerationTestCallback(
    tokenizer     = tokenizer,
    test_prompts  = test_prompts,
    max_length    = 250,
    num_beams     = 3,
    temperature   = 0.1,
)

## Train

In [52]:
import wandb
wandb.init(entity="11785_finetuning", project='ivan-testing', name=RUN_NAME, reinit=True)

0,1
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,█▁▁
train/global_step,▁██

0,1
eval/loss,3.26039
eval/runtime,0.165
eval/samples_per_second,78.807
eval/steps_per_second,12.124
total_flos,41896803041280.0
train/epoch,3.26667
train/global_step,10.0
train_loss,3.137
train_runtime,3.0502
train_samples_per_second,18.032


In [55]:
# Set format to PyTorch
train_dataset.set_format(type="torch")
val_dataset.set_format(type="torch")

# Create training arguments with parameters
training_args = TrainingArguments(
    per_device_train_batch_size  = BATCH_SIZE,
    per_device_eval_batch_size   = BATCH_SIZE,
    gradient_accumulation_steps  = GRADIENT_ACCUMULATION_STEPS,
    warmup_steps                 = WARMUP_STEPS,
    num_train_epochs             = NUM_EPOCHS,
    learning_rate                = LEARNING_RATE,
    fp16                         = FP16,
    logging_steps                = LOGGING_STEPS,
    save_steps                   = SAVE_STEPS,
    eval_strategy                = "steps",
    eval_steps                   = EVAL_STEPS,
    output_dir                   = OUTPUT_DIR,
    optim                        = "paged_adamw_8bit", 
    save_total_limit             = SAVE_TOTAL_LIMIT,
    
    report_to                    = "wandb",
    weight_decay                 = WEIGHT_DECAY,
    # load_best_model_at_end       = True,
    # metric_for_best_model        = "eval_loss",
    # greater_is_better            = False,
)


# Set up the trainer with validation
trainer = Trainer(
    model                     = peft_model,
    args                      = training_args,
    train_dataset             = train_dataset,
    eval_dataset              = val_dataset,
    data_collator             = DataCollatorForLanguageModeling(tokenizer, mlm=False),
    compute_metrics           = None,  
    callbacks                 = [generation_callback],  

)


# Disable caching during training to avoid memory issues
model.config.use_cache = False

# Start training
print("Starting training...")
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
10,No log,3.263658


GenerationTestCallback

Generating samples at step 10:

Prompt: Who is Barack Obama?
Greedy: Who is Barack Obama? The answer is just a blank red marker.

What can Obama say about Democrats?

What do Obama say? If you look at Barack, he says things as though he says Obama. Obama is not even sure that the first thing that Barack says, he is not sure Obama does it.

Do we know what Obama says? Obama tells us all about the world and Obama to that we are in the world.

As for Obama? He doesn
--------------------------------------------------

Prompt: What is Carnegie Mellon University?
Greedy: What is Carnegie Mellon University?

There is a big difference between education, particularly in high schools (in the Carnegie Mellon University system), and academic performance, which is measured by math and science, education. Learning for the school-going school has always been a part of life and can be valued in a variety of ways, such as at school or business. Thus, to learn from someone with e

TrainOutput(global_step=15, training_loss=4.202545674641927, metrics={'train_runtime': 48.4882, 'train_samples_per_second': 24.233, 'train_steps_per_second': 0.309, 'total_flos': 804418618392576.0, 'train_loss': 4.202545674641927, 'epoch': 4.8})

In [None]:
# Save the final model
peft_model.save_pretrained(f"{OUTPUT_DIR}/{RUN_NAME}")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/{RUN_NAME}")

print("Training complete and model saved!")

In [None]:
def load_and_test_model():
    print("\nTesting fine-tuned model:")
    
    # Load the base model and LoRA adapter
    # base_model = GPTNeoXForCausalLM.from_pretrained(
    #     MODEL_ID,
    #     revision=MODEL_REVISION,
    #     cache_dir=CACHE_DIR,
    #     device_map="auto"
    # )
    
    # # Load and apply the fine-tuned LoRA weights
    # fine_tuned_model = PeftModel.from_pretrained(
    #     base_model, 
    #     f"{OUTPUT_DIR}/lora_model",
    #     device_map="auto"
    # )
    
    fine_tuned_model = peft_model
    
    # Test the model with the prompts
    for test_input_string in test_prompts:
        inputs = tokenizer(test_input_string, return_tensors="pt").to(DEVICE)
        # print(tokens[0])
        tokens = fine_tuned_model.generate(
            **inputs, 
            max_length=100, 
            pad_token_id=tokenizer.eos_token_id
        )
        print("Without sampling: " + tokenizer.decode(tokens[0], skip_special_tokens=True))
        
        print("---------------")
        tokens = fine_tuned_model.generate(
            **inputs, 
            max_length=100, 
            pad_token_id=tokenizer.eos_token_id, 
            do_sample=True,
        )
        print("With sampling   : " + tokenizer.decode(tokens[0], skip_special_tokens=True))
        
        print("\n===============")
    
load_and_test_model()

In [56]:
from src.utils import * 
article = "Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him."
summary = "Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday"
adverse_summary = "Daniel Craig is recasted as James Bond again"
# In normal, query is sentence/article, and answer is summary/highlight (S->A direction)
base = calculate_score(summary, article, model, tokenizer, backward=False, query_direction="reverse", debug=True)

print(base['normalized_log_prob'], base['perplexity'])

Settings: False reverse
Context: Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.is a summary of
Target: Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday
Full sentence: Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.is a summary ofHarry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday


-3.3839298650308702 29.48642137573805


  return fn(*args, **kwargs)


In [57]:
wandb.finish()

0,1
eval/loss,█▁
eval/runtime,▁▁
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁▁█
train/global_step,▁▁█

0,1
eval/loss,3.26366
eval/runtime,0.1651
eval/samples_per_second,78.747
eval/steps_per_second,12.115
total_flos,804418618392576.0
train/epoch,4.8
train/global_step,15.0
train_loss,4.20255
train_runtime,48.4882
train_samples_per_second,24.233
