# Fine-tune with the full-scale dataset

## Imports and utilities

In [1]:
# Append paths for the src folder
import sys
import os 
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'idl-project')))

# Additional imports 
from src.utils import *


In [2]:
from src.model import load_fo_model
from src.data import load_flan_dataset, load_summarization_datasets

Using device: cuda


In [3]:
import torch
import wandb

from datasets import Dataset, load_dataset
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from tqdm.auto import tqdm
from transformers import (AutoTokenizer, BitsAndBytesConfig,
                          DataCollatorForLanguageModeling, GPTNeoXForCausalLM,
                          Trainer, TrainingArguments, TrainerCallback)

In [4]:
# Sanity-check the working directory: the relative paths used later in the
# notebook (OUTPUT_DIR, the config file passed to wandb.save) are resolved
# against the current working directory.
current_dir = os.path.abspath(os.getcwd())

# Check if 'idl-project' is in the path
if 'idl-project' not in current_dir:
    # f-string so the offending path is actually interpolated into the message
    raise Exception(f"Current directory '{current_dir}' is not within 'idl-project'")

print(f"✓ Working in '{current_dir}'")
print("✓ Directory contains 'idl-project'")

# Relative directory used as TrainingArguments(output_dir=...) and for the final save.
OUTPUT_DIR = "output/"

✓ Working in '/ocean/projects/cis250068p/iwiryadi/idl-project'
✓ Directory contains 'idl-project'


In [4]:
from notebooks.finetune_lora_config import *

In [5]:
RUN_NAME

'full-tuning-10-more-data-twice'

In [6]:
!git pull

Already up to date.


In [7]:
CACHE_DIR

'/ocean/projects/cis250068p/shared/caches'

## Config and Variables

In [9]:
# Open the W&B run for this experiment; the run is named after RUN_NAME.
# NOTE(review): reinit=True presumably allows re-running this cell in the same kernel — confirm against the installed wandb version.
import wandb
wandb.init(entity="11785_finetuning", project='ivan-testing-team', name=RUN_NAME, reinit=True)
# Attach the config module to the run (path is relative to the working directory).
wandb.save("notebooks/finetune_lora_config.py")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33miwiryadi[0m ([33midl-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


['/ocean/projects/cis250068p/iwiryadi/idl-project/wandb/run-20250418_121511-w3nm6epq/files/notebooks/finetune_lora_config.py']

In [8]:
# Check CUDA: report availability, and only query the device name when a GPU
# is actually present (torch.cuda.get_device_name raises otherwise, which
# would hide the availability result behind an unrelated traceback).
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected")

True
NVIDIA H100 80GB HBM3


In [10]:
# Define test prompts
# Fixed prompts used for qualitative checks: they are passed to
# GenerationTestCallback and iterated over by load_and_test_model.
test_prompts = [
    "Who is Barack Obama?",
    "What is Carnegie Mellon University?",
    "Classify this restaurant review sentiment: 'The food was absolutely delicious but the service was extremely slow and the waiter seemed uninterested in helping us.'",
    "Compare and contrast Carnegie Mellon University's Computer Science and Information Systems programs in terms of research focus and career outcomes.",
    "Summarize in one sentence: Dr. Sarah Chen, lead scientist on the mission, called it 'the most significant discovery in the history of space exploration.' The finding suggests that Mars once had a much more hospitable environment with liquid water and possibly a thicker atmosphere. The agency plans to send a sample return mission within the next five years to bring these fossils back to Earth for more detailed analysis. This discovery has profound implications for our understanding of how life might develop throughout the universe."
]

## Dataset Loading and Preparation

In [11]:
# Build the training corpus from the subsets named by FLAN_SUBSET in the config module.
# NOTE(review): the meaning of subset_frac and p is defined in src.data.load_summarization_datasets — confirm there.
# dataset = load_summarization_datasets(subset_names=['niv2_zsopt_data', 'cot_zsopt_data', 'dialog_zsopt_data'])
dataset = load_summarization_datasets(
    subset_names=FLAN_SUBSET, subset_frac=SUBSET_FRAC, p=TASK_DIVERSITY_P)

  0%|          | 0/5 [00:00<?, ?it/s]

Processing niv2_zsopt_data...
Downloading data to /ocean/projects/cis250068p/shared/caches


Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Dataset loaded successfully: niv2_zsopt_data
Original niv2_zsopt_data size: 5030900


Filter (num_proc=20):   0%|          | 0/5030900 [00:00<?, ? examples/s]

Found summarization examples in niv2_zsopt_data
Processing flan_zsopt_data...
Downloading data to /ocean/projects/cis250068p/shared/caches


Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/28 [00:00<?, ?it/s]

Dataset loaded successfully: flan_zsopt_data
Original flan_zsopt_data size: 38970972


Filter (num_proc=20):   0%|          | 0/38970972 [00:00<?, ? examples/s]

Found summarization examples in flan_zsopt_data
Processing t0_zsopt_data...
Downloading data to /ocean/projects/cis250068p/shared/caches


Resolving data files:   0%|          | 0/131 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/131 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/44 [00:00<?, ?it/s]

Dataset loaded successfully: t0_zsopt_data
Original t0_zsopt_data size: 41652381


Filter (num_proc=20):   0%|          | 0/41652381 [00:00<?, ? examples/s]

Found summarization examples in t0_zsopt_data
Processing cot_zsopt_data...
Downloading data to /ocean/projects/cis250068p/shared/caches
Dataset loaded successfully: cot_zsopt_data
Original cot_zsopt_data size: 95570


Filter (num_proc=20):   0%|          | 0/95570 [00:00<?, ? examples/s]

Found summarization examples in cot_zsopt_data
Processing dialog_zsopt_data...
Downloading data to /ocean/projects/cis250068p/shared/caches
Dataset loaded successfully: dialog_zsopt_data
Original dialog_zsopt_data size: 2715160


Filter (num_proc=20):   0%|          | 0/2715160 [00:00<?, ? examples/s]

Found summarization examples in dialog_zsopt_data
Combined dataset with 92303 summarization examples


In [18]:
# Preview the first ten rows: row index, originating task name, then a rule.
# (dataset[i]['inputs'] / dataset[i]['targets'] hold the raw text if needed.)
# The loop variable is deliberately left as `i`: later cells index dataset[i].
for i in range(10):
    print(i)
    row = dataset[i]
    print(row['_task_name'], "=" * 80, sep="\n")

0
task510_reddit_tifu_title_summarization
1
task618_amazonreview_summary_text_generation
2
task1579_gigaword_incorrect_summarization
3
task1540_parsed_pdfs_summarization
4
task590_amazonfood_summary_correction_classification
5
task1658_billsum_summarization
6
task1290_xsum_summarization
7
task618_amazonreview_summary_text_generation
8
task510_reddit_tifu_title_summarization
9
task511_reddit_tifu_long_text_summarization


In [19]:
dataset[i]['inputs']

"Instructions: In this task, you are given a Reddit post as a text. Your task is to generate a short summary for this text. The summary must include a situation which caused humor. The summary should be one or two sentences long.\nInput: Text: tifu (it was actually a few weeks ago) and i can never see my favorite friend or beverage in the same way.\ni was just hanging out with a friend - she's really awesome and one of my favorite people to spend time with, so we were having a great time playing pokémon stadium on her n64. she's my type of girl, so there is an endless stream of dr. pepper filling our cups which we kept on the floor by our feet. \ni take my last drink of the night, not knowing that it would change my life forever.\ni notice something solid enter my mouth. at this point, apparently, i am optimistic. i use my tongue to move the object around and attempt to identify it by pressing it against my teeth, cheek, etc. is it a mostly melted ice cube? maybe some sort of condensed

In [20]:
dataset[i]['targets']

'left my soda on the floor, last drink had a roach in it, said roach basically ruined my life.'

## LoRA

### Configure BitsAndBytes for 4-bit quantization

In [21]:
# Configure BitsAndBytes for 4-bit quantization
# This config is only handed to from_pretrained when FT_TYPE is not 'full'.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",           # Use 4-bit NormalFloat quantization
    bnb_4bit_use_double_quant=True,      # Use double quantization for additional memory savings
    bnb_4bit_compute_dtype=torch.float32  # Compute in float32 (can also use torch.bfloat16 if available)
)


In [22]:
# Base checkpoint: model identifier and the pinned revision that are passed to
# every from_pretrained call (tokenizer and model) in this notebook.
MODEL_ID       = "EleutherAI/pythia-160m-deduped"
MODEL_REVISION = "step143000"

In [23]:
# Tokenizer for the pinned checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID, revision=MODEL_REVISION, cache_dir=CACHE_DIR)

# Reuse EOS as the padding token when the tokenizer ships without one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Arguments shared by both fine-tuning modes
load_kwargs = dict(
    revision=MODEL_REVISION,
    cache_dir=CACHE_DIR,
    device_map="auto",  # Automatically distribute layers across available GPUs
)
if FT_TYPE != 'full':
    # Every mode except full fine-tuning loads the model with quantization
    load_kwargs["quantization_config"] = bnb_config

model = GPTNeoXForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)

# Prepare the model for k-bit training
# model = prepare_model_for_kbit_training(model)

In [24]:
# Print trainable parameters information
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: object exposing ``named_parameters()`` that yields
            ``(name, tensor)`` pairs, e.g. a ``torch.nn.Module``.

    Prints a single line with the trainable parameter count, the total
    parameter count and the trainable percentage (two decimals). A model with
    no parameters reports 0.00% instead of raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio: a parameter-free module would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )

In [25]:
if FT_TYPE == 'lora':
    # Adapter hyperparameters all come from the config module.
    # For Pythia models the attention module to target is "query_key_value".
    lora_config = LoraConfig(
        task_type="CAUSAL_LM",          # Causal language modeling task
        bias="none",                    # Bias terms are left untouched
        r=LORA_RANK,                    # Rank of the low-rank update
        lora_alpha=LORA_ALPHA,          # Scaling factor of the update
        lora_dropout=LORA_DROPOUT,      # Dropout applied inside the LoRA layers
        target_modules=TARGET_MODULES,  # Modules that receive adapters
    )

    # Wrap the base model with the adapters and report what is trainable
    model = get_peft_model(model, lora_config)
    print_trainable_parameters(model)

## Dataset Formatting and Tokenization

In [26]:
# Function to format the dataset for causal language modeling
def format_for_clm(examples):
    """
    Build the training text for a batch of examples.

    Args:
        examples: mapping of column name -> list of values, i.e. the batch
            layout passed by ``dataset.map(..., batched=True)``.

    Returns:
        ``{"text": [...]}`` with exactly one string per input row:
        * when both 'inputs' and 'targets' are present, each row becomes the
          input, a newline, the target, and the module-level tokenizer's EOS
          token;
        * otherwise an existing 'text' column is passed through unchanged;
        * otherwise one empty string per row is returned, so the output
          length always matches the batch length.
    """
    if 'inputs' in examples and 'targets' in examples:
        texts = [
            f"{inp}\n{target}{tokenizer.eos_token}"
            for inp, target in zip(examples['inputs'], examples['targets'])
        ]
    elif 'text' in examples:
        # Fallback for datasets that already carry a ready-made text column
        texts = examples['text']
    else:
        # No usable columns: keep row count intact instead of returning []
        batch_size = len(next(iter(examples.values()), []))
        texts = [""] * batch_size

    return {"text": texts}

# Apply formatting
if isinstance(dataset, Dataset):
    # For non-streaming datasets
    if 'inputs' in dataset.column_names and 'targets' in dataset.column_names:
        dataset = dataset.map(format_for_clm, batched=True, num_proc=16)
else:
    # For streaming datasets, format each example as it comes. The template
    # matches format_for_clm (input, newline, target, EOS token) so that both
    # loading modes train on identically formatted text and the model always
    # sees an end-of-sequence marker after the target.
    dataset = dataset.map(lambda example: {
        'text': f"{example['inputs']}\n{example['targets']}{tokenizer.eos_token}"
        if 'inputs' in example and 'targets' in example
        else example.get('text', '')
    })

In [27]:
# Tokenize function for the dataset
def tokenize_function(example):
    """
    Tokenize one example's 'text' field into fixed-length model inputs.

    A missing 'text' key is treated as an empty string. The module-level
    tokenizer is called with padding to ``max_length`` (1024), truncation and
    PyTorch tensors; tensors with more than one dimension then have their
    leading dimension squeezed away. 'labels' is a copy of 'input_ids'.

    Returns:
        The tokenizer output with the squeezed tensors and an added 'labels'
        entry.
    """
    text = ""
    if "text" in example:
        text = example["text"]

    encoded = tokenizer(
        text,
        return_tensors="pt",
        max_length=1024,  # Adjust based on your needs and GPU memory
        truncation=True,
        padding="max_length",
    )

    # Drop the batch dimension that comes with single-example tensors
    for name in list(encoded.keys()):
        value = encoded[name]
        if isinstance(value, torch.Tensor) and value.ndim > 1:
            encoded[name] = value.squeeze(0)

    # Causal language modeling: the labels are the inputs themselves
    encoded["labels"] = encoded["input_ids"].clone()
    return encoded

# Apply tokenization to the dataset
# map is called without batched=True, matching tokenize_function, which is
# written to handle a single example per call.
tokenized_dataset = dataset.map(tokenize_function, num_proc=16)

In [28]:
# Hold out a TEST_SIZE share of the tokenized data for validation. The split
# is seeded so that re-running the notebook yields the same partition and
# evaluation losses stay comparable across runs.
SPLIT_SEED = 42
split_dataset = tokenized_dataset.train_test_split(test_size=TEST_SIZE, seed=SPLIT_SEED)
train_dataset = split_dataset['train']
val_dataset   = split_dataset['test']  # Note: called 'test' by default

In [29]:
# # Since we're using streaming datasets, convert to list for training
# # This is needed because Trainer expects a non-streaming dataset
# # We'll create a buffer of examples for training
# buffer_size = 25000  # Adjust based on your memory constraints
# tokenized_examples = []
# for example in tqdm(tokenized_dataset, total=buffer_size):
#     tokenized_examples.append(example)
#     if len(tokenized_examples) >= buffer_size:
#         break

# print(f"Collected {len(tokenized_examples)} examples for training")

In [30]:
# # Convert to Dataset object for training
# from datasets import Dataset as HFDataset
# train_dataset = HFDataset.from_list(tokenized_examples)

# print(f"Training dataset created with columns: {train_dataset.column_names}")
# print(f"Number of examples: {len(train_dataset)}")

In [31]:
# Generation test callback
class GenerationTestCallback(TrainerCallback):
    """
    Callback that generates text samples after every evaluation.

    For each test prompt two completions are produced, printed, and (when the
    run reports to W&B) logged as HTML panels:
    1. a sampled completion, controlled by ``do_sample`` and ``temperature``;
    2. a greedy completion (``do_sample=False``), which is deterministic.
    """
    def __init__(self, tokenizer, test_prompts, max_length=100, do_sample=True,
                 num_beams=2, temperature=0.1, device="cuda"):
        """
        Initialize with tokenizer and test prompts.

        Args:
            tokenizer: tokenizer used to encode prompts and decode generations.
            test_prompts: iterable of prompt strings.
            max_length: ``max_length`` forwarded to ``model.generate``.
            do_sample: whether the first ("sampled") completion uses sampling.
            num_beams: stored on the instance for backward compatibility; it is
                not forwarded to ``model.generate``.
            temperature: sampling temperature, used only when ``do_sample`` is true.
            device: device the encoded prompts are moved to.
        """
        self.tokenizer    = tokenizer
        self.test_prompts = test_prompts
        self.device       = device
        self.max_length   = max_length
        self.do_sample    = do_sample
        self.num_beams    = num_beams
        self.temperature  = temperature

    @staticmethod
    def _reports_to_wandb(report_to):
        """Return True when 'wandb' is among the configured reporting targets.

        ``report_to`` may be a plain string or a list of strings (the training
        arguments store it as a list), so both shapes are accepted.
        """
        if report_to is None:
            return False
        if isinstance(report_to, str):
            return report_to == "wandb"
        return "wandb" in report_to

    def on_evaluate(self, args, state, control, model=None, **kwargs):
        """
        Run after each evaluation to generate two types of samples:
        1. Free-form completion with sampling
        2. Greedy decoding for deterministic output

        Returns ``control`` unchanged. Does nothing when no model is supplied.
        """
        print("GenerationTestCallback")
        if model is None:
            return control
        model.eval()  # Set model to evaluation mode

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print("\n" + "="*50)
        print(f"Generating samples at step {state.global_step}:")
        print("="*50)

        log_to_wandb = self._reports_to_wandb(args.report_to)

        # Temperature is only meaningful (and only accepted silently) when sampling
        sample_kwargs = {"do_sample": self.do_sample}
        if self.do_sample:
            sample_kwargs["temperature"] = self.temperature

        with torch.no_grad():
            for prompt in self.test_prompts:
                inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

                # 1. Generate with sampling
                sample_output_ids = model.generate(
                    inputs["input_ids"],
                    attention_mask    = inputs["attention_mask"],
                    max_length        = self.max_length,
                    pad_token_id      = self.tokenizer.eos_token_id,
                    **sample_kwargs,
                )

                # 2. Generate with greedy decoding (deterministic) and no sampling
                greedy_output_ids = model.generate(
                    inputs["input_ids"],
                    attention_mask    = inputs["attention_mask"],
                    max_length        = self.max_length,
                    do_sample         = False,
                    pad_token_id      = self.tokenizer.eos_token_id,
                )

                # Decode both outputs
                sampled_text = self.tokenizer.decode(sample_output_ids[0], skip_special_tokens=True)
                greedy_text  = self.tokenizer.decode(greedy_output_ids[0], skip_special_tokens=True)

                # Print the results
                print(f"\nPrompt: {prompt}")
                print(f"Sampled: {sampled_text}")
                print(f"Greedy: {greedy_text}")
                print("-"*50)

                # Log to W&B if the run reports to it
                if log_to_wandb:
                    import wandb
                    wandb.log({
                        f"generation/{prompt}/sampled": wandb.Html(
                            f"<b>Step {state.global_step}</b><br>"
                            f"<p><b>Prompt:</b> {prompt}</p>"
                            f"<p><b>Sampled:</b> {sampled_text}</p>"
                        ),
                        f"generation/{prompt}/greedy": wandb.Html(
                            f"<b>Step {state.global_step}</b><br>"
                            f"<p><b>Prompt:</b> {prompt}</p>"
                            f"<p><b>Greedy:</b> {greedy_text}</p>"
                        )
                    }, step=state.global_step)

        return control

# Create generation callback
# max_length overrides the constructor default of 100; device is left at its
# default ("cuda").
generation_callback = GenerationTestCallback(
    tokenizer     = tokenizer,
    test_prompts  = test_prompts,
    max_length    = 250,
    num_beams     = 3,  # stored on the callback; on_evaluate does not pass it to generate()
    temperature   = 0.1,
)

## Train

In [None]:
# Set format to PyTorch
train_dataset.set_format(type="torch")
val_dataset.set_format(type="torch")


def collate_for_causal_lm(features):
    """
    Stack pre-padded examples into a batch and build the language-modeling labels.

    Every example was tokenized with padding="max_length", so 'input_ids' and
    'attention_mask' already share one length and can simply be stacked.

    Labels are a copy of 'input_ids' with padding positions (attention_mask == 0)
    set to -100 so they are ignored by the loss. Masking by attention mask
    rather than by pad token id matters here: the pad token is the EOS token,
    and DataCollatorForLanguageModeling(mlm=False) masks every label equal to
    pad_token_id, which would also hide the real EOS appended to each target
    and keep the model from learning where to stop.

    Args:
        features: list of per-example mappings containing at least
            'input_ids' and 'attention_mask'.

    Returns:
        dict with batched 'input_ids', 'attention_mask' and 'labels' tensors.
    """
    batch = {
        key: torch.stack([torch.as_tensor(feature[key]) for feature in features])
        for key in ("input_ids", "attention_mask")
    }
    labels = batch["input_ids"].clone()
    labels[batch["attention_mask"] == 0] = -100
    batch["labels"] = labels
    return batch


# Create training arguments with parameters (hyperparameters come from the config module)
training_args = TrainingArguments(
    per_device_train_batch_size  = BATCH_SIZE,
    per_device_eval_batch_size   = BATCH_SIZE,
    gradient_accumulation_steps  = GRADIENT_ACCUMULATION_STEPS,
    warmup_ratio                 = WARMUP_RATIO,
    num_train_epochs             = NUM_EPOCHS,
    learning_rate                = LEARNING_RATE,
    lr_scheduler_type            = LR_SCHEDULER_TYPE,
    fp16                         = FP16,
    logging_steps                = LOGGING_STEPS,
    save_steps                   = SAVE_STEPS,
    eval_strategy                = "steps",
    eval_steps                   = EVAL_STEPS,
    output_dir                   = OUTPUT_DIR,
    optim                        = "paged_adamw_8bit",
    save_total_limit             = SAVE_TOTAL_LIMIT,

    report_to                    = "wandb",
    weight_decay                 = WEIGHT_DECAY,

    logging_first_step           = True,
    max_grad_norm                = 1.0,
    dataloader_num_workers       = 4,

    # NOTE(review): loading the best checkpoint needs checkpoints saved at
    # evaluation steps — confirm SAVE_STEPS is a multiple of EVAL_STEPS in the config.
    load_best_model_at_end       = True,
    # metric_for_best_model        = "eval_loss",
    # greater_is_better            = False,
)


# Set up the trainer with validation
trainer = Trainer(
    model                     = model,
    args                      = training_args,
    train_dataset             = train_dataset,
    eval_dataset              = val_dataset,
    data_collator             = collate_for_causal_lm,
    compute_metrics           = None,
    callbacks                 = [generation_callback],
)


# Disable caching during training to avoid memory issues
model.config.use_cache = False

# Start training
print("Starting training...")
trainer.train()

NameError: name 'OUTPUT_DIR' is not defined

In [None]:
# Persist the final model and its tokenizer side by side in the run's folder
save_dir = f"{OUTPUT_DIR}/{RUN_NAME}"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print("Training complete and model saved!")

In [None]:
def load_and_test_model():
    """
    Print greedy and sampled completions for every prompt in ``test_prompts``.

    Despite the name, nothing is loaded from disk: the module-level ``model``
    and ``tokenizer`` are used as they are. (To evaluate a saved LoRA adapter
    instead, load the base model with ``GPTNeoXForCausalLM.from_pretrained``
    and wrap it with ``PeftModel.from_pretrained`` before generating.)

    For each prompt two generations of at most 100 tokens are printed: one
    with sampling explicitly disabled and one with ``do_sample=True``.
    """
    print("\nTesting fine-tuned model:")

    fine_tuned_model = model

    # The model may still be in training mode after training; switch to eval
    # so dropout does not perturb the "without sampling" output.
    fine_tuned_model.eval()

    # Move prompts to wherever the model lives instead of relying on a global
    target_device = fine_tuned_model.device

    # Test the model with the prompts
    for test_input_string in test_prompts:
        inputs = tokenizer(test_input_string, return_tensors="pt").to(target_device)

        tokens = fine_tuned_model.generate(
            **inputs,
            max_length=100,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False,  # explicit, so the label below is always accurate
        )
        print("Without sampling: " + tokenizer.decode(tokens[0], skip_special_tokens=True))

        print("---------------")
        tokens = fine_tuned_model.generate(
            **inputs,
            max_length=100,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
        )
        print("With sampling   : " + tokenizer.decode(tokens[0], skip_special_tokens=True))

        print("\n===============")
    
# Run the qualitative generation check on the in-memory model.
load_and_test_model()

In [None]:
from src.utils import * 

# Example article / reference-summary pair used to sanity-check calculate_score.
article = "Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him."
summary = "Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday"
# Unrelated summary; defined here but not passed to any call in the visible cells.
adverse_summary = "Daniel Craig is recasted as James Bond again"
# In normal, query is sentence/article, and answer is summary/highlight (S->A direction)
base = calculate_score(summary, article, model, tokenizer, backward=False, query_direction="normal", debug=True)

# The result is indexed like a dict; these two entries are the ones reported.
print(base['normalized_log_prob'], base['perplexity'])

In [None]:
# Same pair scored with query_direction="reverse"; this overwrites `base` from the previous cell.
# NOTE(review): presumably "reverse" swaps the query/answer roles — confirm in src.utils.calculate_score.
base = calculate_score(summary, article, model, tokenizer, backward=False, query_direction="reverse", debug=True)

print(base['normalized_log_prob'], base['perplexity'])

In [None]:
wandb.finish()