# Fine-tune with the full-scale dataset

## Imports and utilities

In [None]:
# Append paths for the src folder
import sys
import os 
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'idl-project')))

# Additional imports 
from src.utils import *


In [None]:
from src.model import load_fo_model
from src.data import load_flan_dataset, load_summarization_datasets

In [None]:
import torch
import wandb

from datasets import Dataset, load_dataset
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from tqdm.auto import tqdm
from transformers import (AutoTokenizer, BitsAndBytesConfig,
                          DataCollatorForLanguageModeling, GPTNeoXForCausalLM,
                          Trainer, TrainingArguments, TrainerCallback)

In [None]:
# Absolute path of the directory the notebook is running from.
current_dir = os.path.abspath(os.getcwd())

# Check if 'idl-project' is in the path.
# This is a plain substring test on the absolute path, so any path that
# contains 'idl-project' anywhere passes.
if 'idl-project' not in current_dir:
    # f-string so the offending path is actually interpolated into the message.
    raise Exception(f"Current directory '{current_dir}' is not within 'idl-project'")

print(f"✓ Working in '{current_dir}'")
print("✓ Directory contains 'idl-project'")

# Relative output folder; later cells pass it to TrainingArguments and use it
# as the parent directory for the final saved model.
OUTPUT_DIR = "output/"

In [None]:
from notebooks.finetune_lora_config import *

In [None]:
RUN_NAME

In [None]:
!git pull

In [None]:
CACHE_DIR

## Config and Variables

In [None]:
import wandb
# Start a W&B run named after RUN_NAME; reinit=True is passed so the cell can
# be re-executed within the same kernel.
wandb.init(entity="11785_finetuning", project='ivan-testing-team', name=RUN_NAME, reinit=True)
# Upload a copy of the config module to the run right away (policy="now").
wandb.save("notebooks/finetune_lora_config.py", policy="now")

In [None]:
# Check CUDA
# Print whether a GPU is visible. The device name is only queried when CUDA is
# available, because torch.cuda.get_device_name raises on CPU-only machines.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

In [None]:
# Define test prompts
# Fixed prompts used for qualitative generation checks before and during
# training: two factual questions, one sentiment-classification request and
# two summarization requests.
test_prompts = [
    "Who is Barack Obama?",
    "What is Carnegie Mellon University?",
    "Classify this restaurant review sentiment: 'The food was absolutely delicious but the service was extremely slow and the waiter seemed uninterested in helping us.'",
    "Create a summary of the following incident: \nAt just after 1 a.m., three CMU students were walking in the 3600 block of Fifth Avenue approximately one mile from central campus when someone in a white Toyota Tacoma fired a BB pellet gun at them, striking two of the students in the arm and the third in the back, none of whom reported injuries. They did not see who was inside the vehicle and were unable to see the vehicle’s license plate.",
    "Summarize the following text: \nDr. Sarah Chen, lead scientist on the mission, called it 'the most significant discovery in the history of space exploration.' The finding suggests that Mars once had a much more hospitable environment with liquid water and possibly a thicker atmosphere. The agency plans to send a sample return mission within the next five years to bring these fossils back to Earth for more detailed analysis. This discovery has profound implications for our understanding of how life might develop throughout the universe."
]

## Dataset Loading and Preparation

In [None]:
# dataset = load_summarization_datasets(subset_names=['niv2_zsopt_data', 'cot_zsopt_data', 'dialog_zsopt_data'])
# Load the training data. FLAN_SUBSET, SUBSET_FRAC and TASK_DIVERSITY_P are
# defined outside this notebook (presumably in finetune_lora_config — confirm).
dataset = load_summarization_datasets(
    subset_names=FLAN_SUBSET, subset_frac=SUBSET_FRAC, p=TASK_DIVERSITY_P)

In [None]:
# Preview the task name of the first examples (at most 50). The bound is
# clamped to the dataset size so a small subset does not raise IndexError.
# `i` stays defined after the loop; the following cells use it to inspect the
# last previewed example.
for i in range(min(50, len(dataset))):
    print(i, dataset[i]['_task_name'])
    # print(dataset[i]['inputs'])
    # print(dataset[i]['targets'])
    # print()
    # print("="*80)

In [None]:
# `i` is left over from the preview loop in the previous cell, so this shows
# the inputs of the last example that loop visited.
dataset[i]['inputs']

In [None]:
# Targets of the same example (`i` is left over from the preview loop).
dataset[i]['targets']

## LoRA

### Configure BitsAndBytes for 4-bit quantization

In [None]:
# Configure BitsAndBytes for 4-bit quantization
# This config is only passed to from_pretrained when FT_TYPE is not 'full'
# (see the model-loading cell below).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",           # Use 4-bit NormalFloat quantization
    bnb_4bit_use_double_quant=True,      # Use double quantization for additional memory savings
    bnb_4bit_compute_dtype=torch.float32  # Compute in float32 (can also use torch.bfloat16 if available)
)


In [None]:
# Hub identifier and revision of the base checkpoint; both are passed to the
# tokenizer and model from_pretrained calls below.
MODEL_ID       = "EleutherAI/pythia-160m-deduped"
MODEL_REVISION = "step143000"

In [None]:
# Tokenizer for the pinned checkpoint revision.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID, revision=MODEL_REVISION, cache_dir=CACHE_DIR
)

# Reuse EOS as the padding token when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Arguments common to both fine-tuning modes. device_map="auto" lets the
# library place the layers on the available devices.
_model_kwargs = {
    "revision": MODEL_REVISION,
    "cache_dir": CACHE_DIR,
    "device_map": "auto",
}

# Full fine-tuning loads the unquantized weights; every other mode loads the
# model with the 4-bit quantization config.
if FT_TYPE != 'full':
    _model_kwargs["quantization_config"] = bnb_config

model = GPTNeoXForCausalLM.from_pretrained(MODEL_ID, **_model_kwargs)

# Prepare the model for k-bit training
# model = prepare_model_for_kbit_training(model)

In [None]:
# Print trainable parameters information
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Element counts are taken from ``param.numel()`` over
    ``model.named_parameters()``. The line printed contains the trainable
    count, the total count and the trainable share as a percentage with two
    decimals.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter must provide
            ``numel()`` and ``requires_grad``.

    Returns:
        None. The summary is written to stdout.
    """
    # NOTE(review): for 4-bit quantized weights numel() may reflect the packed
    # storage rather than the logical parameter count — confirm before
    # quoting these numbers.
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )

In [None]:
# LoRA adapters are only attached when FT_TYPE is 'lora'; for any other value
# `model` stays as loaded in the previous cell and nothing is printed here.
if FT_TYPE == 'lora': 
    # Define the LoRA configuration
    # For Pythia models, the target module is "query_key_value" for attention layers
    lora_config = LoraConfig(
        r=LORA_RANK,                    # Rank dimension
        lora_alpha=LORA_ALPHA,          # LoRA scaling factor
        target_modules=TARGET_MODULES,  # Target specific attention modules
        lora_dropout=LORA_DROPOUT,      # Dropout probability for LoRA layers
        bias="none",            # Bias handling; NOTE(review): "none" presumably keeps all bias terms frozen — confirm against the PEFT docs
        task_type="CAUSAL_LM"   # Task type for causal language modeling
    )

    # Apply LoRA to the model (rebinds `model` to the PEFT-wrapped model)
    model = get_peft_model(model, lora_config)
    
    # Report how many parameters remain trainable after wrapping
    print_trainable_parameters(model)

## Dataset Formatting and Tokenization

In [None]:
# Function to format the dataset for causal language modeling
def format_for_clm(examples):
    """
    Build the "text" column for causal language modeling from a batch.

    When the batch contains both 'inputs' and 'targets', each pair is rendered
    as the input, a newline, the target, and the tokenizer's EOS token.
    Otherwise the batch's existing 'text' column is passed through, or an
    empty list is returned when there is none.

    Args:
        examples: Batched mapping of column name to list of values.

    Returns:
        dict with a single "text" key.
    """
    has_pairs = 'inputs' in examples and 'targets' in examples
    if not has_pairs:
        # Fallback for other dataset structures
        if 'text' in examples:
            return {"text": examples['text']}
        return {"text": []}

    rendered = []
    for inp, target in zip(examples['inputs'], examples['targets']):
        rendered.append(f"{inp}\n{target}{tokenizer.eos_token}")
    return {"text": rendered}

# Apply formatting
if isinstance(dataset, Dataset):
    # For non-streaming datasets: format in batches, but only when the
    # expected columns exist (otherwise the dataset is left untouched).
    if 'inputs' in dataset.column_names and 'targets' in dataset.column_names:
        dataset = dataset.map(format_for_clm, batched=True, num_proc=16)
else:
    # For streaming datasets, format each example as it comes. The template
    # matches format_for_clm (input, newline, target, EOS) so that both paths
    # produce the same training text; previously this branch used a different
    # "Instruction:/Output:" layout and omitted the EOS token.
    dataset = dataset.map(lambda example: {
        'text': f"{example['inputs']}\n{example['targets']}{tokenizer.eos_token}"
        if 'inputs' in example and 'targets' in example
        else example.get('text', '')
    })

In [None]:
# Tokenize function for the dataset
def tokenize_function(example):
    """
    Tokenize one example's "text" field into fixed-length model inputs.

    The text (or an empty string when the example has no "text" key) is
    padded/truncated to 1024 tokens. Any returned tensor with more than one
    dimension has its leading axis squeezed, and a "labels" entry is added as
    a copy of "input_ids".

    Args:
        example: Mapping for a single dataset row.

    Returns:
        The tokenizer output with the extra "labels" entry.
    """
    # Handle single examples for streaming datasets
    if "text" in example:
        text = example["text"]
    else:
        text = ""

    # Tokenize with padding and truncation
    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=1024,  # Adjust based on your needs and GPU memory
        return_tensors="pt",
    )

    # Remove the batch dimension for single examples
    for name in encoded:
        value = encoded[name]
        if isinstance(value, torch.Tensor) and value.ndim > 1:
            encoded[name] = value.squeeze(0)

    # Causal LM: the labels start out as a copy of the input ids
    encoded["labels"] = encoded["input_ids"].clone()
    return encoded

# Apply tokenization to the dataset
# The map is non-batched, so tokenize_function receives one row at a time,
# spread over 8 worker processes.
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, num_proc=8) # Can't use tokenization batched here for some reason

In [None]:
print(f"Length of training dataset: {len(tokenized_dataset)}")

In [None]:
# Hold out a TEST_SIZE share of the tokenized data for validation.
# NOTE(review): no seed is passed, so the split is presumably different on
# every run — confirm whether reproducibility matters here.
split_dataset = tokenized_dataset.train_test_split(test_size=TEST_SIZE)
train_dataset = split_dataset['train']
val_dataset   = split_dataset['test']  # Note: called 'test' by default

In [None]:
import math

# Expected optimizer updates: batches per epoch are grouped by the
# accumulation factor (at least one update per epoch) and multiplied by the
# number of epochs. NOTE(review): assumes a single device and no max_steps
# override — confirm against the actual Trainer setup.
_train_batches_per_epoch = math.ceil(len(train_dataset) / BATCH_SIZE)
_updates_per_epoch = max(_train_batches_per_epoch // GRADIENT_ACCUMULATION_STEPS, 1)
_expected_train_steps = math.ceil(NUM_EPOCHS * _updates_per_epoch)

# Evaluation does not accumulate gradients, so it takes one step per batch.
_expected_val_steps = math.ceil(len(val_dataset) / BATCH_SIZE)

print("Dataset summary")
print("-"*80)
print("-"*80)

print(f"   Length of training dataset    : {len(train_dataset)}")
print(f"   Length of validation dataset  : {len(val_dataset)}")
print("-"*80)

print(f"   Batch Size: {BATCH_SIZE}")
print(f"   Accumulation Steps: {GRADIENT_ACCUMULATION_STEPS}")
print("-"*80)

print(f"   Expected total training steps : {_expected_train_steps}")
print(f"   Expected validation steps     : {_expected_val_steps}")
print("-"*80)

In [None]:
# # Since we're using streaming datasets, convert to list for training
# # This is needed because Trainer expects a non-streaming dataset
# # We'll create a buffer of examples for training
# buffer_size = 25000  # Adjust based on your memory constraints
# tokenized_examples = []
# for example in tqdm(tokenized_dataset, total=buffer_size):
#     tokenized_examples.append(example)
#     if len(tokenized_examples) >= buffer_size:
#         break

# print(f"Collected {len(tokenized_examples)} examples for training")

In [None]:
# # Convert to Dataset object for training
# from datasets import Dataset as HFDataset
# train_dataset = HFDataset.from_list(tokenized_examples)

# print(f"Training dataset created with columns: {train_dataset.column_names}")
# print(f"Number of examples: {len(train_dataset)}")

In [None]:
# Generation test callback
class GenerationTestCallback(TrainerCallback):
    """
    Callback that generates text for a fixed set of prompts after evaluation.

    For every prompt it produces one sampled and one greedy continuation,
    prints the greedy one, and logs both to Weights & Biases when the
    training arguments report to it.
    """
    def __init__(self, tokenizer, test_prompts, max_length=250, do_sample=True, 
                 num_beams=2, temperature=0.0, repetition_penalty=1.001, device="cuda"):
        """
        Initialize with tokenizer and test prompts.

        Args:
            tokenizer: Tokenizer used to encode prompts and decode outputs.
            test_prompts: Iterable of prompt strings to generate from.
            max_length: ``max_length`` forwarded to ``model.generate``.
            do_sample: Stored for backward compatibility; ``on_evaluate``
                always runs one sampled and one greedy generation.
            num_beams: Stored for backward compatibility; not used by
                ``on_evaluate``.
            temperature: Temperature for the sampled generation. Values that
                are not strictly positive (including the default 0.0) leave
                the model's own default in place.
            repetition_penalty: Forwarded to both ``model.generate`` calls.
            device: Device the encoded prompts are moved to.
        """
        self.tokenizer    = tokenizer
        self.test_prompts = test_prompts
        self.device       = device
        self.max_length   = max_length
        self.do_sample    = do_sample
        self.num_beams    = num_beams
        self.temperature  = temperature
        self.repetition_penalty = repetition_penalty

    @staticmethod
    def _reports_to_wandb(args):
        """Return True when ``args.report_to`` names "wandb" (string or list form)."""
        report_to = getattr(args, "report_to", None)
        if report_to is None:
            return False
        # The original compared against the string "wandb" only; a list value
        # such as ["wandb"] therefore never matched.
        if isinstance(report_to, str):
            report_to = [report_to]
        return "wandb" in report_to

    def on_evaluate(self, args, state, control, model, **kwargs):
        """
        Run after each evaluation to generate two types of samples:
        1. Free-form completion with sampling
        2. Greedy decoding for deterministic output

        Args:
            args: Training arguments; ``report_to`` decides whether to log.
            state: Trainer state; ``global_step`` labels the output.
            control: Trainer control object, returned unchanged.
            model: Model used for generation (switched to eval mode here).

        Returns:
            The ``control`` object that was passed in.
        """
        print("GenerationTestCallback")
        model.eval()  # Set model to evaluation mode
        
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        print("\n" + "="*50)
        print(f"Generating samples at step {state.global_step}:")
        print("="*50)

        log_to_wandb = self._reports_to_wandb(args)

        # Temperature only affects sampling, so it is forwarded to the sampled
        # call alone, and only when it is a usable (strictly positive) value.
        sampling_kwargs = {}
        if self.temperature is not None and self.temperature > 0:
            sampling_kwargs["temperature"] = self.temperature
        
        with torch.no_grad():
            for prompt in self.test_prompts:
                inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
                
                # 1. Generate with sampling
                sample_output_ids = model.generate(
                    inputs["input_ids"],
                    attention_mask    = inputs["attention_mask"],
                    max_length        = self.max_length,
                    do_sample         = True,
                    repetition_penalty = self.repetition_penalty,
                    no_repeat_ngram_size = 3,
                    **sampling_kwargs,
                )
                
                # 2. Generate with greedy decoding (deterministic) and no sampling
                greedy_output_ids = model.generate(
                    inputs["input_ids"],
                    attention_mask    = inputs["attention_mask"],
                    max_length        = self.max_length,
                    do_sample         = False,
                    repetition_penalty = self.repetition_penalty,
                    no_repeat_ngram_size = 3,
                )
                
                # Decode both outputs
                sampled_text = self.tokenizer.decode(sample_output_ids[0], skip_special_tokens=True)
                greedy_text  = self.tokenizer.decode(greedy_output_ids[0], skip_special_tokens=True)
                
                # Print the results
                print(f"\nPrompt: {prompt}")
                print(f"Greedy: {greedy_text}")
                print("-"*50)
                
                # Log to W&B if you're using it
                if log_to_wandb:
                    import html
                    import wandb

                    # Escape the text so markup characters in prompts or
                    # generations cannot break the rendered HTML panel.
                    safe_prompt  = html.escape(prompt)
                    safe_sampled = html.escape(sampled_text)
                    safe_greedy  = html.escape(greedy_text)
                    wandb.log({
                        f"generation/{prompt}/sampled": wandb.Html(
                            f"<b>Step {state.global_step}</b><br>"
                            f"<p><b>Prompt:</b> {safe_prompt}</p>"
                            f"<p><b>Sampled:</b> {safe_sampled}</p>"
                        ),
                        f"generation/{prompt}/greedy": wandb.Html(
                            f"<b>Step {state.global_step}</b><br>"
                            f"<p><b>Prompt:</b> {safe_prompt}</p>"
                            f"<p><b>Greedy:</b> {safe_greedy}</p>"
                        )
                    }, step=state.global_step)
        
        return control

# Create generation callback
# The instance is handed to the Trainer below. num_beams is stored on the
# callback, but on_evaluate never reads it.
generation_callback = GenerationTestCallback(
    tokenizer     = tokenizer,
    test_prompts  = test_prompts,
    max_length    = 250,
    num_beams     = 3,
    temperature   = 0.1,
)

In [None]:
# Show the EOS and padding tokens in use; they are identical when the
# tokenizer had no pad token and EOS was assigned as the fallback.
print(f"eos_token_id: {tokenizer.eos_token_id}, eos_token = {tokenizer.eos_token}")
print(f"pad_token_id: {tokenizer.pad_token_id}, pad_token = {tokenizer.pad_token}")

In [None]:
def load_and_test_model():
    """
    Print two generations for every entry in ``test_prompts``.

    Despite the name, nothing is loaded from disk: the function uses the
    module-level ``model`` and ``tokenizer`` as they currently are. For each
    prompt it first generates without passing ``do_sample`` (so the model's
    generation defaults apply) and then with ``do_sample=True``.

    Returns:
        None. All output is printed.
    """
    print("\nTesting model:")

    # Use the in-memory model rather than reloading a base model and a saved
    # LoRA adapter from OUTPUT_DIR.
    fine_tuned_model = model

    # Generation settings shared by both decoding passes.
    shared_generation_args = {
        "repetition_penalty": 1.001,
        "max_length": 250,
        "no_repeat_ngram_size": 3,
    }

    # Test the model with the prompts
    for test_input_string in test_prompts:
        inputs = tokenizer(test_input_string, return_tensors="pt").to(DEVICE)

        default_tokens = fine_tuned_model.generate(**inputs, **shared_generation_args)
        default_text = tokenizer.decode(default_tokens[0], skip_special_tokens=True)
        print("Without sampling: " + default_text)

        print("---------------")
        sampled_tokens = fine_tuned_model.generate(
            **inputs, **shared_generation_args, do_sample=True
        )
        sampled_text = tokenizer.decode(sampled_tokens[0], skip_special_tokens=True)
        print("With sampling   : " + sampled_text)

        print("\n===============")
    
# Print generations from the current model; this cell sits before the
# training cell, so it serves as a pre-training baseline.
load_and_test_model()

## Train

In [None]:
# Set format to PyTorch
train_dataset.set_format(type="torch")
val_dataset.set_format(type="torch")

# Create training arguments with parameters
# All upper-case hyperparameters are defined outside this cell (presumably in
# the wildcard-imported finetune_lora_config — confirm).
training_args = TrainingArguments(
    per_device_train_batch_size  = BATCH_SIZE,
    per_device_eval_batch_size   = BATCH_SIZE,
    gradient_accumulation_steps  = GRADIENT_ACCUMULATION_STEPS,
    warmup_ratio                 = WARMUP_RATIO,
    num_train_epochs             = NUM_EPOCHS,
    learning_rate                = LEARNING_RATE,
    lr_scheduler_type            = LR_SCHEDULER_TYPE,
    fp16                         = FP16,
    logging_steps                = LOGGING_STEPS,
    save_steps                   = SAVE_STEPS,
    eval_strategy                = "steps",
    eval_steps                   = EVAL_STEPS,
    output_dir                   = OUTPUT_DIR,
    optim                        = "paged_adamw_8bit", 
    save_total_limit             = SAVE_TOTAL_LIMIT,
    
    report_to                    = "wandb",
    weight_decay                 = WEIGHT_DECAY,
    
    
    logging_first_step           = True,  
    max_grad_norm                = 1.0,
    dataloader_num_workers       = 4,
    
    # NOTE(review): load_best_model_at_end presumably requires the save and
    # eval schedules to line up (SAVE_STEPS a multiple of EVAL_STEPS) —
    # confirm the config satisfies this.
    load_best_model_at_end       = True,
    # metric_for_best_model        = "eval_loss",
    # greater_is_better            = False,
)


# Set up the trainer with validation
# NOTE(review): the pad token may have been set to EOS when the tokenizer was
# loaded; this collator presumably masks pad-token labels, which would also
# exclude EOS positions from the loss — confirm.
trainer = Trainer(
    model                     = model,
    args                      = training_args,
    train_dataset             = train_dataset,
    eval_dataset              = val_dataset,
    data_collator             = DataCollatorForLanguageModeling(tokenizer, mlm=False),
    compute_metrics           = None,  
    callbacks                 = [generation_callback],  

)


# Disable caching during training to avoid memory issues
model.config.use_cache = False

# Start training
print("Starting training...")
trainer.train()

In [None]:
# Save the final model
# Model weights and tokenizer files go to the same run-specific folder under
# OUTPUT_DIR. OUTPUT_DIR already ends with "/", so the resulting path contains
# a double slash.
model.save_pretrained(f"{OUTPUT_DIR}/{RUN_NAME}")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/{RUN_NAME}")

print("Training complete and model saved!")

In [None]:
from src.utils import * 

# Example article, a matching summary, and an unrelated summary. Note that
# adverse_summary is defined here but not used in this cell.
article = "Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him."
summary = "Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday"
adverse_summary = "Daniel Craig is recasted as James Bond again"
# In normal, query is sentence/article, and answer is summary/highlight (S->A direction)
# calculate_score comes from the wildcard import of src.utils; the result is
# indexed below by 'normalized_log_prob' and 'perplexity'.
base = calculate_score(summary, article, model, tokenizer, backward=False, query_direction="normal", debug=True)

print(base['normalized_log_prob'], base['perplexity'])

In [None]:
# Same pair scored with query_direction="reverse"; `base` is overwritten with
# the new result.
base = calculate_score(summary, article, model, tokenizer, backward=False, query_direction="reverse", debug=True)

print(base['normalized_log_prob'], base['perplexity'])

In [None]:
wandb.finish()