In [1]:
import torch
import torchvision
import transformers
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType,  # Add this import
    PeftModel
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [3]:
dataset = load_dataset("mandarjoshi/trivia_qa", "rc.nocontext")

In [4]:
64*256*8

131072

In [5]:
class TrainingConfig:
    # Reduce these values
    OUTPUT_DIR = "./results/"
    MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
    BATCH_SIZE = 32  # Reduced from 8
    MAX_LENGTH = 256  # Reduced from 512
    GRADIENT_ACCUMULATION_STEPS = 16  # Increase this to compensate for smaller batch size
    LEARNING_RATE = 3e-4
    NUM_EPOCHS = 3
    
    # LoRA config
    LORA_R = 8
    LORA_ALPHA = 32
    LORA_DROPOUT = 0.1
    
    @classmethod
    def estimate_memory_usage(cls):
        """Estimate memory usage based on config"""
        bytes_per_token = 2  # for fp16
        tokens_per_batch = cls.BATCH_SIZE * cls.MAX_LENGTH
        print("Tokens per batch:", tokens_per_batch)
        
        estimated_gpu_memory = bytes_per_token*tokens_per_batch*4
        
        print(f"Estimated GPU memory usage: {estimated_gpu_memory:.2f} Bytes")
        return 1
    
    
class DatasetProcessor:
    """Handles dataset loading and processing"""
    
    @staticmethod
    def process_dataset(tokenizer):
        print("Loading TriviaQA dataset...")
        dataset = load_dataset("trivia_qa", "rc.nocontext")
        
        def format_example(example):
            """Formats each example into a question-answer pair"""
            # print(example)
            # print(example['answer'])
            # print(example['answer']['value'])
            # print("________")
            question = example['question']
            answer = example['answer']['value']
            return f"Question: {question}\nAnswer: {answer}"
        
        def tokenize_function(examples):
            """
            Tokenizes the formatted text and sets up labels for causal language modeling.
            The labels should be the same as input_ids for causal LM training.
            """
            # print(examples)
            texts = [f"question: {q}\n answer: {a['value']}" for q, a in zip(examples['question'], examples['answer'])]
            
            # Tokenize inputs
            tokenized = tokenizer(
                texts,
                truncation=True,
                max_length=TrainingConfig.MAX_LENGTH,
                padding="max_length",
                return_tensors="pt"
            )
            
            # Set up labels for causal language modeling
            tokenized['labels'] = tokenized['input_ids'].clone()
            
            return tokenized
        
        print("Processing training data...")
        train_dataset = dataset['train'].map(
            tokenize_function,
            batched=True,
            remove_columns=dataset['train'].column_names
        )
        
        print("Processing validation data...")
        val_dataset = dataset['validation'].map(
            tokenize_function,
            batched=True,
            remove_columns=dataset['validation'].column_names
        )
        
        # Verify the dataset has the required keys
        print("Dataset keys:", train_dataset.column_names)  # Should include 'input_ids', 'attention_mask', and 'labels'
        
        return train_dataset, val_dataset

In [6]:
class TrainingSetup:
    """Handles training configuration and trainer setup"""
    
    @staticmethod
    def setup_training(model, train_dataset, val_dataset, tokenizer):
        training_args = transformers.TrainingArguments(
            output_dir=TrainingConfig.OUTPUT_DIR,
            learning_rate=TrainingConfig.LEARNING_RATE,
            num_train_epochs=TrainingConfig.NUM_EPOCHS,
            per_device_train_batch_size=TrainingConfig.BATCH_SIZE,
            per_device_eval_batch_size=TrainingConfig.BATCH_SIZE,
            gradient_accumulation_steps=TrainingConfig.GRADIENT_ACCUMULATION_STEPS,
            gradient_checkpointing=True,
            evaluation_strategy="steps",
            eval_steps=500,
            save_strategy="steps",
            save_steps=500,
            save_total_limit=3,
            load_best_model_at_end=True,
            logging_dir=f"{TrainingConfig.OUTPUT_DIR}/logs",
            logging_steps=100,
            # fp16=True,
        )
        
        # Use a custom data collator that handles labels properly
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False  # We're doing causal language modeling, not masked
        )
        
        trainer = transformers.Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator
        )
        
        return trainer


In [7]:
def verify_dataset(dataset, tokenizer):
    """Utility function to verify the dataset is properly formatted"""
    print("\nVerifying dataset format:")
    example = dataset[0]
    print("Dataset keys:", example.keys())
    print("Input shape:", len(example['input_ids']))
    print("Label shape:", len(example['labels']))
    
    # Decode an example to verify the format
    print("\nExample text:")
    print("Input:", tokenizer.decode(example['input_ids']))
    print("Labels:", tokenizer.decode(example['labels']))

In [8]:
class ModelLoader:
    """Handles loading and preparation of the model and tokenizer"""
    
    @staticmethod
    def load_model_and_tokenizer():
        """
        Loads the Qwen model and tokenizer with appropriate configurations.
        - Uses 8-bit quantization to reduce memory usage
        - Sets up proper tokenizer configuration
        """
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            TrainingConfig.MODEL_NAME, 
            trust_remote_code=True
        )
        # Ensure proper padding token is set
        tokenizer.pad_token = tokenizer.eos_token
        
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
        )
        print("Loading model in 8-bit precision...")
        model = AutoModelForCausalLM.from_pretrained(
            TrainingConfig.MODEL_NAME,
            trust_remote_code=True,
            # quantization_config=quantization_config,  # 8-bit quantization for memory efficiency
            device_map="auto",    # Automatically handle model placement on available devices
            torch_dtype=torch.float16
        )
        
        return model, tokenizer

In [9]:
class LoRAPreparation:
    """Handles LoRA adapter configuration and setup"""
    
    @staticmethod
    def prepare_lora_model(model):
        """
        Configures and applies LoRA adapter to the base model.
        Explains which layers are being modified and how many parameters are trainable.
        """
        print("Configuring LoRA adapter...")
        lora_config = LoraConfig(
            r=TrainingConfig.LORA_R,
            lora_alpha=TrainingConfig.LORA_ALPHA,
            # Target the attention layers for adaptation
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
            lora_dropout=TrainingConfig.LORA_DROPOUT,
            bias="none",
            task_type=TaskType.CAUSAL_LM
        )
        
        # Prepare model for k-bit training
        print("Preparing model for training...")
        model = prepare_model_for_kbit_training(model)
        model.gradient_checkpointing_enable()
        model.enable_input_require_grads() 
        # Apply LoRA adapter
        model = get_peft_model(model, lora_config)
        
        # Print trainable parameters information
        model.print_trainable_parameters()
        
        return model

In [10]:
def verify_model_setup(model):
    """Verify model is properly set up for training"""
    print("\nVerifying model setup:")
    
    # Check if any parameters require gradients
    has_grad_params = any(p.requires_grad for p in model.parameters())
    print(f"Has parameters requiring gradients: {has_grad_params}")
    
    # Print trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    all_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params:,} ({trainable_params/all_params:.2%} of total)")
    
    # Check model device
    print(f"Model device: {next(model.parameters()).device}")
    
    return has_grad_params

In [11]:
# Load model and tokenizer
model, tokenizer = ModelLoader.load_model_and_tokenizer()
model = LoRAPreparation.prepare_lora_model(model)
model = model.half()

# Process dataset
train_dataset, val_dataset = DatasetProcessor.process_dataset(tokenizer)

# Verify dataset format
verify_model_setup(model)
verify_dataset(train_dataset, tokenizer)
TrainingConfig.estimate_memory_usage()

# Setup training
trainer = TrainingSetup.setup_training(model, train_dataset, val_dataset, tokenizer)

# Train model
print("Starting training...")
trainer.train()

# Save trained model
trainer.save_model(TrainingConfig.OUTPUT_DIR)
model.save_pretrained(f"{TrainingConfig.OUTPUT_DIR}/adapter")


Loading tokenizer...
Loading model in 8-bit precision...
Configuring LoRA adapter...
Preparing model for training...
'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 1,081,344 || all params: 495,114,112 || trainable%: 0.2184
Loading TriviaQA dataset...


  warn("The installed version of bitsandbytes was compiled without GPU support. "


Processing training data...
Processing validation data...


Map: 100%|██████████| 17944/17944 [00:01<00:00, 11268.44 examples/s]


Dataset keys: ['input_ids', 'attention_mask', 'labels']

Verifying model setup:
Has parameters requiring gradients: True
Trainable parameters: 1,081,344 (0.22% of total)
Model device: mps:0

Verifying dataset format:
Dataset keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Input shape: 256
Label shape: 256

Example text:
Input: question: Which American-born Sinclair won the Nobel Prize for Literature in 1930?
 answer: Sinclair Lewis<|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 