In [None]:
# Install dependencies

!pip install -q datasets
!pip install -q "transformers>=4.41.0,<5.0.0"
!pip install -q accelerate==0.26.0
!pip install -q peft
!pip install -q bitsandbytes
!pip install -q trl

In [None]:
import glob
import json
import os
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional

import accelerate
import torch
from datasets import load_dataset, Dataset
from google.colab import userdata
from huggingface_hub import login, HfFolder, create_repo, whoami
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)

def _default_run_name() -> str:
    """Build a run name stamped with the current local time (minute resolution)."""
    return f"blocktail-mistral-{datetime.now().strftime('%Y%m%d_%H%M')}"


@dataclass
class Config:
    """Settings for one fine-tuning run.

    Every field has a default, so ``Config()`` yields a ready-to-use
    configuration. ``run_name`` is computed when the instance is created, not
    when the class is defined, so each new ``Config()`` gets a fresh timestamp
    even if this cell is not re-executed.
    """

    # Model settings
    base_model: str = "mistralai/Mistral-7B-Instruct-v0.3"
    cache_dir: str = "/content/model_cache"
    output_dir: str = "/content/model_outputs"

    # HuggingFace settings
    hf_repo_name: Optional[str] = None  # Filled in at runtime as "<username>/blocktail-mistral"
    push_to_hub: bool = True
    private_repo: bool = True

    # Run identification (timestamp taken at instantiation time)
    run_name: str = field(default_factory=_default_run_name)

    # LoRA settings
    r: int = 32
    lora_alpha: int = 64
    target_modules: List[str] = field(default_factory=lambda: ["q_proj", "k_proj", "v_proj", "o_proj"])

    # Training settings
    max_length: int = 4096
    learning_rate: float = 1e-4
    num_train_epochs: int = 5
    per_device_train_batch_size: int = 4
    gradient_accumulation_steps: int = 4
    warmup_ratio: float = 0.1

    # Quantization & Performance
    load_in_4bit: bool = True
    use_nested_quant: bool = True
    bnb_4bit_compute_dtype: str = "float16"  # Name of a torch dtype attribute, resolved via getattr(torch, ...)

def setup_environment():
    """Create working directories, authenticate with the Hub, and prepare the target repo.

    Returns:
        Config: A fresh configuration. ``hf_repo_name`` is set to
        ``"<username>/blocktail-mistral"`` when the username lookup succeeds,
        and ``push_to_hub`` is switched off if the Hub setup fails.

    Raises:
        ValueError: If the ``HF_TOKEN`` Colab secret is empty or missing.
    """
    config = Config()

    # Cache and output directories
    for directory in (config.cache_dir, config.output_dir):
        os.makedirs(directory, exist_ok=True)

    hf_token = userdata.get('HF_TOKEN')
    if not hf_token:
        raise ValueError("HF_TOKEN not found in Colab secrets")

    # login() persists the token itself, so no separate HfFolder.save_token call is needed.
    login(token=hf_token)
    print("HuggingFace token loaded and cached")

    try:
        # Get username and set repo name
        user_info = whoami()
        config.hf_repo_name = f"{user_info['name']}/blocktail-mistral"
        print(f"Will create repo as: {config.hf_repo_name}")
    except Exception as e:
        print(f"Could not setup HF Hub integration: {e}")
        config.push_to_hub = False
        return config

    # Write access is verified by actually creating the repo: the previous check
    # read a 'can_create_repo' key from whoami(), and when that key is absent
    # the check disabled pushing for every user.
    if config.push_to_hub:
        try:
            create_repo(
                config.hf_repo_name,
                private=config.private_repo,
                exist_ok=True
            )
            print(f"HuggingFace repository '{config.hf_repo_name}' ready")
        except Exception as e:
            print(f"Could not create repo: {e}")
            print("Training will continue but model won't be pushed to HF Hub")
            config.push_to_hub = False

    return config

def load_and_process_data():
    """Load every ``data/*.jsonl`` file and combine the records into one dataset.

    Files are read in sorted name order so the resulting row order is
    reproducible. Blank lines are skipped.

    Returns:
        The object produced by ``Dataset.from_list`` for the combined records.

    Raises:
        FileNotFoundError: If no ``.jsonl`` file exists under ``data/``.
        json.JSONDecodeError: If a non-blank line is not valid JSON; the
            message names the file and line number.
        ValueError: If the files contain no records at all.
    """
    # Look for JSONL files in the data directory
    jsonl_files = sorted(glob.glob('data/*.jsonl'))
    if not jsonl_files:
        raise FileNotFoundError("No JSONL files found in data/ directory")

    combined_data = []
    for path in jsonl_files:
        with open(path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                if not line.strip():
                    # Tolerate blank lines (commonly a trailing newline at EOF)
                    continue
                try:
                    combined_data.append(json.loads(line))
                except json.JSONDecodeError as exc:
                    # Same exception type, with the location added to the message
                    raise json.JSONDecodeError(
                        f"{exc.msg} (in {path}, line {line_number})", exc.doc, exc.pos
                    ) from exc

    if not combined_data:
        raise ValueError("JSONL files in data/ directory contain no examples")

    print(f"Loaded {len(combined_data)} examples from {len(jsonl_files)} files")
    dataset = Dataset.from_list(combined_data)
    return dataset

def process_dataset(dataset, tokenizer, max_length):
    """Format prompt/completion pairs and tokenize them for causal-LM training.

    Args:
        dataset: Object exposing ``map(fn, batched=..., remove_columns=...)``
            and ``column_names``, with "prompt" and "completion" columns.
        tokenizer: Callable tokenizer returning "input_ids" and
            "attention_mask" for a list of texts.
        max_length: Length every example is truncated or padded to.

    Returns:
        The mapped dataset with the original columns removed and the
        tokenizer outputs plus a "labels" column added.
    """
    # Label value skipped by the loss; used for padded positions.
    ignore_index = -100

    def format_text(prompt, completion):
        return f"### Question: {prompt}\n\n### Answer: {completion}\n\n### End"

    def tokenize_function(examples):
        texts = [
            format_text(prompt, completion)
            for prompt, completion in zip(examples["prompt"], examples["completion"])
        ]

        # Plain Python lists are enough here: dataset.map stores the result
        # itself, so no framework tensors are requested.
        tokenized = tokenizer(
            texts,
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )

        # Labels mirror input_ids, except padded positions are masked out.
        # main() sets pad_token to eos_token and every row is padded to
        # max_length, so unmasked padding would otherwise dominate the loss.
        tokenized["labels"] = [
            [token if keep else ignore_index for token, keep in zip(ids, mask)]
            for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
        ]

        return tokenized

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names
    )

    print(f"Dataset processed: {len(tokenized_dataset)} examples")
    return tokenized_dataset

def main():
    """Run the QLoRA fine-tuning pipeline end to end.

    Steps: set up the environment, load the quantized base model and its
    tokenizer, attach LoRA adapters, tokenize the JSONL data, train, save a
    local copy (model, tokenizer, training config) and finally push to the
    HuggingFace Hub when ``config.push_to_hub`` is still enabled.
    """
    # Setup environment and config
    config = setup_environment()
    print("Starting training pipeline...")

    # One dtype, taken from the config, drives both the 4-bit compute dtype
    # and the dtype of the non-quantized weights.
    compute_dtype = getattr(torch, config.bnb_4bit_compute_dtype)

    # Quantization config for QLoRA
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=config.load_in_4bit,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=config.use_nested_quant,
    )

    # Load model and tokenizer with caching
    print(f"Loading model from {config.base_model}...")
    # NOTE(review): trust_remote_code=True allows code shipped in the model repo
    # to run locally - confirm it is actually required for this base model.
    model = AutoModelForCausalLM.from_pretrained(
        config.base_model,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        cache_dir=config.cache_dir,
        torch_dtype=compute_dtype,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        config.base_model,
        cache_dir=config.cache_dir
    )
    tokenizer.pad_token = tokenizer.eos_token
    print("Model and tokenizer loaded")

    # Prepare the model for training
    model.config.use_cache = False  # Important for training

    # LoRA config and application
    print("Applying LoRA configuration...")
    lora_config = LoraConfig(
        r=config.r,
        lora_alpha=config.lora_alpha,
        target_modules=config.target_modules,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)

    # Prepare model for training
    model.enable_input_require_grads()
    model.gradient_checkpointing_enable()

    # Print trainable parameters
    model.print_trainable_parameters()

    print("LoRA applied")

    # Load and process dataset
    dataset = load_and_process_data()
    tokenized_dataset = process_dataset(dataset, tokenizer, config.max_length)

    # Use bf16 mixed precision where the GPU supports it (e.g. A100) and fall
    # back to fp16 elsewhere instead of failing on an unsupported bf16 request.
    cuda_available = torch.cuda.is_available()
    use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
    use_fp16 = cuda_available and not use_bf16

    # Training arguments with HF integration
    training_args = TrainingArguments(
        output_dir=os.path.join(config.output_dir, config.run_name),
        num_train_epochs=config.num_train_epochs,
        per_device_train_batch_size=config.per_device_train_batch_size,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        learning_rate=config.learning_rate,
        warmup_ratio=config.warmup_ratio,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=1,
        bf16=use_bf16,
        fp16=use_fp16,
        gradient_checkpointing=True,
        report_to="tensorboard",
        run_name=config.run_name,
        # HuggingFace Hub settings
        push_to_hub=config.push_to_hub,
        hub_model_id=config.hf_repo_name,
        hub_private_repo=config.private_repo,
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )

    # Train
    if config.push_to_hub:
        print(f"Starting training... Model will be pushed to: {config.hf_repo_name}")
    else:
        print("Starting training... Hub push is disabled, model will only be saved locally")
    trainer.train()

    # Save locally first so a failed upload cannot cost the trained weights
    final_output_dir = os.path.join(config.output_dir, config.run_name, "final_model")
    print(f"Saving backup to {final_output_dir}")
    trainer.save_model(final_output_dir)
    tokenizer.save_pretrained(final_output_dir)

    # Save training config
    config_path = os.path.join(final_output_dir, "training_config.json")
    with open(config_path, "w") as f:
        json.dump(config.__dict__, f, indent=2)

    # Push final model. The target repo and its visibility are already carried
    # by hub_model_id / hub_private_repo in TrainingArguments; push_to_hub
    # forwards unknown keyword arguments to model-card creation, so only the
    # commit message is passed here.
    if config.push_to_hub:
        print(f"Pushing final model to HuggingFace Hub: {config.hf_repo_name}")
        trainer.push_to_hub(commit_message=f"Final model - {config.run_name}")

    print("Training complete!")
    if config.push_to_hub:
        print(f"Model available at: https://huggingface.co/{config.hf_repo_name}")
    print(f"Local backup at: {final_output_dir}")

if __name__ == "__main__":
    main()


## Use LoRA for inference (Optional)

In [None]:
# Execute to prevent VRAM exhaustion (Good practice)
import gc
import torch

# Safe deletion of model instances: a missing name just means there is nothing to free
try:
    del model
    print("Deleted existing model from memory!")
except NameError:
    print("No existing 'model' found in memory.")

try:
    del base_model
    print("Deleted existing base model from memory!")
except NameError:
    print("No existing 'base_model' found in memory.")

# Collect garbage first so memory owned by unreachable objects is handed back
# to the CUDA caching allocator, then release the allocator's unused blocks.
gc.collect()  # Force Python garbage collection
torch.cuda.empty_cache()  # Free unused CUDA cache

print("Memory cleared and GPU cache emptied!")

In [None]:
# Inference setup: base model + trained LoRA adapter
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Tokenizer comes from the same foundation checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

# Foundation model - change the identifier if you trained on a different base
base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3",
    device_map="auto",
    torch_dtype="auto",
)

# Point this at the directory holding your trained LoRA weights
lora_path = "/content/model_outputs/blocktail-mistral-20250222_2323/final_model"

# Wrap the base model with the adapter and switch to inference mode
model = PeftModel.from_pretrained(base_model, lora_path)
model.eval()

print("LoRA model loaded successfully for inference!")

In [None]:
# Let's test our fine-tuned model! This cell helps us chat with it and see how well it learned.

def generate_response(prompt, max_tokens=100):
    """Generate a response using the fine-tuned LoRA model.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        prompt: Text passed to the tokenizer as-is.
        max_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        skipped.
    """
    # Send the inputs to wherever the model lives instead of assuming "cuda"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=max_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example Prompt
# Wrap the question in the same "### Question / ### Answer" template that
# process_dataset applied to the training text, so the model sees the format
# it was fine-tuned on.
question = "What are Blocktail Layers?"
prompt = f"### Question: {question}\n\n### Answer:"
response = generate_response(prompt)
print("Model Response:\n", response)

## Merge LoRA into Base Model (Optional)

In [None]:
# Execute to prevent VRAM exhaustion (Good practice)
import gc
import torch

# Safe deletion of model instances: a missing name just means there is nothing to free
try:
    del model
    print("Deleted existing model from memory!")
except NameError:
    print("No existing 'model' found in memory.")

try:
    del base_model
    print("Deleted existing base model from memory!")
except NameError:
    print("No existing 'base_model' found in memory.")

# Collect garbage first so memory owned by unreachable objects is handed back
# to the CUDA caching allocator, then release the allocator's unused blocks.
gc.collect()  # Force Python garbage collection
torch.cuda.empty_cache()  # Free unused CUDA cache

print("Memory cleared and GPU cache emptied!")

In [None]:
# Optional: fold the LoRA weights into the base model so deployment needs no PEFT dependency

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig

# Paths - adjust these three for your environment
base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"  # foundation model
lora_model_path = "/content/model_outputs/blocktail-mistral-20250222_2323/final_model"  # trained LoRA
merged_model_path = "/content/merged_mistral_7b"  # where the merged model is written

# Tokenizer that belongs to the base model
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Base model in half precision, placed automatically across available devices
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Attach the trained adapter, then merge it into the underlying weights
model = PeftModel.from_pretrained(base_model, lora_model_path)
merged_model = model.merge_and_unload()

# Persist the merged model together with its tokenizer
merged_model.save_pretrained(merged_model_path)
tokenizer.save_pretrained(merged_model_path)

print(f"Model merged and saved at {merged_model_path}")

In [None]:
# Sanity check for the merged model: run inference without any PEFT/LoRA wrapper.

# Reload from the merged output directory (half precision, automatic device placement)
model = AutoModelForCausalLM.from_pretrained(
    merged_model_path,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Inference function for testing
def generate_response(prompt, max_tokens=150):
    """Generate a response using the merged model.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        prompt: Text passed to the tokenizer as-is.
        max_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        skipped.
    """
    # Send the inputs to wherever the model lives instead of assuming "cuda"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=max_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Run a test prompt
# Wrap the question in the same "### Question / ### Answer" template that
# process_dataset applied to the training text, so the model sees the format
# it was fine-tuned on.
question = "What is l5 in Blocktail?"
prompt = f"### Question: {question}\n\n### Answer:"
response = generate_response(prompt)
print("Model Response:\n", response)