# 🚀 Automated LoRA Training Pipeline

This notebook automatically trains a LoRA adapter for your selected model using your custom dataset.

**Steps:**
1. Install required packages
2. Load your dataset from the backend
3. Configure LoRA parameters
4. Train the model
5. Save the trained adapter and report completion to the backend (the adapter files are then downloaded from the runtime)

In [None]:
# Install required packages
!pip install -q transformers datasets peft accelerate bitsandbytes trl einops requests

## ⚙️ Configuration

**Important:** If you clicked the Colab link from the training interface, the job parameters should be automatically configured. Otherwise, you'll need to set them manually in the next cell.

You can also manually edit `MANUAL_JOB_ID` and `MANUAL_MODEL` in the configuration cell if needed.

In [None]:
# 📝 Manual Configuration (Optional)
# If you are running this notebook by hand, update the values below.
# Non-empty / truthy values set here take priority when the next cell
# resolves JOB_ID, MODEL_NAME and the StarCoder settings.

# Paste your Job ID from the training interface (or leave empty to fall back
# to the JOB_ID environment variable / an auto-generated ID).
MANUAL_JOB_ID = ""  # Example: "550e8400-e29b-41d4-a716-446655440000"

# Select your model (must match one of the available models)
MANUAL_MODEL = "tinyllama-1.1b"  # Options: llama-2-7b, llama-2-13b, mistral-7b, phi-2, gemma-7b, tinyllama-1.1b

# StarCoder Dataset Configuration (leave disabled if using CSV upload)
MANUAL_USE_STARCODER = False  # Set to True to use StarCoder dataset
MANUAL_STARCODER_LANGUAGE = "python"  # Programming language (python, javascript, java, etc.)
MANUAL_STARCODER_MAX_SAMPLES = 10000  # Maximum number of samples to use

# Echo whichever manual overrides are active so the choice is visible in the
# cell output.
if MANUAL_JOB_ID:
    print(f"✓ Using manual Job ID: {MANUAL_JOB_ID}")
if MANUAL_MODEL:
    print(f"✓ Using manual model: {MANUAL_MODEL}")
if MANUAL_USE_STARCODER:
    print(f"✓ Using StarCoder dataset: {MANUAL_STARCODER_LANGUAGE} (max {MANUAL_STARCODER_MAX_SAMPLES} samples)")

In [None]:
import os
import requests
import pandas as pd
from datetime import datetime
import sys

# Detect whether the notebook is running inside Google Colab. The import is
# only a probe: it succeeds on Colab and raises ImportError elsewhere.
try:
    from google.colab import drive  # noqa: F401
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# Seconds to wait for the backend before giving up on the job-config lookup,
# so an unreachable backend cannot hang this cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 15

# Dataset defaults; may be replaced by the manual settings or by the job
# configuration fetched from the backend below.
USE_STARCODER = False
STARCODER_LANGUAGE = None
STARCODER_MAX_SAMPLES = 10000

# Which manual overrides are active. `globals().get(...)` also covers the
# case where the manual configuration cell was never run.
manual_model_set = bool(globals().get('MANUAL_MODEL'))
manual_starcoder_set = bool(globals().get('MANUAL_USE_STARCODER'))

# Priority: Manual values > Environment variables > Auto-generated.
# Empty environment values are ignored so JOB_ID / MODEL_NAME are never "".
if globals().get('MANUAL_JOB_ID'):
    JOB_ID = MANUAL_JOB_ID
elif os.environ.get('JOB_ID'):
    JOB_ID = os.environ['JOB_ID']
else:
    JOB_ID = ('colab' if IN_COLAB else 'local') + '-training-' + datetime.now().strftime('%Y%m%d-%H%M%S')

if manual_model_set:
    MODEL_NAME = MANUAL_MODEL
elif os.environ.get('MODEL_NAME'):
    MODEL_NAME = os.environ['MODEL_NAME']
else:
    MODEL_NAME = "tinyllama-1.1b"  # Default to smallest model for testing

# StarCoder configuration - manual settings first
if manual_starcoder_set:
    USE_STARCODER = True
    STARCODER_LANGUAGE = MANUAL_STARCODER_LANGUAGE
    STARCODER_MAX_SAMPLES = MANUAL_STARCODER_MAX_SAMPLES

API_URL = "https://slmllm-backend.vercel.app"

# Try to fetch the job configuration from the backend (includes StarCoder
# params). IDs starting with 'colab' / 'local' are the auto-generated form
# built above, so no lookup is attempted for them.
if not JOB_ID.startswith(('colab', 'local')):
    try:
        response = requests.get(
            f"{API_URL}/api/train/status/{JOB_ID}",
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        if response.status_code == 200:
            job_config = response.json()
            # Backend values only fill in what was not set manually, in line
            # with the priority order above.
            if not manual_model_set and job_config.get('model'):
                MODEL_NAME = job_config['model']
            if not manual_starcoder_set and job_config.get('dataset_type') == 'starcoder':
                USE_STARCODER = True
                STARCODER_LANGUAGE = job_config.get('starcoder_language', 'python')
                STARCODER_MAX_SAMPLES = job_config.get('starcoder_max_samples', 10000)
                print(f"✓ Loaded StarCoder config from job: {STARCODER_LANGUAGE} ({STARCODER_MAX_SAMPLES} samples)")
        else:
            # Previously ignored silently; make the fallback visible.
            print(f"⚠️ Backend returned status {response.status_code} for job {JOB_ID}")
            print("   Using manual/environment configuration instead")
    except Exception as e:
        # Best effort: the notebook must still run when the backend is down.
        print(f"⚠️ Could not fetch job config from backend: {e}")
        print("   Using manual/environment configuration instead")

print("🔧 Configuration:")
print(f"Job ID: {JOB_ID}")
print(f"Model: {MODEL_NAME}")
print(f"API URL: {API_URL}")
print(f"Environment: {'Google Colab' if IN_COLAB else 'Local'}")
if USE_STARCODER:
    print(f"📊 Dataset: StarCoder ({STARCODER_LANGUAGE}, max {STARCODER_MAX_SAMPLES} samples)")
else:
    print("📊 Dataset: CSV upload from backend")
print("\n✅ Configuration complete! Continue to the next cell.")

In [None]:
# Model mapping: short model keys -> Hugging Face repository IDs.
MODEL_MAP = {
    "llama-2-7b": "meta-llama/Llama-2-7b-hf",
    "llama-2-13b": "meta-llama/Llama-2-13b-hf",
    "mistral-7b": "mistralai/Mistral-7B-v0.1",
    "phi-2": "microsoft/phi-2",
    "gemma-7b": "google/gemma-7b",
    "tinyllama-1.1b": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
}

# Repository used when MODEL_NAME is not one of the keys above.
DEFAULT_MODEL_ID = MODEL_MAP["tinyllama-1.1b"]

if MODEL_NAME in MODEL_MAP:
    model_id = MODEL_MAP[MODEL_NAME]
else:
    # Keep the fallback, but say so: silently training a different model
    # than the one requested (e.g. after a typo) is easy to miss.
    print(
        f"Warning: unknown model '{MODEL_NAME}'; falling back to {DEFAULT_MODEL_ID}. "
        f"Valid options: {', '.join(MODEL_MAP)}"
    )
    model_id = DEFAULT_MODEL_ID
print(f"Using model: {model_id}")

In [None]:
# Function to update training status
def update_status(status, progress=0, error=None, timeout=10):
    """Report job status to the backend on a best-effort basis.

    Posts ``status`` and ``progress`` (plus ``error`` when given) as form
    fields to ``{API_URL}/api/train/update/{JOB_ID}``. Any failure is printed
    and swallowed so that a backend problem never aborts the training run.

    Args:
        status: Status string to send (e.g. "training").
        progress: Progress value to send; printed as a percentage.
        error: Optional error text; only included when truthy.
        timeout: Seconds to wait for the backend before giving up, so a
            stalled connection cannot block the notebook.

    Returns:
        None.
    """
    data = {
        "status": status,
        "progress": progress
    }
    if error:
        data["error"] = error

    try:
        response = requests.post(
            f"{API_URL}/api/train/update/{JOB_ID}",
            data=data,
            timeout=timeout
        )
        # Treat HTTP error responses as failures instead of reporting success.
        response.raise_for_status()
        print(f"Status update: {status} ({progress}%)")
    except Exception as e:
        print(f"Failed to update status: {e}")

# Update status to training
# Sends status "training" with progress 5 to the backend; update_status
# prints and swallows any failure, so this line cannot stop the notebook.
update_status("training", 5)

In [None]:
# Load dataset - either StarCoder from Hugging Face or CSV from backend.
# Whatever happens, this cell leaves a DataFrame `df` with 'input' and
# 'output' columns for the following cells.

# Built-in fallback examples as (input, output) pairs.
_CODE_SAMPLE_ROWS = [
    ("Write a python function:", "def hello_world():\n    print('Hello, World!')"),
    ("Write a python class:", "class MyClass:\n    def __init__(self):\n        self.value = 0"),
    ("Write a python script:", "#!/usr/bin/env python3\nprint('Hello from Python!')"),
]
_QA_SAMPLE_ROWS = [
    ("What is machine learning?",
     "Machine learning is a subset of AI that enables systems to learn from data without being explicitly programmed."),
    ("Explain neural networks",
     "Neural networks are computational models inspired by the human brain, consisting of interconnected nodes that process information."),
    ("What is deep learning?",
     "Deep learning is a subset of machine learning using multi-layered neural networks to learn hierarchical representations."),
    ("What is supervised learning?",
     "Supervised learning is a machine learning approach where models learn from labeled training data to make predictions."),
    ("Explain backpropagation",
     "Backpropagation is an algorithm for training neural networks by calculating gradients and updating weights to minimize error."),
]


def _sample_dataframe(rows):
    """Build the fallback DataFrame ('input', 'output') from (input, output) pairs."""
    fallback = pd.DataFrame(rows, columns=["input", "output"])
    print(f"Created sample dataset with {len(fallback)} examples")
    return fallback


def _load_starcoder_dataframe(language, max_samples):
    """Stream up to `max_samples` StarCoder examples for `language` into a DataFrame.

    Streaming reads only the requested number of examples instead of
    downloading the whole language split before truncating it.

    Raises:
        ValueError: if the stream yields no examples.
        Exception: whatever `datasets` raises (access, network, bad language).
    """
    # Local import: the main training imports live in a later cell.
    from datasets import load_dataset

    print("Streaming bigcode/starcoderdata dataset...")
    stream = load_dataset(
        "bigcode/starcoderdata",
        data_dir=language,
        split="train",
        streaming=True,
    )

    rows = []
    for example in stream.take(int(max_samples)):
        # 'content' holds the source code; guard against a missing/None value.
        code = example.get('content') or ''
        rows.append({
            "input": f"Write a {language} code snippet:",
            "output": code[:2000],  # Limit code length to avoid issues
        })
    if not rows:
        raise ValueError(f"StarCoder returned no samples for language '{language}'")

    print(f"✅ Loaded {len(rows)} code samples")
    return pd.DataFrame(rows, columns=["input", "output"])


def _download_csv_dataframe(api_url, job_id, timeout=60):
    """Download the job's CSV dataset from the backend and load it.

    Returns:
        The loaded DataFrame, or None when the backend answers with a
        non-200 status.

    Raises:
        ValueError: if the CSV lacks the 'input' or 'output' column.
        Exception: request, file or CSV-parsing errors.
    """
    response = requests.get(f"{api_url}/api/train/dataset/{job_id}", timeout=timeout)
    if response.status_code != 200:
        print(f"⚠️ Could not download dataset from backend (status: {response.status_code})")
        return None

    # Save the dataset locally, then load it
    dataset_path = f"training_dataset_{job_id}.csv"
    with open(dataset_path, 'wb') as f:
        f.write(response.content)
    frame = pd.read_csv(dataset_path)

    # Validate dataset format
    if 'input' not in frame.columns or 'output' not in frame.columns:
        raise ValueError("Dataset must have 'input' and 'output' columns")
    return frame


if USE_STARCODER and STARCODER_LANGUAGE:
    print("📥 Loading StarCoder dataset from Hugging Face...")
    print(f"Language: {STARCODER_LANGUAGE}, Max Samples: {STARCODER_MAX_SAMPLES}")

    try:
        df = _load_starcoder_dataframe(STARCODER_LANGUAGE, STARCODER_MAX_SAMPLES)
        print(f"✅ Converted to training format: {len(df)} examples")
        print("\nDataset preview:")
        print(df.head(3))
    except Exception as e:
        # Best effort: keep the pipeline runnable with demo data.
        print(f"❌ Error loading StarCoder dataset: {e}")
        print("Falling back to sample dataset...")
        df = _sample_dataframe(_CODE_SAMPLE_ROWS)

else:
    # Traditional CSV download from backend
    print("📥 Downloading dataset from backend...")

    try:
        df = _download_csv_dataframe(API_URL, JOB_ID)
    except Exception as e:
        print(f"❌ Error downloading dataset: {e}")
        df = None

    if df is None:
        print("Using sample dataset for demonstration...")
        df = _sample_dataframe(_QA_SAMPLE_ROWS)
    else:
        print(f"✅ Successfully downloaded dataset: {len(df)} examples")
        print("\nDataset preview:")
        print(df.head())
        print(f"\n✓ Dataset columns: {list(df.columns)}")
        print(f"✓ Training examples: {len(df)}")

update_status("training", 10)

In [None]:
# Import training libraries
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import Dataset

# Report progress 15 once the training-library imports above have succeeded.
update_status("training", 15)

In [None]:
# Configure 4-bit quantization for efficient training.
# Use bfloat16 compute only on GPUs with compute capability >= 8; older GPUs
# have no native bfloat16 support, so use float16 there.
_bf16_capable = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
compute_dtype = torch.bfloat16 if _bf16_capable else torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype
)

# Load model and tokenizer
# NOTE(review): trust_remote_code=True allows Python code shipped in the
# model repository to run; confirm it is still required for the models in
# MODEL_MAP and drop it if not.
print(f"Loading {model_id}...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Only borrow EOS as the padding token when the tokenizer does not define
# one; overwriting an existing pad token is unnecessary.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Model loaded successfully!")
update_status("training", 25)

In [None]:
# LoRA adapter settings: rank-16 adapters on the attention projections.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,  # Rank
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Prepare the quantized model for k-bit training, then attach the adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

# Count trainable vs. total parameters in a single pass over the model.
trainable_params = 0
total_params = 0
for param in model.parameters():
    count = param.numel()
    total_params += count
    if param.requires_grad:
        trainable_params += count
print(f"Trainable params: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")

update_status("training", 30)

In [None]:
# Prepare dataset for training
def format_instruction(sample):
    """Render one example as an "### Input / ### Output" prompt string.

    Args:
        sample: Mapping with 'input' and 'output' entries.

    Returns:
        The input section and the output section separated by a blank line.
    """
    input_section = f"### Input:\n{sample['input']}"
    output_section = f"### Output:\n{sample['output']}"
    return input_section + "\n\n" + output_section

# Convert to HuggingFace dataset.
# Rows with a missing input or output would be rendered as the literal text
# "nan" by format_instruction, so drop them before building prompts.
train_df = df.dropna(subset=["input", "output"])
dropped_rows = len(df) - len(train_df)
if dropped_rows:
    print(f"Dropped {dropped_rows} rows with missing input/output")
if train_df.empty:
    raise ValueError("No usable training examples: every row is missing 'input' or 'output'")

# preserve_index=False keeps the pandas index out of the dataset columns
# after rows have been dropped.
dataset = Dataset.from_pandas(train_df, preserve_index=False)
dataset = dataset.map(lambda x: {"text": format_instruction(x)})

print(f"Dataset prepared with {len(dataset)} examples")
update_status("training", 35)

In [None]:
# Training arguments.
# Choose mixed precision by hardware instead of hard-coding fp16: bf16 on
# GPUs with compute capability >= 8, fp16 otherwise.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

training_args = TrainingArguments(
    output_dir="./lora_output",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4 per device
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    save_total_limit=2,
    logging_steps=10,
    save_steps=50,
    warmup_steps=10,
    max_grad_norm=0.3,
    group_by_length=True,
    lr_scheduler_type="cosine",
)

# Create trainer
# NOTE(review): `tokenizer`, `dataset_text_field` and `max_seq_length` are
# passed straight to SFTTrainer; confirm the installed trl version accepts
# these keyword arguments.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    dataset_text_field="text",
    max_seq_length=512,
)

print("Trainer configured successfully!")
update_status("training", 40)

In [None]:
# Train the model
print("🚀 Starting training...")
update_status("training", 45)

try:
    trainer.train()
except (Exception, KeyboardInterrupt) as e:
    # Also report a manual interrupt; otherwise the job would stay in the
    # "training" state on the backend. The exception is re-raised so the
    # cell still fails visibly.
    reason = str(e) or type(e).__name__
    print(f"❌ Training failed: {reason}")
    update_status("failed", 0, reason)
    raise
else:
    print("✅ Training completed successfully!")
    update_status("training", 90)

In [None]:
# Write the trainer's model and the tokenizer into one job-specific folder.
output_dir = f"./lora_model_{JOB_ID}"
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")
update_status("training", 95)

In [None]:
# Test the trained model with a single prompt in the training format.
print("\n🧪 Testing trained model...")
test_prompt = "What is machine learning?"
prompt_text = format_instruction({"input": test_prompt, "output": ""})

# Switch to inference mode: disables dropout and skips gradient tracking.
model.eval()
# Place the inputs on the model's own device instead of assuming "cuda".
inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=100)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"\nPrompt: {test_prompt}")
print(f"Response: {response}")

In [None]:
# Complete the training job: report the final status to the backend, then
# print where the adapter was saved and how to use it.
update_status("completed", 100)
print("\n✅ Training pipeline completed successfully!")
print(f"\n📦 Your trained LoRA adapter is ready in: {output_dir}")
print("\n💡 To use this model:")
print("1. Download the adapter files from this Colab")
print("2. Load it with PEFT in your application")
print("3. Merge with base model or use directly")