# Phase 1: Python & Advanced Python Training

**NEW APPROACH:** Skip Linux commands (model already knows them)

**CRITICAL FIXES:**
1. ✅ Uses proper Qwen3 chat template format
2. ✅ Lower learning rate (5e-5) to prevent overfitting
3. ✅ Weight decay (0.1) for regularization
4. ✅ Validation split to monitor generalization
5. ✅ Save every 100 steps for monitoring
6. ✅ Google Drive auto-backup

**Dataset:** Python code examples (system automation, scripting)

**Expected Time:** 2-3 hours on A100 for a full run; the 300-step test configuration below should take about 20 minutes

In [None]:
# Mount Google Drive FIRST so the backup folder exists before anything else runs
import os

from google.colab import drive

drive.mount('/content/drive')

# Folder on Drive that later cells copy the trained adapter into
DRIVE_OUTPUT = "/content/drive/MyDrive/qwen3_phase1_python_adapters"
os.makedirs(DRIVE_OUTPUT, exist_ok=True)

print("✅ Google Drive mounted")
print("✅ Output will be saved to: " + DRIVE_OUTPUT)

In [None]:
# Install dependencies
# `!` is a notebook shell escape; `-q` keeps pip's output short.
# NOTE(review): no versions are pinned, so the installed releases depend on
# when this cell is run - pin them if the run must be reproducible.
!pip install -q transformers datasets accelerate peft bitsandbytes trl torch

print("\n✅ Installation complete")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

# Report the PyTorch build and, when a GPU is visible, its name and memory.
print("PyTorch version: {}".format(torch.__version__))
cuda_ready = torch.cuda.is_available()
print("CUDA available: {}".format(cuda_ready))
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {gpu_name}")
    # Reported in decimal gigabytes (bytes / 1e9)
    print(f"GPU Memory: {gpu_bytes / 1e9:.2f} GB")

In [None]:
# Configuration - PYTHON DATASET (TEST RUN)
# All later cells read these module-level constants.
MODEL_NAME = "DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B"
OUTPUT_DIR = "./qwen3-phase1-python-TEST"
DATASET_NAME = "iamtarun/python_code_instructions_18k_alpaca"  # Reliable Python dataset

# Training hyperparameters - SHORT TEST RUN
BATCH_SIZE = 4
GRADIENT_ACCUMULATION = 4  # effective batch = BATCH_SIZE * GRADIENT_ACCUMULATION
LEARNING_RATE = 5e-5  # Testing this learning rate
WEIGHT_DECAY = 0.1
NUM_EPOCHS = 1  # Only 1 epoch for quick test (was 3)
MAX_STEPS = 300  # Stop after 300 steps (~20 min)
MAX_SEQ_LENGTH = 2048
MAX_DATASET_SIZE = 50000  # Limit dataset size for test run

# LoRA config
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# CRITICAL: Use proper padding token
# NOTE(review): 151645 is presumably the tokenizer's <|im_end|> id - confirm
# against the tokenizer before reusing this value with another model.
PAD_TOKEN_ID = 151645

# Summary of the settings that matter most for this run.
# (The dataset has no subset/config name, so only its repo id is printed.)
print(f"📊 Dataset: {DATASET_NAME}")
print("✅ Configuration set")
print(f"⚠️  Max dataset size: {MAX_DATASET_SIZE} examples")
print(f"⚠️  LEARNING RATE: {LEARNING_RATE}")
print(f"⚠️  WEIGHT DECAY: {WEIGHT_DECAY}")

In [None]:
# Load tokenizer
print("Loading tokenizer...")
# NOTE(review): trust_remote_code=True lets the model repo run its own Python
# code during loading - keep it only if the repo is trusted.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# CRITICAL: Force same padding token
tokenizer.pad_token_id = PAD_TOKEN_ID
tokenizer.padding_side = "right"

print("✅ Tokenizer loaded")
print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"EOS token ID: {tokenizer.eos_token_id}")

# Sanity check against the configured constant (not a repeated magic number),
# so the check stays correct if PAD_TOKEN_ID is ever changed in the config.
if tokenizer.pad_token_id != PAD_TOKEN_ID:
    raise ValueError(
        f"❌ Wrong pad token! Got {tokenizer.pad_token_id}, expected {PAD_TOKEN_ID}"
    )

In [None]:
# Load model with 4-bit quantization
banner = "=" * 60
print(banner)
print("Loading Model")
print(banner)

# bitsandbytes settings: 4-bit NF4 weights, double quantization enabled,
# compute dtype set to float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

print(f"Loading {MODEL_NAME}...")
load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)

print("Preparing model for LoRA training...")
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

print("✅ Model loaded on device: {}".format(model.device))

In [None]:
# Configure LoRA
print("\nApplying LoRA configuration...")

# Adapters are attached only to the modules with these names.
adapter_targets = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=adapter_targets,
    bias="none",
)

# Wrap the prepared base model; the wrapped model replaces `model` from here on.
model = get_peft_model(model, lora_config)
print("\n✅ LoRA applied")
model.print_trainable_parameters()

In [None]:
# Load Python dataset and split into train/validation
print("Loading Python code dataset...")
dataset = load_dataset(DATASET_NAME, split="train")
print(f"Total dataset size: {len(dataset)} examples")

# Enforce the configured cap. Without this MAX_DATASET_SIZE is announced in
# the config cell but never applied. Shuffle with a fixed seed first so the
# kept subset is a reproducible random sample rather than the first N rows.
if len(dataset) > MAX_DATASET_SIZE:
    dataset = dataset.shuffle(seed=42).select(range(MAX_DATASET_SIZE))
    print(f"Dataset capped to {len(dataset)} examples")

# Create train/validation split (90/10); fixed seed keeps the split stable
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
val_dataset = dataset['test']

print(f"Train set: {len(train_dataset)} examples")
print(f"Validation set: {len(val_dataset)} examples")
print("\nSample entry:")
print(train_dataset[0])

In [None]:
# Format dataset using PROPER Qwen3 chat template
def format_chat_template(example):
    """Render one Alpaca-style record as a single chat-formatted string.

    Args:
        example: Mapping with optional 'instruction', 'input' and 'output'
            keys; a missing key is treated as an empty string.

    Returns:
        A dict with one key, "text", holding the conversation rendered by the
        module-level tokenizer's chat template (not tokenized).
    """
    prompt = example.get('instruction', '')
    extra_input = example.get('input', '')

    # Alpaca records may carry a separate input; append it to the instruction.
    if extra_input:
        prompt = f"{prompt}\n\nInput: {extra_input}"

    conversation = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": example.get('output', '')},
    ]

    # The assistant turn is already present, so no generation prompt is added.
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

# Add a chat-formatted "text" column to both splits.
print("Formatting datasets with Qwen3 chat template...")
train_dataset = train_dataset.map(format_chat_template)
val_dataset = val_dataset.map(format_chat_template)

print("\n✅ Formatted with chat template")
print("\nFormatted example:")

# Show only the first 500 characters of one rendered example.
print(train_dataset[0]['text'][:500])

In [None]:
# Training arguments - SHORT TEST RUN
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    # Per the Trainer docs a positive max_steps takes precedence over
    # num_train_epochs, so this run stops after MAX_STEPS optimizer steps.
    max_steps=MAX_STEPS,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,

    # Apply the configured sequence limit. Without this MAX_SEQ_LENGTH is
    # never used and TRL falls back to its own default length.
    # NOTE(review): `max_length` is the name in current TRL releases; older
    # releases called it `max_seq_length` - confirm against the installed version.
    max_length=MAX_SEQ_LENGTH,

    # CRITICAL FIXES:
    learning_rate=LEARNING_RATE,  # 5e-5
    weight_decay=WEIGHT_DECAY,    # 0.1 for regularization

    # Evaluation settings
    eval_strategy="steps",
    eval_steps=100,
    per_device_eval_batch_size=BATCH_SIZE,

    # Logging / checkpointing (save cadence matches eval cadence, which
    # load_best_model_at_end relies on)
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,

    # Training settings
    fp16=True,
    optim="paged_adamw_8bit",
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    max_grad_norm=1.0,  # Gradient clipping

    report_to="none",

    # CRITICAL: Load best model at end (best = lowest eval_loss)
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

print("✅ Training arguments configured")
print(f"⚠️  Learning rate: {LEARNING_RATE}")
print(f"⚠️  Weight decay: {WEIGHT_DECAY}")
print("⚠️  Will evaluate and save every 100 steps")

In [None]:
# Create trainer with validation set
print("\n" + "="*60)
print("CREATING TRAINER WITH VALIDATION")
print("="*60)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Hand over the tokenizer configured earlier so its forced pad token and
    # padding side are the ones used for training. Otherwise the trainer
    # loads a fresh tokenizer on its own and the override is lost.
    # NOTE(review): `processing_class` is the current TRL parameter name;
    # older releases used `tokenizer=` - confirm against the installed version.
    processing_class=tokenizer,
)

# A positive MAX_STEPS overrides the epoch count, so report the number of
# optimizer steps that will actually run, not the epoch-derived figure.
steps_per_epoch = len(train_dataset) // (BATCH_SIZE * GRADIENT_ACCUMULATION)
planned_steps = MAX_STEPS if MAX_STEPS > 0 else steps_per_epoch * NUM_EPOCHS

print(f"\nTraining on {len(train_dataset)} examples...")
print(f"Validating on {len(val_dataset)} examples...")
print(f"Total steps: {planned_steps}")
print("\n⚠️  MONITOR: Watch for loss = 0.0 before step 500 (indicates overfitting)")
print("⚠️  HEALTHY: Loss should drop gradually and plateau around 0.3-0.5")

In [None]:
# Start training
print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60)
print("\nStarting training...\n")

trainer.train()

print("\n" + "="*60)
print("✅ TRAINING COMPLETE!")
print("="*60)

# Show final metrics
# The last log_history record is the end-of-training summary, which has no
# 'loss' or 'eval_loss' key. Walk the history backwards and take the most
# recent record that actually carries each metric.
history = trainer.state.log_history
final_train_loss = next(
    (record['loss'] for record in reversed(history) if 'loss' in record), 'N/A'
)
final_eval_loss = next(
    (record['eval_loss'] for record in reversed(history) if 'eval_loss' in record), 'N/A'
)
print(f"\nFinal train loss: {final_train_loss}")
print(f"Final eval loss: {final_eval_loss}")

# A very low training loss on its own is only a hint; eval loss confirms it.
if isinstance(final_train_loss, float) and final_train_loss < 0.1:
    print("\n⚠️  WARNING: Train loss is very low - model may have overfit!")
    print("⚠️  Check validation loss to confirm generalization")

In [None]:
# Save the trained LoRA adapter to LOCAL storage first
import os

print("\nSaving LoRA adapter to local storage...")
# Model first, then tokenizer, both into the same output folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

# Fail loudly if the expected adapter weights file was not written.
adapter_path = f"{OUTPUT_DIR}/adapter_model.safetensors"
if not os.path.exists(adapter_path):
    print(f"❌ ERROR: Adapter file not found at {adapter_path}")
    raise FileNotFoundError("Adapter not saved!")

adapter_size = os.path.getsize(adapter_path) / 1e6  # bytes -> MB
print(f"✅ Adapter saved locally: {adapter_size:.1f} MB")

In [None]:
# CRITICAL: Copy to Google Drive for permanent storage
import os
import shutil

print("\n" + "="*60)
print("COPYING TO GOOGLE DRIVE (PERMANENT BACKUP)")
print("="*60)

drive_backup = DRIVE_OUTPUT
# The new copy is built next to the final location and swapped in only after
# it has been verified, so a failed copy never destroys the previous backup.
staging_dir = f"{drive_backup}.incoming"

# Refuse to touch Drive at all if there is nothing valid to back up.
local_adapter_path = f"{OUTPUT_DIR}/adapter_model.safetensors"
if not os.path.exists(local_adapter_path):
    print("❌ ERROR: Backup failed!")
    raise FileNotFoundError(
        f"Drive backup failed! Local adapter missing at {local_adapter_path}; "
        "existing backup left untouched"
    )

# Clear leftovers from an earlier interrupted run, then copy the whole
# adapter directory into the staging folder.
if os.path.exists(staging_dir):
    shutil.rmtree(staging_dir)
shutil.copytree(OUTPUT_DIR, staging_dir)

if not os.path.exists(f"{staging_dir}/adapter_model.safetensors"):
    shutil.rmtree(staging_dir, ignore_errors=True)
    print("❌ ERROR: Backup failed!")
    raise FileNotFoundError("Drive backup failed!")

# Staged copy is complete: replace the old backup with it.
if os.path.exists(drive_backup):
    shutil.rmtree(drive_backup)
shutil.move(staging_dir, drive_backup)

# Verify the copy
drive_adapter_path = f"{drive_backup}/adapter_model.safetensors"
if os.path.exists(drive_adapter_path):
    drive_size = os.path.getsize(drive_adapter_path) / 1e6  # bytes -> MB
    print("\n✅ BACKUP COMPLETE!")
    print(f"✅ Adapter backed up to Google Drive: {drive_size:.1f} MB")
    print(f"✅ Location: {drive_backup}")
    print("\n🎉 You can now find the adapters in Google Drive under:")
    print("   MyDrive/qwen3_phase1_python_adapters/")
else:
    print("❌ ERROR: Backup failed!")
    raise FileNotFoundError("Drive backup failed!")

# ✅ Training Complete!

Your LoRA adapters are now safely stored in:
- **Google Drive:** `MyDrive/qwen3_phase1_python_adapters/`
- **Local sync:** Should appear in `~/GoogleDrive/qwen3_phase1_python_adapters/`

## This Phase:

1. ✅ **Python code training** - System automation, scripting
2. ✅ **Proper chat template** - Qwen3 format
3. ✅ **Same settings** - LR 5e-5, weight decay 0.1
4. ✅ **Monitoring** - Watch for overfitting at step 100, 200, etc.

## Next Steps:

If this test run finishes without overfitting (validation loss keeps tracking training loss), the model is learning general Python patterns rather than memorizing the training examples.