# Phase 1: Linux Commands Training - WITH GOOGLE DRIVE BACKUP

**IMPORTANT:** This version automatically saves the trained LoRA adapters to Google Drive.

**Dataset:** AnishJoshi/nl2bash-custom (24.6K Linux commands)

**Expected Time:** 2-3 hours on A100

In [None]:
# Mount Google Drive FIRST: the backup directory created below lives on it.
import os

from google.colab import drive

drive.mount('/content/drive')

# Folder on Drive that will receive the trained adapter files.
DRIVE_OUTPUT = "/content/drive/MyDrive/qwen3_phase1_linux_adapters"
os.makedirs(DRIVE_OUTPUT, exist_ok=True)

print("✅ Google Drive mounted")
print(f"✅ Output will be saved to: {DRIVE_OUTPUT}")

In [None]:
# Install dependencies
# The leading "!" makes the notebook run this line as a shell command; -q
# asks pip for quieter output.
# NOTE(review): no versions are pinned, so the installed releases can differ
# from run to run — consider pinning once a working combination is known.
!pip install -q transformers datasets accelerate peft bitsandbytes trl torch

# Printed unconditionally: this does not check whether pip succeeded.
print("\n✅ Installation complete")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

# Report the runtime: PyTorch version, CUDA availability and, when a GPU is
# present, the name and total memory of device 0.
cuda_ready = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    gpu_props = torch.cuda.get_device_properties(0)
    # total_memory is divided by 1e9 so the figure is shown in GB.
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")

In [None]:
# Configuration
# Hugging Face identifiers for the training data and the base model, plus the
# local directory that receives checkpoints and the final adapter.
DATASET_NAME = "AnishJoshi/nl2bash-custom"
MODEL_NAME = "DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B"
OUTPUT_DIR = "./qwen3-colab-linux-phase1"

# Training hyperparameters
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
BATCH_SIZE = 4
GRADIENT_ACCUMULATION = 4
MAX_SEQ_LENGTH = 2048

# LoRA config (rank, scaling factor, dropout) consumed by the LoRA cell.
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# CRITICAL: padding token id forced onto the tokenizer; kept identical to the
# value used in the Kaggle run.
PAD_TOKEN_ID = 151645

print("✅ Configuration set")

In [None]:
# Load tokenizer
# The tokenizer is fetched for the same identifier as the base model.
print("Loading tokenizer from HuggingFace...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

# CRITICAL: pad on the right and force the pad token to PAD_TOKEN_ID so the
# padding setup matches the configuration cell.
tokenizer.padding_side = "right"
tokenizer.pad_token_id = PAD_TOKEN_ID

print("✅ Tokenizer loaded")
print(f"Pad token ID: {tokenizer.pad_token_id}")

In [None]:
# Load model with 4-bit quantization
banner = "=" * 60
print(banner)
print("Loading Model from HuggingFace")
print(banner)

print("\nConfiguring 4-bit quantization...")
# NF4 weights with double quantization; computation runs in float16.
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_use_double_quant": True,
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

print(f"Loading {MODEL_NAME}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

# Ready the quantized model for adapter training, with gradient
# checkpointing switched on.
print("Preparing model for LoRA training...")
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
)

print(f"✅ Model loaded on device: {model.device}")

In [None]:
# Configure LoRA
print("\nApplying LoRA configuration...")

# Adapters are attached only to these four projection modules.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

# Wrap the prepared model; `model` now refers to the PEFT-wrapped model.
model = get_peft_model(model, lora_config)
print("\n✅ LoRA applied")
model.print_trainable_parameters()

In [None]:
# Load Linux commands dataset
print("Loading Linux commands dataset...")
dataset = load_dataset(DATASET_NAME, split="train")

example_count = len(dataset)
print(f"Dataset size: {example_count} examples")

# Show the first raw row so the column names can be checked by eye.
print("\nSample entry:")
first_row = dataset[0]
print(first_row)

In [None]:
# Format dataset for Linux command training
# Column names tried, in order, for the natural-language description and for
# the shell command.
# NOTE(review): "nl_command" / "bash_code" appear to be the column names of
# AnishJoshi/nl2bash-custom — confirm against the "Sample entry" printed by the
# dataset loading cell.
_DESCRIPTION_KEYS = ("description", "question", "nl_command")
_COMMAND_KEYS = ("cmd", "answer", "bash_code")


def _first_present(example, keys):
    """Return the first non-empty value stored under any of *keys*, else None."""
    for key in keys:
        value = example.get(key)
        if value:
            return value
    return None


def format_instruction(example):
    """Build the single training string for one dataset row.

    Args:
        example: Mapping for one row. The description is read from the first
            non-empty of ``description``, ``question`` or ``nl_command``; the
            command from the first non-empty of ``cmd``, ``answer`` or
            ``bash_code``.

    Returns:
        ``{"text": "Instruction: <description>\\n\\nResponse: <command>"}``.

    Raises:
        ValueError: If the row has no usable description or command. Failing
            here stops the run before it trains on empty samples.
    """
    description = _first_present(example, _DESCRIPTION_KEYS)
    command = _first_present(example, _COMMAND_KEYS)

    if description is None or command is None:
        raise ValueError(
            "Row has no usable description/command field; "
            f"available keys: {list(example)}"
        )

    prompt = f"Instruction: {description}\n\nResponse:"
    return {"text": f"{prompt} {command}"}

# Add the "text" column produced by format_instruction to every row.
dataset = dataset.map(format_instruction)

# Preview at most the first 500 characters of the first formatted sample.
print("\nFormatted example:")
formatted_preview = dataset[0]['text'][:500]
print(formatted_preview)

In [None]:
# Training arguments
# Settings are gathered in a dict, grouped by concern, and then unpacked into
# SFTConfig.
# NOTE(review): MAX_SEQ_LENGTH from the configuration cell is not passed here,
# so the library default sequence length applies — confirm this is intended.
sft_settings = {
    # Where checkpoints go and how often they are written / logged.
    "output_dir": OUTPUT_DIR,
    "save_strategy": "epoch",
    "logging_steps": 10,
    "report_to": "none",
    # Schedule and batch shape.
    "num_train_epochs": NUM_EPOCHS,
    "per_device_train_batch_size": BATCH_SIZE,
    "gradient_accumulation_steps": GRADIENT_ACCUMULATION,
    # Optimiser and learning-rate schedule.
    "learning_rate": LEARNING_RATE,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    "optim": "paged_adamw_8bit",
    # Mixed precision.
    "fp16": True,
    # Dataset handling: train on the "text" column, one sample per sequence.
    "dataset_text_field": "text",
    "packing": False,
}
training_args = SFTConfig(**sft_settings)

print("✅ Training arguments configured")

In [None]:
# Create trainer and start training
print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60)

# Pass the tokenizer configured earlier in the notebook explicitly. When no
# tokenizer is given, SFTTrainer loads its own from the model name, so the pad
# token id and padding side set on `tokenizer` would not be used in training.
# NOTE(review): `processing_class` is the argument name in current TRL
# releases; older releases called it `tokenizer` — adjust if TRL is pinned to
# an old version.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)

print(f"\nTraining on {len(dataset)} examples...")
# Rough estimate only: floor(examples / effective batch) * epochs. The step
# count the trainer reports in its progress output may differ slightly.
print(f"Total steps: {len(dataset) // (BATCH_SIZE * GRADIENT_ACCUMULATION) * NUM_EPOCHS}")
print("\nStarting training...\n")

trainer.train()

print("\n" + "="*60)
print("✅ TRAINING COMPLETE!")
print("="*60)

In [None]:
# Save the trained LoRA adapter to LOCAL storage first
import os

print("\nSaving LoRA adapter to local storage...")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Stop right away if the adapter weights file did not land on disk.
adapter_path = f"{OUTPUT_DIR}/adapter_model.safetensors"
if not os.path.exists(adapter_path):
    print(f"❌ ERROR: Adapter file not found at {adapter_path}")
    raise FileNotFoundError("Adapter not saved!")

# Size is reported in MB (bytes / 1e6).
adapter_size = os.path.getsize(adapter_path) / 1e6
print(f"✅ Adapter saved locally: {adapter_size:.1f} MB")

In [None]:
# CRITICAL: Copy to Google Drive for permanent storage
import os
import shutil

print("\n" + "="*60)
print("COPYING TO GOOGLE DRIVE (PERMANENT BACKUP)")
print("="*60)

drive_backup = DRIVE_OUTPUT
adapter_filename = "adapter_model.safetensors"
local_adapter_path = os.path.join(OUTPUT_DIR, adapter_filename)

# Copy into a staging folder next to the final location first. The previous
# backup is removed only after the new copy has been verified, so a copy that
# fails part-way (quota, disconnect) never leaves Drive without a backup.
staging_dir = f"{drive_backup}.incoming"
if os.path.exists(staging_dir):
    # Leftover from an earlier interrupted run.
    shutil.rmtree(staging_dir)

# Copy entire adapter directory to Drive
shutil.copytree(OUTPUT_DIR, staging_dir)

# Verify the copy: the adapter file must exist and match the local file size.
staged_adapter_path = os.path.join(staging_dir, adapter_filename)
staged_ok = (
    os.path.exists(staged_adapter_path)
    and os.path.getsize(staged_adapter_path) == os.path.getsize(local_adapter_path)
)
if not staged_ok:
    shutil.rmtree(staging_dir, ignore_errors=True)
    print(f"❌ ERROR: Backup failed!")
    raise FileNotFoundError("Drive backup failed!")

# Swap the verified copy into place. If this is interrupted, the staged copy
# is still on Drive under the ".incoming" name.
if os.path.exists(drive_backup):
    shutil.rmtree(drive_backup)
os.rename(staging_dir, drive_backup)

drive_adapter_path = os.path.join(drive_backup, adapter_filename)
drive_size = os.path.getsize(drive_adapter_path) / 1e6
print(f"\n✅ BACKUP COMPLETE!")
print(f"✅ Adapter backed up to Google Drive: {drive_size:.1f} MB")
print(f"✅ Location: {drive_backup}")
print(f"\n🎉 You can now find the adapters in Google Drive under:")
print(f"   MyDrive/qwen3_phase1_linux_adapters/")

# ✅ Training Complete!

Your LoRA adapters are now safely stored in:
- **Google Drive:** `MyDrive/qwen3_phase1_linux_adapters/`
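- **Local sync:** If Google Drive for desktop is set up on your machine, the folder should also appear in your synced Drive folder (for example `~/GoogleDrive/qwen3_phase1_linux_adapters/`; the exact path depends on your setup)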

Files saved:
- `adapter_model.safetensors` (~42MB)
- `adapter_config.json`
- Tokenizer files