# Colab Training Test - Qwen3 0.8B on Linux Commands

**Purpose:** Compare Colab Pro vs Kaggle for training ease and speed

**Dataset:** AnishJoshi/nl2bash-custom (Linux commands)

**Expected Time:** 2-3 hours on A100

In [None]:
# Install dependencies\n
!pip install -q transformers datasets accelerate peft bitsandbytes trl torch\n
\n
print("\\n✅ Installation complete")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

# Report the runtime so the Colab/Kaggle comparison records which GPU was assigned.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Query device 0 once and reuse the result for both lines.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")

In [None]:
# Configuration
# All names below are read by later cells; change values here only.
MODEL_NAME = "DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B"
OUTPUT_DIR = "./qwen3-colab-linux-test"
DATASET_NAME = "AnishJoshi/nl2bash-custom"

# Training hyperparameters
BATCH_SIZE = 4               # per-device batch size
GRADIENT_ACCUMULATION = 4    # effective batch = BATCH_SIZE * GRADIENT_ACCUMULATION
LEARNING_RATE = 2e-4
NUM_EPOCHS = 3
MAX_SEQ_LENGTH = 2048        # maximum sequence length passed to the SFT config

# LoRA config
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# CRITICAL: Same padding token
# Kept fixed so every run in the comparison pads with the same id.
# NOTE(review): presumably this id is the model's end-of-turn/EOS token; if so,
# padding-masked labels may also hide EOS from the loss - confirm.
PAD_TOKEN_ID = 151645

print("✅ Configuration set")

In [None]:
# Load tokenizer
print("Loading tokenizer from HuggingFace...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# CRITICAL: Force same padding token
# Overrides whatever pad token the checkpoint ships with, using the id from the config cell.
tokenizer.pad_token_id = PAD_TOKEN_ID
tokenizer.padding_side = "right"

print("✅ Tokenizer loaded")
print(f"Pad token ID: {tokenizer.pad_token_id}")

In [None]:
# Load model with 4-bit quantization
print("="*60)
print("Loading Model from HuggingFace")
print("="*60)

print("\nConfiguring 4-bit quantization...")
# NF4 weights with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

print(f"Loading {MODEL_NAME}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

print("Preparing model for LoRA training...")
# Must run before get_peft_model; also turns on gradient checkpointing.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

print(f"✅ Model loaded on device: {model.device}")

In [None]:
# Configure LoRA
print("\nApplying LoRA configuration...")
# Adapters are attached to the attention projections only; biases are not trained.
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
print("\n✅ LoRA applied")
model.print_trainable_parameters()

In [None]:
# Load Linux commands dataset
print("Loading Linux commands dataset...")
dataset = load_dataset(DATASET_NAME, split="train")
print(f"Dataset size: {len(dataset)} examples")
# Print one raw row so the column names can be checked before formatting.
print("\nSample entry:")
print(dataset[0])

In [None]:
# Format dataset for Linux command training
# Candidate column names, tried in order. The last entry of each tuple is
# presumably the column used by AnishJoshi/nl2bash-custom - confirm against
# the sample entry printed when the dataset is loaded.
_INSTRUCTION_KEYS = ("description", "question", "nl_command")
_COMMAND_KEYS = ("cmd", "answer", "bash_code")


def _first_present(example, keys):
    """Return the value of the first key in *keys* that exists in *example*.

    Raises:
        KeyError: if none of *keys* is a column of *example*.
    """
    for key in keys:
        if key in example:
            return example[key]
    raise KeyError(
        f"none of the expected columns {keys} found; available: {list(example)}"
    )


def format_instruction(example):
    """Build the single-string training sample for one dataset row.

    Args:
        example: mapping of column name to value for one row.

    Returns:
        ``{"text": "Instruction: <description>\\n\\nResponse: <command>"}``.

    Raises:
        KeyError: if the row has no recognised instruction or command column.
            Failing loudly avoids silently training on empty examples.
    """
    description = _first_present(example, _INSTRUCTION_KEYS)
    command = _first_present(example, _COMMAND_KEYS)

    prompt = f"Instruction: {description}\n\nResponse:"
    return {"text": f"{prompt} {command}"}

# Add a "text" column to every row, then preview the first formatted sample.
dataset = dataset.map(format_instruction)
print("\nFormatted example:")
print(dataset[0]['text'][:500])

In [None]:
# Training arguments
import dataclasses

# Newer TRL releases renamed SFTConfig.max_seq_length to max_length. The install
# cell does not pin versions, so use whichever field this TRL version defines.
_sft_field_names = {field.name for field in dataclasses.fields(SFTConfig)}
_length_field = "max_length" if "max_length" in _sft_field_names else "max_seq_length"

training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    optim="paged_adamw_8bit",
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    report_to="none",
    dataset_text_field="text",  # column produced by format_instruction
    packing=False,
    **{_length_field: MAX_SEQ_LENGTH},
)

print("✅ Training arguments configured")

In [None]:
# Create trainer and start training
import inspect

print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60)

# Newer TRL releases accept the tokenizer as `processing_class`; older ones as
# `tokenizer`. Pick the keyword this installed version actually supports.
_trainer_params = inspect.signature(SFTTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    **{_tokenizer_kwarg: tokenizer},
)

print(f"\nTraining on {len(dataset)} examples...")
# Rough estimate only: floors the per-epoch step count and assumes a single device.
print(f"Total steps: {len(dataset) // (BATCH_SIZE * GRADIENT_ACCUMULATION) * NUM_EPOCHS}")
print("\nStarting training...\n")

trainer.train()

print("\n" + "="*60)
print("✅ TRAINING COMPLETE!")
print("="*60)

In [None]:
# Save the trained LoRA adapter
import os

print("\nSaving LoRA adapter...")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Confirm the adapter weights were written and report their size in MB.
adapter_path = os.path.join(OUTPUT_DIR, "adapter_model.safetensors")
if os.path.exists(adapter_path):
    adapter_size = os.path.getsize(adapter_path) / 1e6
    print(f"✅ Adapter saved: {adapter_size:.1f} MB")
else:
    print("❌ ERROR: Adapter file not found")

# Summary

**Colab vs Kaggle Comparison:**

Record your observations:
- Setup time
- Training time
- Ease of use
- GPU type assigned
- Any errors encountered
- Overall experience