# Phase 1: Python & Advanced Python Training

**NEW APPROACH:** Skip Linux commands (model already knows them)

**CRITICAL FIXES:**
1. ✅ Uses proper Qwen3 chat template format
2. ✅ Lower learning rate (5e-5) to prevent overfitting
3. ✅ Weight decay (0.1) for regularization
4. ✅ Validation split to monitor generalization
5. ✅ Save every 100 steps for monitoring
6. ✅ Google Drive auto-backup

**Dataset:** Python code examples (system automation, scripting)

**Expected Time:** 2-3 hours on A100

In [1]:
# Mount Google Drive FIRST
from google.colab import drive
drive.mount('/content/drive')

import os
DRIVE_OUTPUT = "/content/drive/MyDrive/qwen3_phase1_python_adapters"
os.makedirs(DRIVE_OUTPUT, exist_ok=True)

print(f"✅ Google Drive mounted")
print(f"✅ Output will be saved to: {DRIVE_OUTPUT}")

Mounted at /content/drive
✅ Google Drive mounted
✅ Output will be saved to: /content/drive/MyDrive/qwen3_phase1_python_adapters


In [2]:
# Install dependencies
!pip install -q transformers datasets accelerate peft bitsandbytes trl torch

print("\n✅ Installation complete")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.6/564.6 kB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25h
✅ Installation complete


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

PyTorch version: 2.8.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-40GB
GPU Memory: 42.47 GB


In [4]:
# Configuration - PYTHON DATASET (PRODUCTION RUN)
MODEL_NAME = "DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B"
OUTPUT_DIR = "./qwen3-phase1-python"
DATASET_NAME = "iamtarun/python_code_instructions_18k_alpaca"  # Reliable Python dataset

# Training hyperparameters - PRODUCTION: 1 full epoch
BATCH_SIZE = 4
GRADIENT_ACCUMULATION = 4
LEARNING_RATE = 5e-5  # Validated from baseline
WEIGHT_DECAY = 0.1  # Validated from baseline
NUM_EPOCHS = 1  # Single full epoch
MAX_STEPS = 1050  # Full epoch: 16,750 examples / (4 batch * 4 accum) ≈ 1,047 steps
MAX_SEQ_LENGTH = 2048
MAX_DATASET_SIZE = 50000

# LoRA config
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# CRITICAL: Use proper padding token
PAD_TOKEN_ID = 151645
print(f"📊 Dataset: {DATASET_NAME}")
print("✅ Configuration set")
print(f"⚠️  LEARNING RATE: {LEARNING_RATE}")
print(f"⚠️  WEIGHT DECAY: {WEIGHT_DECAY}")
print(f"🚀 PRODUCTION RUN: 1 full epoch ({MAX_STEPS} steps, ~2 hours)")

📊 Dataset: iamtarun/python_code_instructions_18k_alpaca
✅ Configuration set
⚠️  LEARNING RATE: 5e-05
⚠️  WEIGHT DECAY: 0.1
🚀 PRODUCTION RUN: 1 full epoch (1050 steps, ~2 hours)


In [5]:
# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# CRITICAL: Force same padding token
tokenizer.pad_token_id = PAD_TOKEN_ID
tokenizer.padding_side = "right"

print(f"✅ Tokenizer loaded")
print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"EOS token ID: {tokenizer.eos_token_id}")

if tokenizer.pad_token_id != 151645:
    raise ValueError(f"❌ Wrong pad token! Got {tokenizer.pad_token_id}, expected 151645")

Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

✅ Tokenizer loaded
Pad token ID: 151645
EOS token ID: 151645


In [6]:
# Load model with 4-bit quantization
print("="*60)
print("Loading Model")
print("="*60)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

print(f"Loading {MODEL_NAME}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

print("Preparing model for LoRA training...")
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

print(f"✅ Model loaded on device: {model.device}")

Loading Model
Loading DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B...


config.json:   0%|          | 0.00/822 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Preparing model for LoRA training...
✅ Model loaded on device: cuda:0


In [7]:
# Configure LoRA
print("\nApplying LoRA configuration...")
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
print("\n✅ LoRA applied")
model.print_trainable_parameters()


Applying LoRA configuration...

✅ LoRA applied
trainable params: 6,881,280 || all params: 823,164,416 || trainable%: 0.8360


In [8]:
# Load Python dataset and split into train/validation
print("Loading Python code dataset...")
dataset = load_dataset(DATASET_NAME, split="train")
print(f"Total dataset size: {len(dataset)} examples")

# Create train/validation split (90/10)
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
val_dataset = dataset['test']

print(f"Train set: {len(train_dataset)} examples")
print(f"Validation set: {len(val_dataset)} examples")
print(f"\nSample entry:")
print(train_dataset[0])

Loading Python code dataset...


README.md:   0%|          | 0.00/905 [00:00<?, ?B/s]

data/train-00000-of-00001-8b6e212f3e1ece(…):   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18612 [00:00<?, ? examples/s]

Total dataset size: 18612 examples
Train set: 16750 examples
Validation set: 1862 examples

Sample entry:
{'instruction': 'Design a python code to convert a given sentence to camelCase', 'input': '', 'output': 'def toCamelCase(s):\n    s = s.split(\' \')\n    return \'\'.join(x.title() for x in s)\n\ns = "this is some random text"\nprint(toCamelCase(s))', 'prompt': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nDesign a python code to convert a given sentence to camelCase\n\n### Input:\n\n\n### Output:\ndef toCamelCase(s):\n    s = s.split(\' \')\n    return \'\'.join(x.title() for x in s)\n\ns = "this is some random text"\nprint(toCamelCase(s))'}


In [9]:
# Format dataset using PROPER Qwen3 chat template
def format_chat_template(example):
    """Format using Qwen3 chat template for Python code"""
    instruction = example.get('instruction', '')
    input_text = example.get('input', '')
    output = example.get('output', '')
    # Combine instruction and input if input exists (Alpaca format)
    if input_text:
        user_content = f"{instruction}\n\nInput: {input_text}"
    else:
        user_content = instruction

    # Create messages in chat format
    messages = [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": output}
    ]

    # Apply chat template WITHOUT adding generation prompt (we have the full conversation)
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )

    return {"text": text}

print("Formatting datasets with Qwen3 chat template...")
train_dataset = train_dataset.map(format_chat_template)
val_dataset = val_dataset.map(format_chat_template)

print("\n✅ Formatted with chat template")
print("\nFormatted example:")
print(train_dataset[0]['text'][:500])

Formatting datasets with Qwen3 chat template...


Map:   0%|          | 0/16750 [00:00<?, ? examples/s]

Map:   0%|          | 0/1862 [00:00<?, ? examples/s]


✅ Formatted with chat template

Formatted example:
<|im_start|>user
Design a python code to convert a given sentence to camelCase<|im_end|>
<|im_start|>assistant
<think>

</think>

def toCamelCase(s):
    s = s.split(' ')
    return ''.join(x.title() for x in s)

s = "this is some random text"
print(toCamelCase(s))<|im_end|>



In [10]:
# Training arguments - PRODUCTION RUN
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    max_steps=MAX_STEPS,  # Full epoch: 1050 steps
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,

    # CRITICAL FIXES:
    learning_rate=LEARNING_RATE,  # 5e-5
    weight_decay=WEIGHT_DECAY,    # 0.1 for regularization

    # Evaluation settings
    eval_strategy="steps",
    eval_steps=100,
    per_device_eval_batch_size=BATCH_SIZE,

    # Logging
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,

    # Training settings
    fp16=True,
    optim="paged_adamw_8bit",
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    max_grad_norm=1.0,  # Gradient clipping

    report_to="none",

    # CRITICAL: Load best model at end
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

print("✅ Training arguments configured")
print(f"⚠️  Learning rate: {LEARNING_RATE}")
print(f"⚠️  Weight decay: {WEIGHT_DECAY}")
print(f"⚠️  Will evaluate and save every 100 steps")

✅ Training arguments configured
⚠️  Learning rate: 5e-05
⚠️  Weight decay: 0.1
⚠️  Will evaluate and save every 100 steps


In [11]:
# Create trainer with validation set
print("\n" + "="*60)
print("CREATING TRAINER WITH VALIDATION")
print("="*60)

# WORKAROUND: Remove all non-text columns to force SFTTrainer to use 'text' field only
train_dataset_clean = train_dataset.remove_columns([col for col in train_dataset.column_names if col != 'text'])
val_dataset_clean = val_dataset.remove_columns([col for col in val_dataset.column_names if col != 'text'])

print(f"Cleaned dataset columns: {train_dataset_clean.column_names}")

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_clean,
    eval_dataset=val_dataset_clean,
)

print(f"\nTraining on {len(train_dataset)} examples...")
print(f"Validating on {len(val_dataset)} examples...")
print(f"Total steps: {len(train_dataset) // (BATCH_SIZE * GRADIENT_ACCUMULATION) * NUM_EPOCHS}")
print("\n⚠️  MONITOR: Watch for loss = 0.0 before step 500 (indicates overfitting)")
print("⚠️  HEALTHY: Loss should drop gradually and plateau around 0.3-0.5")


CREATING TRAINER WITH VALIDATION
Cleaned dataset columns: ['text']


Adding EOS to train dataset:   0%|          | 0/16750 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16750 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/16750 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/1862 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1862 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1862 [00:00<?, ? examples/s]


Training on 16750 examples...
Validating on 1862 examples...
Total steps: 1046

⚠️  MONITOR: Watch for loss = 0.0 before step 500 (indicates overfitting)
⚠️  HEALTHY: Loss should drop gradually and plateau around 0.3-0.5


In [12]:
# Start training
print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60)
print("\nStarting training...\n")

trainer.train()

print("\n" + "="*60)
print("✅ TRAINING COMPLETE!")
print("="*60)

# Show final metrics
final_train_loss = trainer.state.log_history[-1].get('loss', 'N/A')
final_eval_loss = trainer.state.log_history[-1].get('eval_loss', 'N/A')
print(f"\nFinal train loss: {final_train_loss}")
print(f"Final eval loss: {final_eval_loss}")

if isinstance(final_train_loss, float) and final_train_loss < 0.1:
    print("\n⚠️  WARNING: Train loss is very low - model may have overfit!")
    print("⚠️  Check validation loss to confirm generalization")


STARTING TRAINING

Starting training...



Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
100,1.0734,1.027691,1.026345,249952.0,0.752882
200,0.9463,0.908959,0.879523,489311.0,0.769643
300,0.9216,0.881941,0.896996,732628.0,0.774677
400,0.9908,0.868258,0.877337,978063.0,0.777176
500,0.9345,0.858331,0.862759,1236058.0,0.779199
600,0.9353,0.851778,0.866607,1487829.0,0.780893
700,0.9525,0.846811,0.856552,1743007.0,0.781725
800,0.8587,0.843309,0.839362,1985707.0,0.78259
900,0.7965,0.841982,0.836677,2234230.0,0.782816
1000,0.8624,0.841566,0.83916,2477213.0,0.782957



✅ TRAINING COMPLETE!

Final train loss: N/A
Final eval loss: N/A


In [13]:
# Save the trained LoRA adapter to LOCAL storage first
print("\nSaving LoRA adapter to local storage...")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

import os
adapter_path = f"{OUTPUT_DIR}/adapter_model.safetensors"
if os.path.exists(adapter_path):
    adapter_size = os.path.getsize(adapter_path) / 1e6
    print(f"✅ Adapter saved locally: {adapter_size:.1f} MB")
else:
    print(f"❌ ERROR: Adapter file not found at {adapter_path}")
    raise FileNotFoundError("Adapter not saved!")


Saving LoRA adapter to local storage...
✅ Adapter saved locally: 27.6 MB


In [14]:
# CRITICAL: Copy to Google Drive for permanent storage
import shutil

print("\n" + "="*60)
print("COPYING TO GOOGLE DRIVE (PERMANENT BACKUP)")
print("="*60)

# Copy entire adapter directory to Drive
drive_backup = DRIVE_OUTPUT
if os.path.exists(drive_backup):
    shutil.rmtree(drive_backup)

shutil.copytree(OUTPUT_DIR, drive_backup)

# Verify the copy
drive_adapter_path = f"{drive_backup}/adapter_model.safetensors"
if os.path.exists(drive_adapter_path):
    drive_size = os.path.getsize(drive_adapter_path) / 1e6
    print(f"\n✅ BACKUP COMPLETE!")
    print(f"✅ Adapter backed up to Google Drive: {drive_size:.1f} MB")
    print(f"✅ Location: {drive_backup}")
    print(f"\n🎉 You can now find the adapters in Google Drive under:")
    print(f"   MyDrive/qwen3_phase1_python_adapters/")
else:
    print(f"❌ ERROR: Backup failed!")
    raise FileNotFoundError("Drive backup failed!")


COPYING TO GOOGLE DRIVE (PERMANENT BACKUP)

✅ BACKUP COMPLETE!
✅ Adapter backed up to Google Drive: 27.6 MB
✅ Location: /content/drive/MyDrive/qwen3_phase1_python_adapters

🎉 You can now find the adapters in Google Drive under:
   MyDrive/qwen3_phase1_python_adapters/


# ✅ Training Complete!

Your LoRA adapters are now safely stored in:
- **Google Drive:** `MyDrive/qwen3_phase1_python_adapters/`
- **Local sync:** Should appear in `~/GoogleDrive/qwen3_phase1_python_adapters/`

## This Phase:

1. ✅ **Python code training** - System automation, scripting
2. ✅ **Proper chat template** - Qwen3 format
3. ✅ **Same settings** - LR 5e-5, weight decay 0.1
4. ✅ **Monitoring** - Watch for overfitting at step 100, 200, etc.

## Next Steps:

If this works without overfitting, the model should learn Python patterns without memorizing.