# Phase 2: Linux Command Mastery

Continue training the Phase 1 model on Linux command and bash datasets.

**Input:**
- Base model: `/kaggle/input/qwen3-08b-coder-reasoning` (1.6GB - already uploaded)
- Phase 1 LoRA: `/kaggle/input/qwen3-phase1-lora-adapter` (42MB - just uploaded!)

**Workflow:**
1. Load base model + Phase 1 LoRA adapter
2. Merge them on Kaggle (saves bandwidth!)
3. Train Phase 2 LoRA on Linux datasets
4. Save the Phase 2 LoRA adapter and a merged model for Phase 3

**Datasets:**
- Kaggle: complex-linux-commands-from-natural-language (1M examples)
- Kaggle: linux-terminal-commands-dataset
- HuggingFace: aelhalili/bash-commands-dataset
- HuggingFace: m-a-p/CodeFeedback-Filtered-Instruction (bash/shell filtered)

**Expected Time:** 3-4 hours on T4 GPU

In [None]:
# Install dependencies
# Covers the libraries imported by the cells below (transformers, datasets, peft, trl, pandas)
# plus accelerate and bitsandbytes; -q keeps pip's output quiet.
# NOTE(review): versions are unpinned, so a re-run may pick up newer releases — consider pinning.
!pip install -q transformers datasets accelerate peft bitsandbytes trl pandas

In [None]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset, concatenate_datasets, Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

# Report the runtime environment so a missing GPU is obvious before any model is loaded.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Only query the device name when a CUDA device actually exists.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Configuration
# Paths: the two inputs are read when merging Phase 1; the two /kaggle/working
# paths are written by this notebook.
BASE_MODEL_PATH = "/kaggle/input/qwen3-08b-coder-reasoning"  # Original base model
PHASE1_LORA_PATH = "/kaggle/input/qwen3-phase1-lora-adapter"  # Phase 1 LoRA adapter
MERGED_TEMP_PATH = "/kaggle/working/phase1_merged_temp"  # Base + Phase 1 merged weights, reloaded in 4-bit for training
OUTPUT_DIR = "/kaggle/working/qwen3-08b-phase2-linux"  # Checkpoints, final adapter ("/final") and merged model ("/merged")

# Training hyperparameters (OPTIMIZED FOR MEMORY)
BATCH_SIZE = 2  # Per-device batch size; reduced from 4 to save memory
GRADIENT_ACCUMULATION = 8  # 2 x 8 keeps the effective batch size at 16
LEARNING_RATE = 2e-4  # Passed to SFTConfig as learning_rate
NUM_EPOCHS = 3  # Passed to SFTConfig as num_train_epochs
MAX_SEQ_LENGTH = 2048  # Passed to SFTConfig as max_length

In [None]:
# Merge the Phase 1 LoRA adapter into the base model and persist the result.
# The merged checkpoint is written to MERGED_TEMP_PATH so it can be reloaded
# later with 4-bit quantization.
print("="*60)
print("STEP 1: MERGING PHASE 1 LORA WITH BASE MODEL")
print("="*60)

import gc

from peft import PeftModel

# The base model is loaded unquantized in float16 (half precision) so the
# adapter weights can be folded into it.
print("\nLoading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

print("Loading Phase 1 LoRA adapter...")
phase1_model = PeftModel.from_pretrained(base_model, PHASE1_LORA_PATH)

print("Merging Phase 1 LoRA into base model...")
merged_phase1_model = phase1_model.merge_and_unload()

print("✓ Phase 1 knowledge merged into model")
print("✓ Model now includes: Base + CodeAlpaca training")

# Save merged model to disk (needed for quantized reload)
print(f"\nSaving merged model to {MERGED_TEMP_PATH}...")
merged_phase1_model.save_pretrained(MERGED_TEMP_PATH)

print("\n" + "="*60)
print("✓ Merged model saved")

# Clean up to free memory. Drop the Python references first, then collect
# garbage so the CUDA allocator has something to release, and only then
# report that memory was cleared.
print("\nCleaning up original models from memory...")
del base_model, phase1_model, merged_phase1_model
gc.collect()
torch.cuda.empty_cache()
print("✓ Memory cleared")

## Step 1: Merge Phase 1 LoRA with Base Model

Before training Phase 2, we merge the Phase 1 LoRA adapter into the base model, then save it. This merged model contains all the CodeAlpaca knowledge from Phase 1.

**Critical Fix:** The merged model is saved to disk and then reloaded with 4-bit quantization. This prevents device placement issues and enables proper LoRA training.

In [None]:
# Load the tokenizer and pin its padding setup to the Phase 1 configuration.
print("="*60)
print("STEP 2: LOADING TOKENIZER")
print("="*60)
print("\nLoading tokenizer from base model...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)

# Phase 1 used pad_token_id = 151645 (its eos_token).
PHASE1_PAD_TOKEN_ID = 151645

# Remember what the tokenizer reported before anything is overridden; comparing
# the forced pad id against itself would always succeed and verify nothing.
loaded_eos_token_id = tokenizer.eos_token_id

# CRITICAL: Force padding token to match Phase 1 exactly
print(f"⚠️  Forcing pad_token to eos_token (ID: {PHASE1_PAD_TOKEN_ID}) to match Phase 1")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = PHASE1_PAD_TOKEN_ID
tokenizer.padding_side = "right"

print(f"\n🔍 VERIFICATION:")
print(f"   pad_token_id: {tokenizer.pad_token_id}")
print(f"   eos_token_id: {tokenizer.eos_token_id}")

if tokenizer.pad_token_id != PHASE1_PAD_TOKEN_ID:
    # The override did not stick.
    print(f"   ⚠️  WARNING: Different from Phase 1 (expected {PHASE1_PAD_TOKEN_ID}, got {tokenizer.pad_token_id})")
    print(f"   This may cause training inconsistencies!")
elif loaded_eos_token_id != PHASE1_PAD_TOKEN_ID:
    # The pad id was forced, but this tokenizer's own EOS id is different,
    # so "pad == eos" no longer holds the way it did in Phase 1.
    print(f"   ⚠️  WARNING: tokenizer eos_token_id is {loaded_eos_token_id}, but Phase 1 padded with {PHASE1_PAD_TOKEN_ID}")
    print(f"   This may cause training inconsistencies!")
else:
    print(f"   ✅ CORRECT: Matches Phase 1 configuration (ID: {PHASE1_PAD_TOKEN_ID})")

print("="*60)

In [None]:
# Bring the merged Phase 1 checkpoint back as a 4-bit model that LoRA can train on
print("\n" + "="*60)
print("STEP 3: RELOADING MERGED MODEL WITH 4-BIT QUANTIZATION")
print("="*60)

from transformers import BitsAndBytesConfig

print("\nConfiguring 4-bit quantization...")
# NF4 quantization with double quantization; compute dtype is float16
quant_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_use_double_quant": True,
}
bnb_config = BitsAndBytesConfig(**quant_settings)

print(f"Loading merged model from {MERGED_TEMP_PATH} with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    MERGED_TEMP_PATH,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

print("Preparing quantized model for LoRA training...")
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
)

print("="*60)
print("✓ Model ready for Phase 2 LoRA training")
print("✓ Gradient checkpointing enabled (saves memory)")
print(f"✓ Model loaded and quantized on device: {model.device}")

In [None]:
# Attach a fresh Phase 2 LoRA adapter to the quantized model
print("\n" + "="*60)
print("STEP 4: CONFIGURING PHASE 2 LORA")
print("="*60)
print("\nSetting up LoRA for Linux command training...")

# Adapters are attached only to the q/k/v/o projection modules
projection_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,  # LoRA rank
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=projection_modules,
)

# Wrap the model and report how many parameters will actually be trained
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("="*60)

## Step 5: Load and Prepare Datasets

We'll load multiple datasets and combine them:
1. **Kaggle datasets** - Add as data sources in notebook settings
2. **HuggingFace datasets** - Load directly (requires internet enabled)

In [None]:
# Load Kaggle datasets (add these as data sources in notebook settings)
print("Loading Kaggle datasets...")


def _read_json_or_none(path, ok_label, fail_label, **read_kwargs):
    """Read a JSON file into a DataFrame; log and return None on any failure.

    Loading is best-effort: a missing data source must not stop the notebook,
    because the remaining datasets can still be used.
    """
    try:
        frame = pd.read_json(path, **read_kwargs)
        print(f"✓ {ok_label}: {len(frame)} examples")
    except Exception as e:
        print(f"! Could not load {fail_label}: {e}")
        return None
    return frame


# Dataset 1: Complex Linux commands from natural language
df_complex = _read_json_or_none(
    '/kaggle/input/complex-linux-commands-from-natual-language/complex_linux_commands_million.json',
    "Complex Linux commands",
    "complex commands",
)

# Dataset 2: Linux terminal commands (JSON Lines file)
df_terminal = _read_json_or_none(
    '/kaggle/input/linux-terminal-commands-dataset/LINUX_TERMINAL_COMMANDS.jsonl',
    "Terminal commands",
    "terminal commands",
    lines=True,
)

In [None]:
# Load HuggingFace datasets (requires internet enabled in notebook settings)
import re

print("\nLoading HuggingFace datasets...")

# Dataset 3: Bash commands dataset
# Loading is best-effort: on failure the dataset is skipped (set to None).
try:
    hf_bash = load_dataset("aelhalili/bash-commands-dataset", split="train")
    print(f"✓ Bash commands: {len(hf_bash)} examples")
except Exception as e:
    print(f"! Could not load bash commands: {e}")
    hf_bash = None

# Keywords are matched as whole words (optionally plural), case-insensitively.
# Plain substring matching let 'script' match "JavaScript" and "description",
# which pulled unrelated examples into the bash/shell subset.
_SHELL_KEYWORD_RE = re.compile(
    r"\b(?:bash|shell|linux|command|terminal|script)s?\b",
    re.IGNORECASE,
)


def is_bash_shell(example):
    """Return True if the example's query or answer mentions a shell keyword.

    Args:
        example: Mapping with optional 'query' and 'answer' entries; a missing
            or None entry is treated as an empty string.

    Returns:
        bool: True when either field contains one of the keywords as a whole word.
    """
    query = example.get('query') or ''
    answer = example.get('answer') or ''
    return bool(_SHELL_KEYWORD_RE.search(query) or _SHELL_KEYWORD_RE.search(answer))


# Dataset 4: CodeFeedback filtered for bash/shell
try:
    print("Loading CodeFeedback (filtering for bash/shell)...")
    codefeedback = load_dataset("m-a-p/CodeFeedback-Filtered-Instruction", split="train")

    # Filter for bash/shell examples
    hf_codefeedback = codefeedback.filter(is_bash_shell)
    print(f"✓ CodeFeedback bash/shell: {len(hf_codefeedback)} examples (filtered from {len(codefeedback)})")
except Exception as e:
    print(f"! Could not load CodeFeedback: {e}")
    hf_codefeedback = None

In [None]:
# Normalize and combine all datasets
print("\nNormalizing datasets to common format...")

def normalize_kaggle_complex(df):
    """Convert the complex-Linux-commands DataFrame into a one-column text dataset.

    Rows that lack a usable instruction or command are skipped rather than
    turned into empty "Instruction: / Response:" examples.

    Args:
        df: DataFrame loaded from the Kaggle JSON file, or None if loading failed.

    Returns:
        A Dataset with a single "text" column, or None when df is None or no
        row yields both an instruction and a command.
    """
    if df is None:
        return None

    # Sample if too large (we want ~30-40K total)
    if len(df) > 40000:
        df = df.sample(n=40000, random_state=42)

    def first_usable(row, names):
        """Return the first non-missing, non-blank value among the named fields."""
        for name in names:
            value = row.get(name)
            # Scalar check first: pd.isna on a list-like value returns an array.
            if pd.api.types.is_scalar(value) and pd.isna(value):
                continue
            if str(value).strip():
                return value
        return None

    # NOTE(review): the field names are guesses at the dataset schema — confirm
    # them against the actual file.
    texts = []
    for _, row in df.iterrows():
        instruction = first_usable(row, ('instruction', 'query', 'nl'))
        command = first_usable(row, ('command', 'cmd', 'bash'))
        if instruction is None or command is None:
            continue
        texts.append(f"Instruction: {instruction}\n\nResponse: {command}")

    skipped = len(df) - len(texts)
    if skipped:
        print(f"! Skipped {skipped} of {len(df)} complex-command rows with no usable instruction/command")

    # The caller only checks truthiness, so None is a safe "nothing usable" result.
    if not texts:
        return None
    return Dataset.from_dict({"text": texts})

def normalize_kaggle_terminal(df):
    """Convert the terminal-commands DataFrame into a one-column text dataset.

    Rows that lack a usable instruction or command are skipped rather than
    turned into empty "Instruction: / Response:" examples.

    Args:
        df: DataFrame loaded from the Kaggle JSONL file, or None if loading failed.

    Returns:
        A Dataset with a single "text" column, or None when df is None or no
        row yields both an instruction and a command.
    """
    if df is None:
        return None

    def first_usable(row, names):
        """Return the first non-missing, non-blank value among the named fields."""
        for name in names:
            value = row.get(name)
            # Scalar check first: pd.isna on a list-like value returns an array.
            if pd.api.types.is_scalar(value) and pd.isna(value):
                continue
            if str(value).strip():
                return value
        return None

    # NOTE(review): the field names are guesses at the dataset schema — confirm
    # them against the actual file.
    texts = []
    for _, row in df.iterrows():
        instruction = first_usable(row, ('instruction', 'query', 'description'))
        command = first_usable(row, ('command', 'cmd'))
        if instruction is None or command is None:
            continue
        texts.append(f"Instruction: {instruction}\n\nResponse: {command}")

    skipped = len(df) - len(texts)
    if skipped:
        print(f"! Skipped {skipped} of {len(df)} terminal-command rows with no usable instruction/command")

    # The caller only checks truthiness, so None is a safe "nothing usable" result.
    if not texts:
        return None
    return Dataset.from_dict({"text": texts})

def normalize_hf_bash(dataset):
    """Convert the HuggingFace bash-commands dataset into a one-column text dataset.

    Examples without a usable instruction or command are filtered out first,
    so no empty "Instruction: / Response:" examples reach training.

    Args:
        dataset: HuggingFace dataset, or None if loading failed.

    Returns:
        The dataset mapped to a single "text" column (possibly empty), or None
        when dataset is None.
    """
    if dataset is None:
        return None

    # NOTE(review): 'prompt' / 'response' are added as extra fallbacks on the
    # assumption that this dataset uses those column names — confirm against
    # the dataset card.
    instruction_keys = ('instruction', 'input', 'question', 'prompt')
    command_keys = ('output', 'command', 'answer', 'response')

    def first_usable(example, names):
        """Return the first non-None, non-blank value among the named fields."""
        for name in names:
            value = example.get(name)
            if value is not None and str(value).strip():
                return value
        return None

    def has_both_fields(example):
        return (first_usable(example, instruction_keys) is not None
                and first_usable(example, command_keys) is not None)

    def format_bash(example):
        instruction = first_usable(example, instruction_keys)
        command = first_usable(example, command_keys)
        return {"text": f"Instruction: {instruction}\n\nResponse: {command}"}

    usable = dataset.filter(has_both_fields)
    skipped = len(dataset) - len(usable)
    if skipped:
        print(f"! Skipped {skipped} of {len(dataset)} bash examples with no usable instruction/command")

    return usable.map(format_bash, remove_columns=usable.column_names)

def normalize_hf_codefeedback(dataset):
    """Rewrite CodeFeedback examples into the shared single-"text" format.

    Returns None when no dataset was loaded; otherwise every original column
    is dropped and replaced by one "text" column built from 'query' and 'answer'.
    """
    if dataset is None:
        return None

    def to_text(record):
        # Missing fields fall back to an empty string.
        return {"text": f"Instruction: {record.get('query', '')}\n\nResponse: {record.get('answer', '')}"}

    original_columns = dataset.column_names
    return dataset.map(to_text, remove_columns=original_columns)

# Run each source through its normalizer and keep the non-empty results.
# Every normalizer returns None for a source that failed to load, so a single
# truthiness check covers both "not loaded" and "nothing produced".
datasets_normalized = []

ds1 = normalize_kaggle_complex(df_complex)
if ds1:
    datasets_normalized.append(ds1)
    print(f"✓ Normalized complex commands: {len(ds1)}")

ds2 = normalize_kaggle_terminal(df_terminal)
if ds2:
    datasets_normalized.append(ds2)
    print(f"✓ Normalized terminal commands: {len(ds2)}")

ds3 = normalize_hf_bash(hf_bash)
if ds3:
    datasets_normalized.append(ds3)
    print(f"✓ Normalized bash commands: {len(ds3)}")

ds4 = normalize_hf_codefeedback(hf_codefeedback)
if ds4:
    datasets_normalized.append(ds4)
    print(f"✓ Normalized CodeFeedback: {len(ds4)}")

In [None]:
# Merge the per-source datasets into one shuffled training set
if not datasets_normalized:
    raise ValueError("No datasets loaded successfully! Check data sources and internet connection.")

print(f"\nCombining {len(datasets_normalized)} datasets...")
dataset = concatenate_datasets(datasets_normalized)
print(f"✓ Combined dataset size: {len(dataset)}")

# Cap the number of training examples (Phase 1 used 20K; Phase 2 uses 25K)
MAX_DATASET_SIZE = 25000
over_cap = len(dataset) > MAX_DATASET_SIZE
if over_cap:
    print(f"⚠️  Dataset too large ({len(dataset)} examples)")
    print(f"⚠️  Sampling {MAX_DATASET_SIZE} examples to fit in GPU memory...")

# Both paths shuffle with the same seed; only the oversized path truncates afterwards
dataset = dataset.shuffle(seed=42)
if over_cap:
    dataset = dataset.select(range(MAX_DATASET_SIZE))
    print(f"✓ Reduced to {len(dataset)} examples")
else:
    print("✓ Dataset shuffled")

print(f"\n✓ Final training size: {len(dataset)} examples")

# Preview the first 500 characters of one formatted example
print("\nSample training example:")
first_text = dataset[0]['text']
print(first_text[:500])

In [None]:
# Build the SFTConfig: generic trainer settings plus the SFT-specific ones
training_args = SFTConfig(
    # Output location and checkpoint cadence
    output_dir=OUTPUT_DIR,
    save_strategy="epoch",
    # Run length and batch geometry (values come from the configuration cell)
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_8bit",
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    max_grad_norm=0.3,
    # Precision
    fp16=True,
    # Logging
    logging_steps=50,
    report_to="none",
    # SFT-specific parameters
    dataset_text_field="text",
    max_length=MAX_SEQ_LENGTH,
    packing=False,
)

In [None]:
# Wire the LoRA model, combined dataset, tokenizer and config into the trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)

example_count = len(dataset)
print("Trainer initialized. Starting training...")
print(f"Starting training on {example_count} examples...")

In [None]:
# Train!
# Runs the fine-tuning configured in training_args; with save_strategy="epoch"
# a checkpoint is written under OUTPUT_DIR at the end of each epoch.
trainer.train()

In [None]:
# Persist the trained model and the tokenizer side by side under OUTPUT_DIR/final
final_dir = OUTPUT_DIR + "/final"
print("Saving model...")
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(final_dir)
print(f"✓ Model saved to {OUTPUT_DIR}/final")

In [None]:
# Merge LoRA adapter into base model for next phase
print("\n" + "="*60)
print("MERGING LORA ADAPTER INTO BASE MODEL")
print("="*60)

# Reload the trained model in float16 (unquantized) for merging
print("\nLoading trained model for merging...")
import gc

from peft import PeftModel

# The Phase 2 adapter was trained on top of the merged Phase 1 checkpoint
# (MERGED_TEMP_PATH), so it has to be merged onto that same checkpoint.
# Merging it onto BASE_MODEL_PATH would drop the Phase 1 weights and apply the
# adapter to a model it was not trained against.
base_model = AutoModelForCausalLM.from_pretrained(
    MERGED_TEMP_PATH,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Load the LoRA adapter we just trained
lora_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR + "/final")

# Merge LoRA weights into base model
print("Merging LoRA adapter into base model...")
merged_model = lora_model.merge_and_unload()

# Save the merged model together with the tokenizer so the output directory
# can be loaded on its own in the next phase
MERGED_OUTPUT = OUTPUT_DIR + "/merged"
print(f"Saving merged model to {MERGED_OUTPUT}...")
merged_model.save_pretrained(MERGED_OUTPUT)
tokenizer.save_pretrained(MERGED_OUTPUT)

print(f"\n✓ Merged model saved to {MERGED_OUTPUT}")
print("✓ This merged model should be used as input for Phase 3")
print(f"✓ Pad token ID: {tokenizer.pad_token_id} (preserved for next phase)")

# Clean up to free memory: drop references, collect garbage, then release
# cached CUDA blocks
del base_model, lora_model, merged_model
gc.collect()
torch.cuda.empty_cache()
print("✓ Memory cleared")

## Merge LoRA Adapter into Base Model

For sequential training (Phase 2 → Phase 3 → Phase 4), we need to merge the LoRA adapter into the base model so the next phase can build on this knowledge.

In [None]:
# Test the model with a few prompts in the same "Instruction / Response" format used for training
print("\nTesting the fine-tuned model...")
test_prompts = [
    "Instruction: How do I list all files including hidden ones?\n\nResponse:",
    "Instruction: Install nginx web server\n\nResponse:",
    "Instruction: Find all Python files in current directory\n\nResponse:",
]

# The model has just been trained; switch to eval mode so LoRA dropout
# (lora_dropout=0.05) is not applied while generating.
model.eval()

for prompt in test_prompts:
    print(f"\n{'='*60}")
    print(f"PROMPT: {prompt}")
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=True,  # temperature only takes effect when sampling is enabled
            temperature=0.7,
            pad_token_id=tokenizer.pad_token_id,
        )
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"OUTPUT: {result}")

In [None]:
# Archive BOTH outputs for download
# {OUTPUT_DIR} is filled in from the Python variable before the shell command runs.
# The first archive holds the LoRA adapter saved under /final, the second the
# merged model saved under /merged.
print("\nArchiving outputs...")
!zip -r qwen3-08b-phase2-linux-lora.zip {OUTPUT_DIR}/final
!zip -r qwen3-08b-phase2-linux-merged.zip {OUTPUT_DIR}/merged

# Final summary: names both archives and reminds which one feeds Phase 3
print("\n" + "="*60)
print("PHASE 2 COMPLETE!")
print("="*60)
print("\n✓ LoRA adapter archived: qwen3-08b-phase2-linux-lora.zip")
print("✓ Merged model archived: qwen3-08b-phase2-linux-merged.zip")
print("\n⚠️  IMPORTANT: Upload the MERGED model as input for Phase 3")
print(f"⚠️  Pad token ID {tokenizer.pad_token_id} is preserved in merged model")