# Phase 1B - Comprehensive Linux Commands Training

**Purpose:** Maximize A100 80GB capacity with comprehensive Linux command dataset

**Datasets:**
- AnishJoshi/nl2bash-custom (24.6K total; 19.7K in the train split)
- m-a-p/CodeFeedback-Filtered-Instruction (bash/shell)
- epinnock/intercode-nl2bash-curated (200)
- darkknight25/Linux_Terminal_Commands_Dataset (600)
- kushagragoyal060705/complex-linux-commands (sampled 50K)

**Total:** ~100K examples

**Batch Size:** 16 (effective 64 with gradient accumulation)

**Expected Time:** 5-6 hours on A100 80GB

In [None]:
# Install dependencies
!pip install -q transformers datasets accelerate peft bitsandbytes trl torch

print("\n✅ Installation complete")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.6/564.6 kB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25h
✅ Installation complete


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

# Report the PyTorch build and, when CUDA is usable, the name and total
# memory of GPU 0.
cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")

PyTorch version: 2.8.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-80GB
GPU Memory: 85.17 GB


In [None]:
# Configuration
# Base checkpoint to fine-tune, and the directory that receives the trainer
# output as well as the saved LoRA adapter and tokenizer.
MODEL_NAME = "DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B"
OUTPUT_DIR = "./qwen3-phase1b-linux-comprehensive"

# Training hyperparameters - OPTIMIZED FOR A100 80GB
BATCH_SIZE = 16  # Increased for A100 80GB (per-device batch size)
GRADIENT_ACCUMULATION = 4  # 16 x 4 = 64 examples per optimizer step
LEARNING_RATE = 2e-4
NUM_EPOCHS = 3
MAX_SEQ_LENGTH = 2048  # Intended token cap per example; only effective where a cell passes it to the trainer config

# LoRA config
LORA_R = 16  # Adapter rank
LORA_ALPHA = 32  # Scaling numerator; alpha / r = 2
LORA_DROPOUT = 0.05

# CRITICAL: Same padding token
# NOTE(review): presumably "same" refers to the ID used by the other training
# phases of this project — confirm before changing.
PAD_TOKEN_ID = 151645

print("✅ Configuration set")

✅ Configuration set


In [None]:
# Load the tokenizer that ships with the base checkpoint.
print("Loading tokenizer from HuggingFace...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

# CRITICAL: pin the padding configuration instead of trusting the checkpoint
# defaults (pad on the right, with the project-wide pad token ID).
tokenizer.padding_side = "right"
tokenizer.pad_token_id = PAD_TOKEN_ID

print(
    "✅ Tokenizer loaded",
    f"Pad token ID: {tokenizer.pad_token_id}",
    sep="\n",
)

Loading tokenizer from HuggingFace...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

✅ Tokenizer loaded
Pad token ID: 151645


In [None]:
# Load the base model quantized to 4-bit and prepare it for LoRA training.
print("=" * 60, "Loading Model from HuggingFace", "=" * 60, sep="\n")

print("\nConfiguring 4-bit quantization...")
# NF4 weights with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

print(f"Loading {MODEL_NAME}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

print("Preparing model for LoRA training...")
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
)

print(f"✅ Model loaded on device: {model.device}")

Loading Model from HuggingFace

Configuring 4-bit quantization...
Loading DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B...


config.json:   0%|          | 0.00/822 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Preparing model for LoRA training...
✅ Model loaded on device: cuda:0


In [None]:
# Wrap the quantized model with LoRA adapters on the attention projections.
print("\nApplying LoRA configuration...")
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

model = get_peft_model(model, lora_config)

print("\n✅ LoRA applied")
# Prints the trainable / total parameter counts for a quick sanity check.
model.print_trainable_parameters()


Applying LoRA configuration...

✅ LoRA applied
trainable params: 6,881,280 || all params: 823,164,416 || trainable%: 0.8360


In [None]:
# Load and combine multiple Linux command datasets.
#
# Dataset 1 is mandatory (a failure there aborts the cell). Datasets 2-5 are
# best-effort: a failed download is reported and the dataset is skipped.
# The cell defines `dataset1` .. `dataset5` and `datasets_to_combine`, which
# the normalization cell consumes.
print("="*60)
print("LOADING COMPREHENSIVE LINUX COMMAND DATASETS")
print("="*60)

from datasets import concatenate_datasets


def _is_shell_example(example):
    """Return True when the example's language tag mentions bash or shell.

    The CodeFeedback dataset card lists the column as `lang`; `language` is
    kept as a fallback so the original lookup still works if it is present.
    """
    tag = str(example.get('lang') or example.get('language') or '').lower()
    return 'bash' in tag or 'shell' in tag


# Dataset 1: nl2bash-custom (24.6K examples)
print("\n1. Loading AnishJoshi/nl2bash-custom...")
dataset1 = load_dataset("AnishJoshi/nl2bash-custom", split="train")
print(f"   Loaded: {len(dataset1)} examples")

# Dataset 2: Complex Linux commands (sample 50K from 1M)
print("\n2. Loading complex-linux-commands (sampling 50K)...")
try:
    dataset2 = load_dataset("kushagragoyal060705/complex-linux-commands-from-natual-language", split="train")
    # Fixed seed so the 50K subsample is the same on every run.
    dataset2 = dataset2.shuffle(seed=42).select(range(min(50000, len(dataset2))))
    print(f"   Loaded: {len(dataset2)} examples")
except Exception as e:
    print(f"   ⚠️  Failed to load: {e}")
    dataset2 = None

# Dataset 3: InterCode curated (200 examples)
print("\n3. Loading epinnock/intercode-nl2bash-curated...")
try:
    dataset3 = load_dataset("epinnock/intercode-nl2bash-curated", split="train")
    print(f"   Loaded: {len(dataset3)} examples")
except Exception as e:
    print(f"   ⚠️  Failed to load: {e}")
    dataset3 = None

# Dataset 4: Linux Terminal Commands (600 examples)
print("\n4. Loading darkknight25/Linux_Terminal_Commands_Dataset...")
try:
    dataset4 = load_dataset("darkknight25/Linux_Terminal_Commands_Dataset", split="train")
    print(f"   Loaded: {len(dataset4)} examples")
except Exception as e:
    print(f"   ⚠️  Failed to load: {e}")
    dataset4 = None

# Dataset 5: CodeFeedback bash/shell filtered
print("\n5. Loading m-a-p/CodeFeedback-Filtered-Instruction (bash/shell)...")
try:
    dataset5_raw = load_dataset("m-a-p/CodeFeedback-Filtered-Instruction", split="train")
    # Filter for bash/shell examples
    dataset5 = dataset5_raw.filter(_is_shell_example)
    print(f"   Loaded: {len(dataset5)} bash/shell examples")
except Exception as e:
    print(f"   ⚠️  Failed to load: {e}")
    dataset5 = None

print("\n" + "="*60)
print("DATASET LOADING COMPLETE")
print("="*60)

# Combine successfully loaded datasets. Empty datasets (e.g. a filter that
# matched nothing) are dropped as well, since they contribute no examples.
datasets_to_combine = [
    d for d in [dataset1, dataset2, dataset3, dataset4, dataset5]
    if d is not None and len(d) > 0
]
print(f"\n✅ Successfully loaded {len(datasets_to_combine)} datasets")

Loading Linux commands dataset...


README.md: 0.00B [00:00, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


train.json: 0.00B [00:00, ?B/s]

dev.json: 0.00B [00:00, ?B/s]

test.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/19658 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2457 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2458 [00:00<?, ? examples/s]

Dataset size: 19658 examples

Sample entry:
{'bash_code': 'find -mtime +2 -mtime -5', 'nl_command': 'Look for any files that were modified 2-5 days ago', 'srno': 3194}


In [None]:
# Normalize all datasets to common format

# Candidate field names, tried in order. `nl_command` / `bash_code` are the
# fields used by AnishJoshi/nl2bash-custom; without them every example from
# that dataset was formatted as an empty "Instruction: / Response:" pair.
_DESCRIPTION_KEYS = ('description', 'question', 'instruction', 'nl', 'query', 'nl_command')
_COMMAND_KEYS = ('cmd', 'answer', 'output', 'bash', 'command', 'bash_code')


def _first_present(example, keys):
    """Return the first truthy value among `keys` in `example`, or '' if none."""
    for key in keys:
        value = example.get(key)
        if value:
            return value
    return ''


def _is_usable(example):
    """Return True when the example has both an instruction and a command."""
    return bool(_first_present(example, _DESCRIPTION_KEYS)) and bool(
        _first_present(example, _COMMAND_KEYS)
    )


def normalize_dataset(example):
    """Normalize different dataset formats to common structure.

    Args:
        example: one dataset row (a mapping of column name to value).

    Returns:
        A dict with a single `text` key holding
        "Instruction: <description>\\n\\nResponse: <command>". A field that
        cannot be found under any known name is rendered as an empty string.
    """
    description = _first_present(example, _DESCRIPTION_KEYS)
    command = _first_present(example, _COMMAND_KEYS)

    prompt = f"Instruction: {description}\n\nResponse:"
    text = f"{prompt} {command}"
    return {"text": text}


print("\nNormalizing datasets to common format...")
normalized_datasets = []
for i, ds in enumerate(datasets_to_combine, 1):
    print(f"  Normalizing dataset {i}...")
    # Drop rows with no recognised instruction/command field instead of
    # training on empty prompts.
    usable = ds.filter(_is_usable)
    skipped = len(ds) - len(usable)
    if skipped:
        print(f"    ⚠️  Skipped {skipped} examples without a recognised instruction/command field")
    if len(usable) == 0:
        continue
    normalized = usable.map(normalize_dataset, remove_columns=usable.column_names)
    normalized_datasets.append(normalized)
    print(f"    ✅ {len(normalized)} examples formatted")

if not normalized_datasets:
    raise ValueError("No usable training examples: no dataset exposed a recognised instruction/command field")

# Combine all normalized datasets
print("\nCombining datasets...")
dataset = concatenate_datasets(normalized_datasets)
print(f"\n✅ TOTAL COMBINED: {len(dataset)} examples")

# Shuffle for good measure (fixed seed keeps the order reproducible)
print("\nShuffling combined dataset...")
dataset = dataset.shuffle(seed=42)

print("\n" + "="*60)
print(f"FINAL TRAINING DATASET: {len(dataset):,} examples")
print("="*60)
print("\n📝 Sample formatted example:")
print(dataset[0]['text'][:500])

Map:   0%|          | 0/19658 [00:00<?, ? examples/s]


Formatted example:
Instruction: 

Response: 


In [None]:
# Training arguments
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    # Examples per optimizer step = BATCH_SIZE * GRADIENT_ACCUMULATION.
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    optim="paged_adamw_8bit",
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    report_to="none",
    # Column produced by the normalization cell.
    dataset_text_field="text",
    # Apply the configured token cap; previously MAX_SEQ_LENGTH was declared
    # but never passed, so the library default truncation length was used.
    max_length=MAX_SEQ_LENGTH,
    packing=False,
)

print("✅ Training arguments configured")

✅ Training arguments configured


In [None]:
# Create trainer and start training
print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # Pass the tokenizer configured above. Without it the trainer loads a
    # fresh tokenizer for the checkpoint, which silently drops the pad-token
    # and padding-side overrides marked CRITICAL in the tokenizer cell.
    processing_class=tokenizer,
)

print(f"\nTraining on {len(dataset)} examples...")
# Estimate only: floors the per-epoch step count, so the trainer's own total
# may differ by a few steps.
print(f"Total steps: {len(dataset) // (BATCH_SIZE * GRADIENT_ACCUMULATION) * NUM_EPOCHS}")
print("\nStarting training...\n")

trainer.train()

print("\n" + "="*60)
print("✅ TRAINING COMPLETE!")
print("="*60)


STARTING TRAINING


Adding EOS to train dataset:   0%|          | 0/19658 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/19658 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/19658 [00:00<?, ? examples/s]


Training on 19658 examples...
Total steps: 3684

Starting training...



Step,Training Loss
10,6.6264
20,5.1648


In [None]:
# Move the model (including the LoRA parameters) onto the CUDA device and
# report where its first parameter now lives.
model = model.to("cuda")
_first_param = next(model.parameters())
print(f"✅ Model moved to GPU: {_first_param.device}")

In [None]:
# Persist the LoRA adapter and the tokenizer side by side in OUTPUT_DIR,
# then confirm the adapter weights file exists and report its size.
print("\nSaving LoRA adapter...")
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

import os
adapter_path = f"{OUTPUT_DIR}/adapter_model.safetensors"
if not os.path.exists(adapter_path):
    print("❌ ERROR: Adapter file not found")
else:
    # Size in megabytes (10^6 bytes).
    adapter_size = os.path.getsize(adapter_path) / 1e6
    print(f"✅ Adapter saved: {adapter_size:.1f} MB")

# Summary

**Colab vs Kaggle Comparison:**

Record your observations:
- Setup time
- Training time
- Ease of use
- GPU type assigned
- Any errors encountered
- Overall experience