# 🚀 ZeroLang Full Pipeline

**Automatic data collection + training in one notebook**

1. Collect 2000+ C→WAT training pairs
2. Train Qwen2.5-Coder-14B model
3. Test and export

**Total time: ~3-4 hours**
- Data collection: ~2 hours
- Training: ~1-2 hours

**Requirements:**
- H100 GPU (or an A100 for the 7B model)
- Colab Pro+ recommended

## ⚙️ Configuration

In [None]:
#@title Configuration { display-mode: "form" }
# Notebook-wide settings. The `#@param` trailers make Colab render each
# assignment as a form field, so keep them on the same line as the value.

#@markdown ### Data Collection
# NOTE(review): TARGET_PAIRS is only echoed by the print below in the visible
# notebook; it is not forwarded to pipeline/generator.py — confirm whether the
# generator should receive it.
TARGET_PAIRS = 2000  #@param {type:"integer"}
MAX_REPOS = 50  #@param {type:"integer"}

#@markdown ### Training
# MODEL is passed as --model to training/train_cloud.py and is also the key
# used to look up the base checkpoint when the trained adapter is loaded.
MODEL = "qwen-coder-14b"  #@param ["qwen-coder-7b", "qwen-coder-14b", "qwen-coder-32b"]
# Forwarded to training/train_cloud.py as --epochs / --batch-size / --max-length.
EPOCHS = 10  #@param {type:"integer"}
BATCH_SIZE = 8  #@param {type:"integer"}
MAX_LENGTH = 2048  #@param {type:"integer"}

#@markdown ### Output
# When True, Google Drive is mounted and the trained model is copied there.
SAVE_TO_DRIVE = True  #@param {type:"boolean"}

# Echo the chosen settings so they are recorded in the cell output.
print(f"Target: {TARGET_PAIRS} pairs from {MAX_REPOS} repos")
print(f"Model: {MODEL}, Epochs: {EPOCHS}")

## 1️⃣ Setup Environment

In [None]:
# Check GPU
!nvidia-smi --query-gpu=name,memory.total --format=csv

import torch
print(f"\nPyTorch CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
%%bash
# Provision the C -> WebAssembly toolchain used by the data-collection step:
# clang/lld from apt plus a pinned wasm-tools release binary.
#
# The script fails fast: any step that errors aborts with a non-zero exit
# status, so "Environment Ready" is only printed when every check passed.
set -euo pipefail

WASM_TOOLS_VERSION="1.244.0"
WASM_TOOLS_DIR="wasm-tools-${WASM_TOOLS_VERSION}-x86_64-linux"
CLANG_WASM_FLAGS=(--target=wasm32 -nostdlib -Wl,--no-entry -Wl,--export-all)

# Install LLVM with WASM support
echo "[1/4] Installing LLVM toolchain..."
apt-get update -qq
# Keep the happy path quiet, but show apt's output if the install fails.
if ! apt-get install -qq -y llvm lld clang > /tmp/apt-install.log 2>&1; then
    cat /tmp/apt-install.log >&2
    exit 1
fi

# Verify WASM target works
echo "[2/4] Testing WASM compilation..."
# Remove any artifact from a previous run so it cannot mask a failure.
rm -f /tmp/test.wasm
echo 'int add(int a, int b) { return a + b; }' > /tmp/test.c
if clang "${CLANG_WASM_FLAGS[@]}" -fuse-ld=lld -o /tmp/test.wasm /tmp/test.c 2>/dev/null; then
    echo "✓ WASM compilation works!"
else
    echo "✗ WASM compilation failed - trying alternative..."
    # Try with explicit lld path; under `set -e` a second failure aborts here
    # with clang's error message visible.
    clang "${CLANG_WASM_FLAGS[@]}" -fuse-ld=/usr/bin/lld -o /tmp/test.wasm /tmp/test.c
fi

# Install wasm-tools from bytecodealliance
echo "[3/4] Installing wasm-tools..."
cd /tmp
# -f makes curl fail on HTTP errors instead of saving an error page as the tarball.
curl -fsSLO "https://github.com/bytecodealliance/wasm-tools/releases/download/v${WASM_TOOLS_VERSION}/${WASM_TOOLS_DIR}.tar.gz"
tar -xzf "${WASM_TOOLS_DIR}.tar.gz"
install -m 0755 "${WASM_TOOLS_DIR}/wasm-tools" /usr/local/bin/wasm-tools

# Verify wasm-tools
echo "[4/4] Testing wasm-tools..."
if wasm-tools print /tmp/test.wasm > /dev/null 2>&1; then
    echo "✓ wasm-tools works!"
    wasm-tools --version
else
    echo "✗ wasm-tools failed" >&2
    exit 1
fi

echo ""
echo "=== Environment Ready ==="

In [None]:
# Install Python dependencies
# `transformers` and `peft` are imported directly by the model-test cell later
# in this notebook; the remaining packages are presumably needed by
# training/train_cloud.py (not visible here — confirm against that script).
# -q keeps pip's output short.
!pip install -q transformers datasets peft accelerate bitsandbytes

In [None]:
# Clone repository
!rm -rf zerolang  # Clean if exists
!git clone --depth=1 https://github.com/project-zero-git/zerolang.git
%cd zerolang
!ls -la

In [None]:
# Update generator.py to use system clang (not homebrew path)
!sed -i 's|/opt/homebrew/opt/llvm/bin/clang|clang|g' pipeline/generator.py

# Quick test: compile a single function
print("Testing pipeline with a single function...")
!echo 'int multiply(int a, int b) { return a * b; }' > /tmp/single_test.c
!clang --target=wasm32 -O2 -nostdlib -fuse-ld=lld -Wl,--no-entry -Wl,--export-all -o /tmp/single_test.wasm /tmp/single_test.c && wasm-tools print /tmp/single_test.wasm | head -20
print("\n‚úì Pipeline test passed!")

In [None]:
# Mount Google Drive (optional - for saving model)
if SAVE_TO_DRIVE:
    from google.colab import drive
    drive.mount('/content/drive')
    DRIVE_OUTPUT = '/content/drive/MyDrive/zerolang_models'
    !mkdir -p {DRIVE_OUTPUT}
    print(f"Models will be saved to: {DRIVE_OUTPUT}")
else:
    DRIVE_OUTPUT = None
    print("Drive not mounted - model will be saved locally only")

## 2️⃣ Data Collection

Collects C functions from GitHub repos and compiles them to WAT.

In [None]:
# Extended repository list for more data.
# One URL per line; blank lines and lines starting with '#' are ignored.
REPOS = '''# Algorithms & Data Structures
https://github.com/TheAlgorithms/C
https://github.com/fragglet/c-algorithms
https://github.com/attractivechaos/klib
https://github.com/srdja/Collections-C
https://github.com/troydhanson/uthash

# Cryptography
https://github.com/B-Con/crypto-algorithms
https://github.com/kokke/tiny-AES-c
https://github.com/ctz/cifra
https://github.com/983/SHA-256
https://github.com/983/Num

# String & Text
https://github.com/sheredom/utf8.h
https://github.com/antirez/sds
https://github.com/jwerle/murmurhash.c
https://github.com/skeeto/branchless-utf8

# JSON & Parsing
https://github.com/DaveGamble/cJSON
https://github.com/zserge/jsmn
https://github.com/kgabis/parson
https://github.com/cesanta/frozen

# Compression
https://github.com/lz4/lz4
https://github.com/richgel999/miniz

# Math & Numerical
https://github.com/nothings/stb
https://github.com/983/fft
https://github.com/skeeto/hash-prospector
https://github.com/lemire/clhash

# Utilities
https://github.com/antirez/linenoise
https://github.com/rxi/vec
https://github.com/rxi/map
https://github.com/rxi/log.c
https://github.com/skeeto/optparse
https://github.com/gingerBill/gb

# Additional algorithm repos
https://github.com/tezc/sc
https://github.com/tidwall/hashmap.c
https://github.com/sheredom/hashmap.h
https://github.com/tidwall/btree.c
https://github.com/antirez/rax
https://github.com/clibs/buffer
https://github.com/clibs/list
'''

# Extract the URLs: strip each line first so indented comments are skipped too.
all_repos = [
    line.strip()
    for line in REPOS.splitlines()
    if line.strip() and not line.strip().startswith('#')
]

# Honour the MAX_REPOS setting from the configuration cell; it was previously
# announced there but never applied. A negative value is treated as 0.
repo_list = all_repos[:max(MAX_REPOS, 0)]

# Save to file: write only the selected URLs so the generator processes
# exactly the repositories listed below.
with open('pipeline/repos_extended.txt', 'w') as f:
    f.write('\n'.join(repo_list) + '\n')

print(f"Total repos to process: {len(repo_list)}")
if len(repo_list) < len(all_repos):
    print(f"  (limited by MAX_REPOS={MAX_REPOS}; {len(all_repos)} available)")
for i, r in enumerate(repo_list, 1):
    # Last path component is the repository name; tolerate a trailing slash
    print(f"  {i}. {r.rstrip('/').split('/')[-1]}")

In [None]:
%%time
# Run data collection
# (%%time must stay on the first line of the cell; it reports how long the
# whole cell took.)
import os
# Output directory for the generated JSONL file and the collection log
os.makedirs('data', exist_ok=True)

print(f"Collecting data from repos...")
print("This will take ~1-2 hours...\n")
print("="*60)

# Run the generator over the repo list written by the previous cell.
# stderr is merged into stdout, the complete output is saved to
# data/collection.log via tee, and only lines matching the grep pattern
# are shown in the notebook.
!python pipeline/generator.py \
    -l pipeline/repos_extended.txt \
    -o data/colab_training.jsonl \
    --verbose \
    2>&1 | tee data/collection.log | grep -E '(SUCCESS|Processing:|pairs_generated|==)'

In [None]:
# Check collected data: load the generated JSONL and print summary statistics.
import json
from pathlib import Path

data_file = Path('data/colab_training.jsonl')

# Bind `pairs` up front so the split cell that follows can test it even when
# collection produced no file (previously that path left `pairs` undefined
# and the next cell failed with NameError instead of its own error message).
pairs = []

if not data_file.exists():
    print("ERROR: No data collected!")
    print("Check data/collection.log for errors")
else:
    with open(data_file, encoding='utf-8') as f:
        # One JSON object per line; blank lines are skipped
        pairs = [json.loads(line) for line in f if line.strip()]

    print("=" * 60)
    print("Data Collection Results")
    print("=" * 60)
    print(f"Total pairs collected: {len(pairs)}")

    if pairs:
        # Each record carries the prompt under 'instruction' and the WAT under 'output'
        avg_instr = sum(len(p['instruction']) for p in pairs) / len(pairs)
        avg_wat = sum(len(p['output']) for p in pairs) / len(pairs)
        print(f"Avg instruction length: {avg_instr:.0f} chars")
        print(f"Avg WAT length: {avg_wat:.0f} chars")

        # Sample
        print("\n--- Sample pair ---")
        print(f"Instruction: {pairs[0]['instruction'][:100]}...")
        print(f"WAT preview: {pairs[0]['output'][:200]}...")

In [None]:
# Split into train/val and convert to ChatML format
if len(pairs) > 0:
    print("Splitting data...")
    !python pipeline/postprocess.py split data/colab_training.jsonl \
        --train data/train_colab.jsonl \
        --val data/val_colab.jsonl \
        --val-ratio 0.1
    
    print("\nConverting to ChatML format...")
    !python training/prepare_data.py data/train_colab.jsonl -o data/train_chatml_colab.jsonl -f chatml
    !python training/prepare_data.py data/val_colab.jsonl -o data/val_chatml_colab.jsonl -f chatml
    
    print("\nFinal data files:")
    !wc -l data/*_colab.jsonl
else:
    print("ERROR: No data to process. Check collection logs.")

## 3️⃣ Model Training

In [None]:
# Verify both ChatML files exist and report their line counts before training
train_file = Path('data/train_chatml_colab.jsonl')
val_file = Path('data/val_chatml_colab.jsonl')

if not (train_file.exists() and val_file.exists()):
    raise FileNotFoundError("Training data not found! Run data collection first.")

# Every line of each file counts as one sample
with open(train_file) as train_fh:
    train_count = len(train_fh.readlines())
with open(val_file) as val_fh:
    val_count = len(val_fh.readlines())

print(f"Training samples: {train_count}")
print(f"Validation samples: {val_count}")
print(f"\nReady to train {MODEL}!")

In [None]:
%%time
# Train model
# (%%time must stay on the first line of the cell; it reports how long the
# whole cell took.)
print(f"Training {MODEL} for {EPOCHS} epochs...")
print(f"Batch size: {BATCH_SIZE}, Max length: {MAX_LENGTH}")
print("This will take ~1-2 hours...\n")

# Same path the test and export cells later read the trained model from
output_dir = f"models/zerolang-{MODEL}-colab"

# Launch training as a separate process, forwarding the configuration values.
# NOTE(review): --data receives the `data` directory, not a file; presumably
# train_cloud.py locates the ChatML train/val files inside it — confirm.
!python training/train_cloud.py \
    --model {MODEL} \
    --data data \
    --epochs {EPOCHS} \
    --batch-size {BATCH_SIZE} \
    --max-length {MAX_LENGTH} \
    --output {output_dir}

## 4️⃣ Test Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Short model key (as chosen in the configuration cell) -> Hugging Face hub id
BASE_MODELS = {
    f"qwen-coder-{size.lower()}": f"Qwen/Qwen2.5-Coder-{size}-Instruct"
    for size in ("7B", "14B", "32B")
}

model_path = f"models/zerolang-{MODEL}-colab"
base_model_name = BASE_MODELS[MODEL]

print(f"Loading trained model from {model_path}...")
# The tokenizer comes from the training output directory, the weights from the
# base checkpoint with the trained adapter applied on top.
tokenizer = AutoTokenizer.from_pretrained(model_path)
base_load_options = dict(torch_dtype=torch.float16, device_map="auto")
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, **base_load_options)
model = PeftModel.from_pretrained(base_model, model_path)
model.eval()
print("Model loaded successfully!")

In [None]:
def generate_wat(instruction, max_tokens=1024):
    """Return the model's WAT completion for *instruction*.

    The instruction is wrapped in a system + user chat, rendered with the
    tokenizer's chat template, and sampled from the notebook-level ``model``.
    Only the newly generated tokens are decoded; at most *max_tokens* of them
    are produced. The returned text has surrounding whitespace removed.
    """
    system_turn = {
        "role": "system",
        "content": "You are ZeroLang, an AI that generates optimized WebAssembly (WAT) code. Output only valid WAT code.",
    }
    user_turn = {"role": "user", "content": instruction}

    prompt = tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens, used below to cut the prompt off the output
    prompt_length = inputs['input_ids'].shape[1]

    sampling_options = dict(
        max_new_tokens=max_tokens,
        temperature=0.2,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )
    with torch.no_grad():
        outputs = model.generate(**inputs, **sampling_options)

    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

In [None]:
# Exercise the model on a handful of small C function signatures
test_prompts = [
    f"Implement: {signature}"
    for signature in (
        "int add(int a, int b)",
        "int factorial(int n)",
        "int max(int a, int b)",
        "int fibonacci(int n)",
        "void swap(int *a, int *b)",
    )
]

print("Testing model outputs...\n")
separator = '=' * 60
for prompt in test_prompts:
    print(separator)
    print(f"Input: {prompt}")
    print(separator)
    wat = generate_wat(prompt)
    # Slicing already returns the whole string when it is shorter than 600 chars
    print(wat[:600])
    print()

## 5️⃣ Save & Export

In [None]:
# Save to Google Drive
if SAVE_TO_DRIVE and DRIVE_OUTPUT:
    import shutil
    output_name = f"zerolang-{MODEL}-colab"
    
    print(f"Saving model to Google Drive...")
    
    # Copy model
    drive_model_path = f"{DRIVE_OUTPUT}/{output_name}"
    if Path(drive_model_path).exists():
        shutil.rmtree(drive_model_path)
    shutil.copytree(f"models/{output_name}", drive_model_path)
    
    # Copy training data
    shutil.copy("data/colab_training.jsonl", f"{DRIVE_OUTPUT}/training_data.jsonl")
    
    print(f"‚úÖ Saved to Google Drive: {DRIVE_OUTPUT}")
    !ls -la {DRIVE_OUTPUT}
else:
    print("Google Drive not mounted. Model saved locally only.")

In [None]:
# Download as zip (alternative)
# NOTE(review): `os` is not referenced anywhere in this cell.
import os
output_name = f"zerolang-{MODEL}-colab"

# Zip from inside models/ so the archive contains `<output_name>/...` rather
# than `models/<output_name>/...`; the zip file lands in the current directory.
!cd models && zip -r ../{output_name}.zip {output_name}

# Include training data in a separate zip
# (-j stores the files without their data/ directory prefix)
!zip -j training_data.zip data/colab_training.jsonl data/train_chatml_colab.jsonl data/val_chatml_colab.jsonl

print(f"\nCreated zip files:")
!ls -lh *.zip

# Uncomment to download
# from google.colab import files
# files.download(f'{output_name}.zip')

## 📊 Summary

In [None]:
# Final recap of what was collected, which model was trained and where the
# outputs live.
from pathlib import Path

print("=" * 60)
print("🎉 ZeroLang Training Complete!")
print("=" * 60)

# Load stats. Count only non-blank lines so the number matches the pair count
# reported by the data-check cell, and report 0 instead of crashing when the
# data file is missing.
summary_data_file = Path('data/colab_training.jsonl')
if summary_data_file.exists():
    with open(summary_data_file, encoding='utf-8') as f:
        final_pairs = sum(1 for line in f if line.strip())
else:
    final_pairs = 0

print("\n📊 Results:")
print(f"  - Data collected: {final_pairs} pairs")
print(f"  - Model: {MODEL}")
print(f"  - Epochs: {EPOCHS}")
print("\n📁 Output:")
print(f"  - Local: models/zerolang-{MODEL}-colab")
if SAVE_TO_DRIVE and DRIVE_OUTPUT:
    print(f"  - Drive: {DRIVE_OUTPUT}")

print("\n🚀 Next steps:")
print("  1. Download the model zip")
print("  2. Use training/inference.py to test locally")
print("  3. Integrate into zrun runtime")