# 🚀 Step 2: Train Model (H100 GPU)

**REQUIRES GPU** - Run this with H100 runtime!

This notebook:
1. Loads data from Google Drive (from Step 1)
2. Fine-tunes the selected Qwen2.5-Coder model (14B by default) with LoRA
3. Tests the trained model and exports it to Google Drive and a zip archive

**Time:** ~1 hour
**Cost:** ~$4 (H100)

**Prerequisites:** Run **Step1_Collect_Data.ipynb** first!

In [None]:
#@title Configuration { display-mode: "form" }

# Hyperparameters for the fine-tuning run. The `#@param` annotations render
# these assignments as Colab form fields; later cells read the globals.
MODEL = "qwen-coder-14b"  #@param ["qwen-coder-7b", "qwen-coder-14b", "qwen-coder-32b"]
EPOCHS = 5  #@param {type:"integer"}
BATCH_SIZE = 8  #@param {type:"integer"}
MAX_LENGTH = 2048  #@param {type:"integer"}
LEARNING_RATE = 2e-4  #@param {type:"number"}

# Fail fast on nonsensical form input instead of discovering it partway
# through a long (and billed) GPU training run.
for _name, _value in (
    ("EPOCHS", EPOCHS),
    ("BATCH_SIZE", BATCH_SIZE),
    ("MAX_LENGTH", MAX_LENGTH),
    ("LEARNING_RATE", LEARNING_RATE),
):
    if _value <= 0:
        raise ValueError(f"{_name} must be positive, got {_value!r}")

print(f"Model: {MODEL}")
print(f"Epochs: {EPOCHS}, Batch: {BATCH_SIZE}, Max Length: {MAX_LENGTH}")

In [None]:
# Check GPU
!nvidia-smi --query-gpu=name,memory.total --format=csv

import torch
if not torch.cuda.is_available():
    raise RuntimeError("‚ùå No GPU found! Change runtime to H100.")

gpu_name = torch.cuda.get_device_name(0)
gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
print(f"\n‚úì GPU: {gpu_name} ({gpu_mem:.0f} GB)")

if gpu_mem < 40:
    print("‚ö†Ô∏è Warning: Less than 40GB VRAM. Consider using qwen-coder-7b.")

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

DRIVE_DATA = '/content/drive/MyDrive/zerolang_data'
DRIVE_OUTPUT = '/content/drive/MyDrive/zerolang_models'

!mkdir -p {DRIVE_OUTPUT}

# Check data exists
import os
required_files = ['train_chatml_large.jsonl', 'val_chatml_large.jsonl']
missing = [f for f in required_files if not os.path.exists(f"{DRIVE_DATA}/{f}")]

if missing:
    print(f"‚ùå Missing files in Google Drive: {missing}")
    print(f"\nPlease run Step1_Collect_Data.ipynb first!")
    raise FileNotFoundError("Training data not found")
else:
    print(f"‚úì Training data found in {DRIVE_DATA}")
    !ls -lh {DRIVE_DATA}/*.jsonl

In [None]:
# Install dependencies
!pip install -q transformers datasets peft accelerate bitsandbytes

In [None]:
# Clone repo and setup
!rm -rf /content/zerolang
!git clone --depth=1 https://github.com/project-zero-git/zerolang.git /content/zerolang
%cd /content/zerolang

# Copy data from Drive to local (faster training)
!mkdir -p data
!cp {DRIVE_DATA}/train_chatml_large.jsonl data/
!cp {DRIVE_DATA}/val_chatml_large.jsonl data/

# Count samples
!echo "Training samples:" && wc -l data/train_chatml_large.jsonl
!echo "Validation samples:" && wc -l data/val_chatml_large.jsonl

In [None]:
# Update train_cloud.py to find our data files
# Add large dataset file names to the search list
#
# Each "*_chatml_large.jsonl" name is inserted ahead of the matching
# "*_chatml_colab.jsonl" entry in training/train_cloud.py. The patch is skipped
# when the script already mentions the large training file, so re-running this
# cell is safe.

TRAIN_SCRIPT_PATH = 'training/train_cloud.py'

# Context manager so the file handle is closed deterministically.
with open(TRAIN_SCRIPT_PATH, encoding='utf-8') as f:
    train_script = f.read()

if 'train_chatml_large.jsonl' not in train_script:
    patched_script = train_script
    for split in ('train', 'val'):
        colab_entry = f'"{split}_chatml_colab.jsonl"'
        large_entry = f'"{split}_chatml_large.jsonl"'
        if colab_entry not in patched_script:
            # Without the anchor string there is nothing to patch; say so
            # instead of silently reporting success.
            print(f"WARNING: {colab_entry} not found in {TRAIN_SCRIPT_PATH}; "
                  f"{large_entry} was not added to the search list")
            continue
        patched_script = patched_script.replace(
            colab_entry,
            f'{large_entry},\n        {colab_entry}'
        )

    # Only rewrite the file (and claim success) when something changed.
    if patched_script != train_script:
        train_script = patched_script
        with open(TRAIN_SCRIPT_PATH, 'w', encoding='utf-8') as f:
            f.write(train_script)
        print("‚úì Updated train_cloud.py to find large dataset")

In [None]:
%%time
# Train!
# Runs training/train_cloud.py as a shell command, passing the values set in
# the Configuration cell (MODEL, EPOCHS, BATCH_SIZE, MAX_LENGTH, LEARNING_RATE)
# as command-line flags. %%time reports how long the whole cell took.
# `output_dir` is relative to the current working directory and is handed to
# the script via --output.
# NOTE(review): the exit status of the training command is not checked here, so
# a failed run does not stop the notebook — confirm the output directory exists
# before relying on later cells.
output_dir = f"models/zerolang-{MODEL}-large"

print(f"Training {MODEL}...")
print(f"Output: {output_dir}")
print("="*60 + "\n")

!python training/train_cloud.py \
    --model {MODEL} \
    --data data \
    --epochs {EPOCHS} \
    --batch-size {BATCH_SIZE} \
    --max-length {MAX_LENGTH} \
    --lr {LEARNING_RATE} \
    --output {output_dir}

In [None]:
# Test the model
# Load the base checkpoint from the Hub, then attach the fine-tuned adapter
# stored under models/ so the result can be sampled in the next cell.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Short model key (as chosen in the config form) -> Hugging Face checkpoint id.
BASE_MODELS = {
    "qwen-coder-7b": "Qwen/Qwen2.5-Coder-7B-Instruct",
    "qwen-coder-14b": "Qwen/Qwen2.5-Coder-14B-Instruct",
    "qwen-coder-32b": "Qwen/Qwen2.5-Coder-32B-Instruct",
}

base_model_name = BASE_MODELS[MODEL]
model_path = f"models/zerolang-{MODEL}-large"

print(f"Loading model from {model_path}...")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Base weights are loaded in fp16 with automatic device placement.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name, torch_dtype=torch.float16, device_map="auto"
)

# Wrap the base model with the trained adapter and switch to eval mode.
model = PeftModel.from_pretrained(base_model, model_path).eval()
print("‚úì Model loaded!")

In [None]:
def generate(prompt, max_new_tokens=512, temperature=0.2, system_prompt=None):
    """Generate a completion for ``prompt`` with the fine-tuned model.

    Uses the notebook-level ``tokenizer`` and ``model`` objects created by the
    model-loading cell.

    Args:
        prompt: Text of the user message.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value <= 0 (or None) switches to
            greedy decoding instead of sampling.
        system_prompt: Optional replacement for the default ZeroLang system
            message.

    Returns:
        The decoded text of the newly generated tokens only; the prompt tokens
        are sliced off and special tokens are skipped.
    """
    if system_prompt is None:
        system_prompt = "You are ZeroLang, an AI that generates WebAssembly (WAT) code."

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # Some tokenizers define no pad token; fall back to EOS so generate() does
    # not receive pad_token_id=None.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    gen_kwargs = {"max_new_tokens": max_new_tokens, "pad_token_id": pad_token_id}
    if temperature is not None and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature)
    else:
        # Temperature is only meaningful when sampling; decode greedily instead.
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)

    # Drop the echoed prompt tokens so only the completion is decoded.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

# Test prompts
# Smoke-test the fine-tuned model on a few simple C-style function signatures
# and print at most the first 500 characters of each completion.
tests = [
    "Implement: int add(int a, int b)",
    "Implement: int factorial(int n)",
    "Implement: int max(int a, int b)",
]

for prompt in tests:
    # Banner: blank line, rule, the prompt, rule.
    print(f"\n{'='*50}", f">>> {prompt}", '='*50, sep="\n")
    result = generate(prompt)
    # Slicing already returns the whole string when it is shorter than 500.
    print(result[:500])

In [None]:
# Save to Google Drive
import shutil

output_name = f"zerolang-{MODEL}-large"
drive_model_path = f"{DRIVE_OUTPUT}/{output_name}"

print(f"Saving model to Google Drive...")
if os.path.exists(drive_model_path):
    shutil.rmtree(drive_model_path)
shutil.copytree(f"models/{output_name}", drive_model_path)

print(f"\n" + "="*60)
print(f"‚úÖ Model saved to: {drive_model_path}")
print("="*60)
!ls -la {drive_model_path}

In [None]:
# Create downloadable zip
!cd models && zip -r ../zerolang-model-large.zip {output_name}
print(f"\nZip file created: zerolang-model-large.zip")
!ls -lh zerolang-model-large.zip

# Uncomment to download:
# from google.colab import files
# files.download('zerolang-model-large.zip')

In [None]:
# Final summary
# Recap the run settings and where the trained model was written. The lines
# are collected first and emitted with a single print call.
rule = "=" * 60
summary_lines = [
    rule,
    "üéâ Training Complete!",
    rule,
    f"\nModel: {MODEL}",
    f"Epochs: {EPOCHS}",
    f"\nSaved to:",
    f"  - Google Drive: {drive_model_path}",
    f"  - Zip: zerolang-model-large.zip",
    f"\nTo use locally:",
    f"  python scripts/download_and_test.py --model-path <path-to-model>",
]
print("\n".join(summary_lines))