# SQL Codegen SLM - Full Production Training

**Fine-tune Mistral-7B for PostgreSQL query generation**

## Training Configuration
- **Dataset**: 6,016 train + 332 val examples
- **Epochs**: 3
- **Estimated time**: 8-12 hours on A100
- **Output**: `gs://sql-codegen-slm-data/models/mistral-sql-final`

## Pre-validated ✅
- GPU: A100-SXM4-40GB
- Memory: 13GB / 42.5GB (safe)
- Training pipeline verified
- GCS sync working

---
## 1. GPU Check

In [None]:
!nvidia-smi

import torch
print(f"\nPyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"Memory: {gpu_mem:.1f} GB")
    
    if 'A100' in gpu_name:
        print("\n‚úÖ A100 detected - optimal for training")
    elif gpu_mem >= 16:
        print("\n‚úÖ GPU has sufficient memory")
    else:
        print("\n‚ö†Ô∏è GPU may have insufficient memory - consider A100")
else:
    raise RuntimeError("‚ùå No GPU detected! Enable GPU in Runtime > Change runtime type")

---
## 2. Configuration

In [None]:
import os
from datetime import datetime, timedelta

# Resolve the GCP project ID: Colab secrets first, then environment variables.
PROJECT_ID = None
try:
    from google.colab import userdata
    PROJECT_ID = userdata.get('GCP_PROJECT_ID')
except Exception as exc:
    # Covers ImportError outside Colab as well as any error raised while reading
    # the secret. Unlike a bare `except:`, KeyboardInterrupt and SystemExit
    # still propagate.
    print(f"Colab secret 'GCP_PROJECT_ID' unavailable ({type(exc).__name__}); using environment instead")

# Also fall back when the secret lookup returned nothing: os.environ values
# must be strings, so a None here would crash the assignment below.
if not PROJECT_ID:
    PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'your-gcp-project-id')

if PROJECT_ID == 'your-gcp-project-id':
    print("WARNING: GCP_PROJECT_ID is not set; using placeholder 'your-gcp-project-id'")

# Configuration
BUCKET_NAME = os.environ.get('GCS_BUCKET', 'sql-codegen-slm-data')
REPO_URL = 'https://github.com/rajeshmr/sql-codegen-slm.git'
OUTPUT_DIR = '/content/models/mistral-sql-final'
GCS_OUTPUT = f'gs://{BUCKET_NAME}/models/mistral-sql-final'

# Export the resolved values so child processes started from this notebook
# inherit them through the environment.
os.environ['GCP_PROJECT_ID'] = PROJECT_ID
os.environ['GCS_BUCKET'] = BUCKET_NAME

# Estimate completion time
start_time = datetime.now()
estimated_hours = 10  # Conservative estimate
estimated_end = start_time + timedelta(hours=estimated_hours)

print("="*60)
print("TRAINING CONFIGURATION")
print("="*60)
print(f"Project ID: {PROJECT_ID}")
print(f"GCS Bucket: {BUCKET_NAME}")
print(f"Output: {GCS_OUTPUT}")
print(f"\nStart time: {start_time.strftime('%Y-%m-%d %H:%M')}")
print(f"Estimated completion: {estimated_end.strftime('%Y-%m-%d %H:%M')} (~{estimated_hours}h)")
print("="*60)

---
## 3. Authenticate GCS

In [None]:
from google.colab import auth
auth.authenticate_user()
!gcloud config set project {PROJECT_ID}

# Verify bucket access
!gsutil ls gs://{BUCKET_NAME}/ | head -3
print(f"\n‚úÖ GCS authenticated: gs://{BUCKET_NAME}/")

---
## 4. Clone Repository

In [None]:
import os

if not os.path.exists('sql-codegen-slm'):
    !git clone {REPO_URL}
    print("‚úÖ Repository cloned")
else:
    print("Repository exists, pulling latest...")
    
%cd sql-codegen-slm
!git pull
print(f"\n‚úÖ Working directory: {os.getcwd()}")

---
## 5. Install Dependencies

In [None]:
!pip install -q -r training/requirements.txt

# Verify key packages
import transformers
import peft
import bitsandbytes

print(f"transformers: {transformers.__version__}")
print(f"peft: {peft.__version__}")
print(f"bitsandbytes: {bitsandbytes.__version__}")
print("\n‚úÖ Dependencies installed")

---
## 6. Download Training Data

In [None]:
import os

# Create directories
os.makedirs('/content/data', exist_ok=True)
os.makedirs('/content/models', exist_ok=True)
os.makedirs('/content/logs', exist_ok=True)
os.makedirs('/content/tensorboard', exist_ok=True)

# Download data from GCS
print("Downloading training data...")
!gsutil -m cp gs://{BUCKET_NAME}/data/train_postgres.jsonl /content/data/
!gsutil -m cp gs://{BUCKET_NAME}/data/val_postgres.jsonl /content/data/

# Verify
!echo "\nDataset sizes:"
!wc -l /content/data/*.jsonl

print("\n‚úÖ Data downloaded")

---
## 7. Create Full Training Config

In [None]:
import yaml

# --- Build the configuration section by section -----------------------------

# Base model and maximum sequence length.
model_settings = {
    'name': 'mistralai/Mistral-7B-v0.1',
    'max_seq_length': 2048,
}

# LoRA adapter settings; the adapter targets seven projection modules.
lora_settings = {
    'r': 16,
    'lora_alpha': 32,
    'lora_dropout': 0.05,
    'target_modules': [
        'q_proj',
        'k_proj',
        'v_proj',
        'o_proj',
        'gate_proj',
        'up_proj',
        'down_proj',
    ],
    'bias': 'none',
    'task_type': 'CAUSAL_LM',
}

# 4-bit quantization settings.
quantization_settings = {
    'load_in_4bit': True,
    'bnb_4bit_compute_dtype': 'bfloat16',
    'bnb_4bit_use_double_quant': True,
    'bnb_4bit_quant_type': 'nf4',
}

# Trainer hyperparameters; checkpoints and evaluation both run every 500 steps.
training_settings = {
    'output_dir': OUTPUT_DIR,
    'num_train_epochs': 3,
    'per_device_train_batch_size': 4,
    'per_device_eval_batch_size': 4,
    'gradient_accumulation_steps': 4,
    'gradient_checkpointing': True,
    'optim': 'paged_adamw_32bit',
    'learning_rate': 2e-4,
    'weight_decay': 0.001,
    'warmup_ratio': 0.03,
    'lr_scheduler_type': 'cosine',
    'max_grad_norm': 0.3,
    'fp16': False,
    'bf16': True,
    'logging_steps': 10,
    'save_strategy': 'steps',
    'save_steps': 500,
    'eval_strategy': 'steps',
    'eval_steps': 500,
    'save_total_limit': 3,
    'load_best_model_at_end': True,
    'metric_for_best_model': 'eval_loss',
    'greater_is_better': False,
    'report_to': ['tensorboard'],
    'remove_unused_columns': False,
}

# Dataset locations; None means no cap on the number of samples (use ALL data).
data_settings = {
    'train_file': '/content/data/train_postgres.jsonl',
    'val_file': '/content/data/val_postgres.jsonl',
    'max_samples_train': None,
    'max_samples_val': None,
}

logging_settings = {
    'log_dir': '/content/logs',
    'tensorboard_dir': '/content/tensorboard',
}

gcs_settings = {
    'bucket': BUCKET_NAME,
    'sync_checkpoints': True,
    'output_prefix': 'models/mistral-sql-final',
}

# Full production training configuration
full_config = {
    'model': model_settings,
    'lora': lora_settings,
    'quantization': quantization_settings,
    'training': training_settings,
    'data': data_settings,
    'logging': logging_settings,
    'gcs': gcs_settings,
}

# --- Persist the configuration as block-style YAML ---------------------------
config_path = '/content/full_training_config.yaml'
with open(config_path, 'w') as f:
    f.write(yaml.dump(full_config, default_flow_style=False))

# --- Human-readable summary --------------------------------------------------
micro_batch = training_settings['per_device_train_batch_size']
accum_steps = training_settings['gradient_accumulation_steps']
summary_lines = [
    "="*60,
    "FULL TRAINING CONFIGURATION",
    "="*60,
    f"Model: {model_settings['name']}",
    f"LoRA rank: {lora_settings['r']}",
    f"LoRA modules: {len(lora_settings['target_modules'])} (all projections)",
    f"Epochs: {training_settings['num_train_epochs']}",
    f"Batch size: {micro_batch}",
    f"Gradient accumulation: {accum_steps}",
    f"Effective batch size: {micro_batch * accum_steps}",
    f"Learning rate: {training_settings['learning_rate']}",
    f"Checkpoint every: {training_settings['save_steps']} steps",
    f"\nConfig saved: {config_path}",
    "="*60,
]
for line in summary_lines:
    print(line)

---
## 8. Pre-flight Checks

In [None]:
import os
import shutil
import subprocess

import torch


def _count_lines(path):
    """Return the number of lines in *path*, closing the file afterwards."""
    with open(path) as handle:
        return sum(1 for _ in handle)


print("="*60)
print("PRE-FLIGHT CHECKS")
print("="*60)

# Flipped to False by any hard failure below; warnings do not change it.
checks_passed = True

# Check 1: GPU
gpu_available = torch.cuda.is_available()
if gpu_available:
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"‚úÖ GPU available: {torch.cuda.get_device_name(0)} ({gpu_mem:.0f}GB)")
else:
    print("‚ùå No GPU available")
    checks_passed = False

# Check 2: Training data (one JSONL line is counted as one example)
train_file = '/content/data/train_postgres.jsonl'
val_file = '/content/data/val_postgres.jsonl'
if os.path.exists(train_file) and os.path.exists(val_file):
    train_lines = _count_lines(train_file)
    val_lines = _count_lines(val_file)
    print(f"‚úÖ Training data: {train_lines} train, {val_lines} val examples")
else:
    print("‚ùå Training data not found")
    checks_passed = False

# Check 3: Config file
if os.path.exists('/content/full_training_config.yaml'):
    print("‚úÖ Config file ready")
else:
    print("‚ùå Config file not found")
    checks_passed = False

# Check 4: Disk space (warning only; does not fail the pre-flight)
total, used, free = shutil.disk_usage('/content')
free_gb = free / (1024**3)
if free_gb > 50:
    print(f"‚úÖ Disk space: {free_gb:.0f}GB free")
else:
    print(f"‚ö†Ô∏è Low disk space: {free_gb:.0f}GB free")

# Check 5: Memory. Only meaningful with a GPU; skipping it otherwise avoids
# printing a success line when Check 1 has already failed.
if gpu_available:
    torch.cuda.empty_cache()
    allocated = torch.cuda.memory_allocated(0) / 1e9
    print(f"‚úÖ GPU memory clear: {allocated:.2f}GB allocated")

# Check 6: GCS access
result = subprocess.run(['gsutil', 'ls', f'gs://{BUCKET_NAME}/'], capture_output=True, text=True)
if result.returncode == 0:
    print("‚úÖ GCS bucket accessible")
else:
    print("‚ùå GCS bucket not accessible")
    checks_passed = False

print("="*60)
if checks_passed:
    print("üöÄ ALL CHECKS PASSED - READY FOR TRAINING")
else:
    print("‚ùå SOME CHECKS FAILED - FIX BEFORE TRAINING")
print("="*60)

---
## 9. Start TensorBoard (Optional)

Run this cell to monitor training in real-time. You can also run it in a separate tab.

In [None]:
# Load the TensorBoard notebook extension, then open TensorBoard on
# /content/tensorboard (the `tensorboard_dir` written into the training config).
%load_ext tensorboard
%tensorboard --logdir /content/tensorboard

---
## 10. 🚀 START FULL TRAINING

**This will take 8-12 hours on A100.**

- Checkpoints saved every 500 steps to GCS
- Training can be resumed if disconnected
- Monitor progress in TensorBoard above

In [None]:
from datetime import datetime

print("="*60)
print("üöÄ STARTING FULL PRODUCTION TRAINING")
print("="*60)
print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Estimated completion: ~8-12 hours")
print(f"Output: {GCS_OUTPUT}")
print("="*60)
print("")

!python -m training.train --config /content/full_training_config.yaml

print("")
print("="*60)
print(f"Training completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*60)

---
## 11. Sync Final Model to GCS

In [None]:
print("Syncing final model to GCS...")
!gsutil -m rsync -r {OUTPUT_DIR} {GCS_OUTPUT}/

print("\n" + "="*60)
print("‚úÖ MODEL SAVED")
print("="*60)
print(f"Location: {GCS_OUTPUT}/")
print("\nFiles:")
!gsutil ls -l {GCS_OUTPUT}/ | tail -10
print("="*60)

---
## 12. Test Trained Model

In [None]:
# torch is imported here so this cell also works after a kernel restart,
# without relying on an earlier cell having imported it.
import torch
import yaml

from training.model_utils import load_model_and_tokenizer

# Load config
with open('/content/full_training_config.yaml') as f:
    config = yaml.safe_load(f)

# Update to load from trained checkpoint
config['model']['name'] = OUTPUT_DIR

print("Loading trained model...")
model, tokenizer = load_model_and_tokenizer(config)

# Test inference
test_prompt = """You are a PostgreSQL expert. Generate SQL for the following:

Database: ecommerce

Schema:
CREATE TABLE customers (customer_id SERIAL PRIMARY KEY, name VARCHAR(100), email VARCHAR(100));
CREATE TABLE orders (order_id SERIAL PRIMARY KEY, customer_id INTEGER REFERENCES customers(customer_id), total DECIMAL(10,2), created_at TIMESTAMP);

Question: Find the top 5 customers by total order value"""

inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
pad_id = tokenizer.pad_token_id
if pad_id is None:
    pad_id = tokenizer.eos_token_id

# Inference only: no gradients needed. Low temperature keeps sampling
# close to greedy decoding.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.1,
        do_sample=True,
        pad_token_id=pad_id,
    )

# The decoded text is the full output sequence, so it starts with the prompt.
generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("="*60)
print("MODEL INFERENCE TEST")
print("="*60)
print(generated)
print("="*60)

---
## 13. Training Summary

In [None]:
from datetime import datetime
import torch

print("="*60)
print("TRAINING COMPLETE - SUMMARY")
print("="*60)
print(f"Completion time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("")
print(f"Model location: {GCS_OUTPUT}/")
print(f"TensorBoard logs: gs://{BUCKET_NAME}/logs/")
print("")
print("Files saved:")
print("  - adapter_model.safetensors (LoRA weights)")
print("  - adapter_config.json")
print("  - tokenizer files")
print("")
# NOTE(review): these peak counters are read from this notebook's own process.
# Training is launched with `!python -m training.train`, i.e. as a separate
# process, so the figures presumably exclude training itself - confirm.
print("GPU Memory used:")
print(f"  Max allocated: {torch.cuda.max_memory_allocated(0) / 1e9:.2f} GB")
print(f"  Max reserved: {torch.cuda.max_memory_reserved(0) / 1e9:.2f} GB")
print("="*60)
print("")
print("üéâ Training complete! Model ready for evaluation.")
print("")
print("Next steps:")
# f-string so the real bucket path is shown instead of a literal "{GCS_OUTPUT}".
print(f"1. Download model: gsutil -m cp -r {GCS_OUTPUT}/ ./model/")
print("2. Run evaluation notebook")
print("3. Deploy to inference endpoint")