# SQL Codegen SLM - Training Notebook

Fine-tune Mistral-7B for PostgreSQL query generation.

**Data:** `gs://sql-codegen-slm-data/data/`

## 1. Check GPU

In [None]:
!nvidia-smi

import torch
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')
else:
    print('No GPU!')

# Configuration - set these in Colab secrets or environment variables
import os

# Resolution order for the project id:
#   1. Colab secret GCP_PROJECT_ID
#   2. GCP_PROJECT_ID environment variable
#   3. placeholder value
# The google.colab import sits inside the try so that a missing package is
# handled the same way as a missing or inaccessible secret.
try:
    from google.colab import userdata
    PROJECT_ID = userdata.get('GCP_PROJECT_ID')
except Exception as exc:  # not a bare except: Ctrl-C / SystemExit must still propagate
    print(f"Colab secret GCP_PROJECT_ID unavailable ({type(exc).__name__}); "
          "falling back to environment/default")
    PROJECT_ID = None

# Also covers a secret that exists but is empty (os.environ values must be str).
if not PROJECT_ID:
    PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'your-gcp-project-id')

BUCKET_NAME = os.environ.get('GCS_BUCKET', 'sql-codegen-slm-data')
REPO_URL = os.environ.get('REPO_URL', 'https://github.com/rajeshmr/sql-codegen-slm.git')

# Export the resolved values so subprocesses started from later cells
# inherit them through the environment.
os.environ['GCP_PROJECT_ID'] = PROJECT_ID
os.environ['GCS_BUCKET'] = BUCKET_NAME

print(f"Project ID: {PROJECT_ID}")
print(f"Bucket: {BUCKET_NAME}")
print(f"Repo: {REPO_URL}")

In [None]:
# Authenticate GCS (required for bucket access)
from google.colab import auth
auth.authenticate_user()
!gcloud config set project {PROJECT_ID}
print(f'âœ… Authenticated: {PROJECT_ID}')

# Clone public repo (no authentication needed)
import os

if not os.path.exists('sql-codegen-slm'):
    !git clone {REPO_URL}
else:
    print("Repository already exists")
    
%cd sql-codegen-slm
!git pull

## 4. Install Dependencies

In [None]:
!pip install -q -r training/requirements.txt
print('Dependencies installed')

## 5. Download Data from GCS

In [None]:
# Create the local directories for data, models, logs and TensorBoard files
!mkdir -p /content/data /content/models /content/logs /content/tensorboard
# Copy every .jsonl file under the bucket's data/ prefix into /content/data
!gsutil -m cp gs://{BUCKET_NAME}/data/*.jsonl /content/data/
# Per-file line counts as a quick check that the download produced data
!wc -l /content/data/*.jsonl

## 6. Verify Environment

In [None]:
# Helpers come from the repo's training package.
# NOTE(review): presumably requires the working directory to be the cloned
# repository root so that `training` is importable — confirm.
from training.colab_setup import check_gpu, estimate_training_time
# Behavior of both helpers is defined in training/colab_setup.py (not shown here).
check_gpu()
# As the cell's last expression, a non-None return value is shown as cell output.
estimate_training_time()

## 7. Start Training

Expected runtime: roughly 8–12 hours on an A100 GPU.

In [None]:
# Run the training.train module as a script, passing the Mistral LoRA YAML config
!python -m training.train --config training/configs/mistral_lora_config.yaml

## 8. Sync to GCS

In [None]:
!gsutil -m rsync -r /content/models gs://{BUCKET_NAME}/models/
print(f'Synced to gs://{BUCKET_NAME}/models/')

## 9. TensorBoard

In [None]:
# Load the TensorBoard notebook extension, then open it on /content/tensorboard
%load_ext tensorboard
%tensorboard --logdir /content/tensorboard