# 🚀 RLAIF Code Generation on SageMaker

Train a code generation model using Reinforcement Learning from AI Feedback (RLAIF)!

## 1. Setup Environment

In [None]:
# Install required packages
!pip install sagemaker boto3 transformers trl datasets accelerate torch --upgrade

In [None]:
import sagemaker
from sagemaker.pytorch import PyTorch
import boto3
from datetime import datetime

# Initialize SageMaker session
# The names defined here are shared notebook state: `role` is passed to the
# estimator and `bucket` is used to build the S3 output path in later cells.
sagemaker_session = sagemaker.Session()
# NOTE(review): presumably this only resolves when the notebook runs with a
# SageMaker execution role attached — confirm before running it locally.
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

# Echo the resolved values so a misconfigured environment is visible up front.
print(f"Role: {role}")
print(f"Bucket: {bucket}")

## 2. Quick Local Test

In [None]:
# Test the reward model locally
from reward_model import AIRewardModel

# Smoke-test the reward model locally: score two candidate solutions against
# the same task. The meaning of the three shared arguments (task description,
# test call, expected output) is inferred from their values — confirm against
# reward_model.py.
print("Testing AI Reward Model...")
reward_model = AIRewardModel("Salesforce/codet5-small")

task_description = "Write a function to add two numbers"
test_call = "add_numbers(3, 5)"
expected_output = "8"

# Candidate that defines the requested function.
good_code = "def add_numbers(a, b):\n    return a + b"
reward = reward_model.evaluate_code_solution(
    task_description, good_code, test_call, expected_output
)
print(f"Good code reward: {reward:.3f}")

# Candidate that never defines the requested function.
bad_code = "print('hello')"
bad_reward = reward_model.evaluate_code_solution(
    task_description, bad_code, test_call, expected_output
)
print(f"Bad code reward: {bad_reward:.3f}")

## 3. Launch SageMaker Training

In [None]:
# Training configuration shared by the cells below.
MODEL_NAME = "Salesforce/codegen-350M-mono"  # Start small!
INSTANCE_TYPE = "ml.g4dn.xlarge"  # GPU instance
EPISODES = 20

# Timestamped name, so every run of this cell gets its own S3 output prefix.
launch_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
job_name = f"rlaif-code-{launch_stamp}"
output_path = f"s3://{bucket}/rlaif-training/{job_name}"

# Echo the configuration before anything is launched.
for summary_line in (
    f"Model: {MODEL_NAME}",
    f"Instance: {INSTANCE_TYPE}",
    f"Episodes: {EPISODES}",
    f"Output: {output_path}",
):
    print(summary_line)

In [None]:
# Build the dictionary arguments first so the constructor call stays flat.
# NOTE(review): how train_sagemaker.py parses these values (in particular the
# boolean "use_lora") is not visible from this notebook — confirm there.
training_hyperparameters = {
    "model_name": MODEL_NAME,
    "episodes": EPISODES,
    "use_lora": True,
}
container_environment = {
    "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
}

# Create the PyTorch estimator: a single INSTANCE_TYPE instance running
# train_sagemaker.py from the current directory, writing to output_path.
estimator = PyTorch(
    entry_point="train_sagemaker.py",
    source_dir=".",  # Directory with your Python files
    role=role,
    instance_type=INSTANCE_TYPE,
    instance_count=1,
    framework_version="2.0",
    py_version="py310",
    hyperparameters=training_hyperparameters,
    output_path=output_path,
    base_job_name="rlaif-code",
    environment=container_environment,
)

print("Estimator created!")

In [None]:
# Submit the training job without blocking the notebook
# (expect roughly 20-30 minutes for the 350M model).
print("🚀 Starting training job...")
estimator.fit(wait=False)

print("\n✅ Training job submitted!")
submitted_job_name = estimator.latest_training_job.name
print(f"Job name: {submitted_job_name}")
print("\nMonitor progress in SageMaker console or run next cell")

## 4. Monitor Training

In [None]:
# Stream training logs (run this to see progress)
# NOTE(review): presumably this follows the job submitted by the previous cell
# and blocks until that job ends — confirm against the SageMaker SDK before
# relying on it as a "wait for completion" step.
estimator.logs()

## 5. Download Trained Model

In [None]:
# Get model location (after training completes)
# NOTE(review): the job was submitted with wait=False, so make sure it has
# finished before running this cell — presumably the artifact is not in S3
# until then.
model_data = estimator.model_data
print(f"Model location: {model_data}")

# Download model
# IPython substitutes {model_data} into the shell command. The archive is
# unpacked into the current working directory, which is the "./" path the
# test cell loads the model from.
!aws s3 cp {model_data} ./trained_model.tar.gz
!tar -xzf trained_model.tar.gz
# NOTE(review): this message is printed even if a shell command above failed,
# since `!` escapes presumably do not abort the cell — check their output.
print("Model downloaded!")

## 6. Test Trained Model

In [None]:
# Load and test the trained model
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model
# "./" is the current directory, where the download cell unpacks the archive.
# `tokenizer` and `model` are module-level names read by generate_code below.
# NOTE(review): assumes the archive holds tokenizer and model files at its top
# level in a layout from_pretrained can read; with "use_lora": True the job
# may have saved adapter weights only — confirm against train_sagemaker.py.
model_path = "./"  # Path where model was extracted
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Test generation
def generate_code(prompt, max_new_tokens=100, temperature=0.7):
    """Sample a completion for *prompt* from the module-level model.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt: Text to condition generation on.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        The decoded continuation only; the prompt tokens are stripped and
        special tokens are skipped.
    """
    # Tokenize via __call__ so an attention mask is produced alongside the
    # ids, and move both tensors to wherever the model lives.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = encoded["input_ids"]
    prompt_length = input_ids.shape[1]

    # Fall back to EOS for padding when the tokenizer defines no pad token;
    # passing it explicitly avoids generate() guessing it with a warning.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=pad_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

# Prompts used to spot-check the trained model.
test_prompts = [
    "Write a Python function that adds two numbers.",
    "Write a Python function that checks if a number is even.",
    "Write a Python function that reverses a string.",
]

# Print each prompt, then its sampled completion, then a separator line.
separator = "-" * 50
for test_prompt in test_prompts:
    print(f"\nPrompt: {test_prompt}")
    code = generate_code(test_prompt)
    print(f"Generated:\n{code}")
    print(separator)

## 📊 Expected Results

After 20-30 episodes with `codegen-350M-mono`:

**Before Training:**
- Random text or incomplete code
- Average reward: ~0.2-0.3

**After Training:**
- Valid Python functions
- Correct implementations for simple problems
- Average reward: ~0.7-0.9

## 💰 Cost Estimate

- **ml.g4dn.xlarge**: $0.736/hour (on-demand price at the time of writing; varies by region)
- **20 episodes**: ~30 minutes
- **Total cost**: ~$0.40

## 🚀 Next Steps

1. Try more episodes (50-100) for better results
2. Use larger models (CodeLlama-7B) with bigger instances
3. Expand the dataset with more complex problems
4. Fine-tune the reward model for specific code styles