# 🤖 The Autonomous Colony - GPU Training on Google Colab

Train RL agents with GPU acceleration.

**Setup:**
1. Runtime → Change runtime type → GPU (T4)
2. Run all cells in order
3. Models saved to Google Drive
4. Download for local visualization

## 1️⃣ Setup

In [None]:
# Check GPU
!nvidia-smi

import torch
print(f"\nPyTorch: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Mount Google Drive; models are written to a folder inside it
from google.colab import drive
drive.mount('/content/drive')

import os

# Folder that later cells use for checkpoints and the final model
MODEL_DIR = '/content/drive/MyDrive/autonomous_colony_models'
os.makedirs(MODEL_DIR, exist_ok=True)  # no error if the folder already exists
print(f"✓ Models → {MODEL_DIR}")

In [None]:
# Clone repo
!git clone https://github.com/ritikkumarv/autonomous-colony.git
%cd autonomous-colony

In [None]:
# Install dependencies
!pip install -q torch numpy matplotlib seaborn
print("‚úì Dependencies installed")

## 2️⃣ Configuration

In [None]:
# Training settings; the cells below look these up by key
CONFIG = dict(
    n_agents=2,           # Start with 2 agents
    grid_size=20,         # 20x20 grid
    n_episodes=500,       # 500 episodes (~1-2 hours on T4)
    max_steps=200,        # Steps per episode
    save_interval=100,    # Save every 100 episodes
    agent_type='ppo',     # 'ppo' or 'dqn'
)

# Echo every setting, one indented "key: value" line each
print(*(f"  {name}: {value}" for name, value in CONFIG.items()), sep="\n")

## 3️⃣ Training

In [None]:
import sys
sys.path.insert(0, '/content/autonomous-colony')  # lets `src.*` be imported from the cloned repo

import torch
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from IPython.display import clear_output

from src.environment import ColonyEnvironment
from src.agents import PPOAgent, DQNAgent

# Fail fast on a mistyped agent type; otherwise any unknown value would
# silently be trained (and saved) as DQN by the else-branches below.
if CONFIG['agent_type'] not in ('ppo', 'dqn'):
    raise ValueError(
        f"Unknown agent_type {CONFIG['agent_type']!r}; expected 'ppo' or 'dqn'"
    )

# Create environment
env = ColonyEnvironment(
    n_agents=CONFIG['n_agents'],
    grid_size=CONFIG['grid_size'],
)

# Constructor arguments that are identical for both agent types
_shared_agent_kwargs = {
    'grid_shape': (7, 7, 5),
    'state_dim': 5,
    'action_dim': 9,
    'batch_size': 64,
}

# Create agent
if CONFIG['agent_type'] == 'ppo':
    agent = PPOAgent(learning_rate=3e-4, n_epochs=10, **_shared_agent_kwargs)
else:  # dqn
    agent = DQNAgent(learning_rate=1e-3, **_shared_agent_kwargs)

print(f"\n✓ {CONFIG['agent_type'].upper()} Agent ready")
print(f"✓ Device: {agent.device}")
print("\n" + "="*80)
print("TRAINING STARTED")
print("="*80)

In [None]:
# Training loop
from IPython.display import display  # explicit import; used to render the live figure


def _plot_progress(fig, axes, episode_rewards, episode_lengths, success_rates, window=50):
    """Redraw the 2x2 training dashboard and render it in the cell output.

    Args:
        fig: Figure that owns ``axes``.
        axes: 2x2 array of Axes, as returned by ``plt.subplots(2, 2)``.
        episode_rewards: Total reward of each finished episode.
        episode_lengths: Number of steps of each finished episode.
        success_rates: 1/0 flag for each finished episode.
        window: Width of the moving averages, in episodes.
    """
    kernel = np.ones(window) / window

    # Rewards: raw curve, plus a moving average once `window` episodes exist
    ax = axes[0, 0]
    ax.clear()
    ax.plot(episode_rewards, alpha=0.3, color='blue')
    if len(episode_rewards) >= window:
        ma = np.convolve(episode_rewards, kernel, mode='valid')
        ax.plot(range(window - 1, len(episode_rewards)), ma, 'r-', linewidth=2)
    ax.set_title('Episode Rewards')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Reward')
    ax.grid(alpha=0.3)

    # Lengths
    ax = axes[0, 1]
    ax.clear()
    ax.plot(episode_lengths, alpha=0.6, color='green')
    ax.set_title('Episode Lengths')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Steps')
    ax.grid(alpha=0.3)

    # Success rate (moving average only)
    ax = axes[1, 0]
    ax.clear()
    if len(success_rates) >= window:
        sr = np.convolve(success_rates, kernel, mode='valid')
        ax.plot(range(window - 1, len(success_rates)), sr, 'purple', linewidth=2)
    ax.set_title(f'Success Rate ({window}-ep avg)')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Success Rate')
    ax.set_ylim([0, 1])
    ax.grid(alpha=0.3)

    # Reward distribution over the most recent (up to 100) episodes
    ax = axes[1, 1]
    ax.clear()
    ax.hist(episode_rewards[-100:], bins=20, alpha=0.7, color='blue')
    ax.set_title('Recent Reward Distribution')
    ax.set_xlabel('Reward')
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)

    # Operate on `fig` directly: the pyplot-level calls (plt.tight_layout /
    # plt.show) act on pyplot's *current* figure, which is no longer `fig`
    # once the inline backend has closed it after the first show.
    fig.tight_layout()
    display(fig)


episode_rewards = []
episode_lengths = []
success_rates = []

use_ppo = CONFIG['agent_type'] == 'ppo'

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# Detach the figure from pyplot right away. It stays fully drawable through
# `display(fig)`, and it is not rendered a second time automatically when the
# cell finishes.
plt.close(fig)

for episode in range(CONFIG['n_episodes']):
    observations = env.reset()
    done = False
    step = 0
    episode_reward = 0.0

    # Episode rollout
    while not done and step < CONFIG['max_steps']:
        actions = []

        # Select one action per agent observation
        for obs in observations:
            if use_ppo:
                action, log_prob, value = agent.select_action(obs, training=True)

                # Store in rollout buffer; reward/next state are filled in after env.step
                agent.rollout_buffer.append({
                    'observation': obs,
                    'action': action,
                    'log_prob': log_prob,
                    'value': value
                })
            else:  # DQN
                action = agent.select_action(obs, training=True)

            actions.append(action)

        # Environment step
        next_observations, rewards, dones, truncated, info = env.step(actions)

        # Store rewards and next states
        if use_ppo:
            # The last len(observations) buffer entries were appended above, one
            # per agent and in agent order; complete them with this step's outcome.
            n_pending = len(observations)
            for i, (reward, next_obs, done_flag) in enumerate(
                zip(rewards, next_observations, dones)
            ):
                entry = agent.rollout_buffer[i - n_pending]
                entry['reward'] = reward
                entry['next_observation'] = next_obs
                entry['done'] = done_flag
        else:  # DQN
            for obs, action, reward, next_obs, done_flag in zip(
                observations, actions, rewards, next_observations, dones
            ):
                agent.memory.push(obs, action, reward, next_obs, done_flag)

        # Coerce to a plain Python float so the logged metrics (and the
        # checkpoints that embed them) do not carry array/tensor scalar types.
        # NOTE(review): assumes each reward is a numeric scalar - confirm against the env.
        episode_reward += float(sum(rewards))
        observations = next_observations
        done = truncated or all(dones)
        step += 1

    # Update agent once enough samples have been collected
    if use_ppo:
        if len(agent.rollout_buffer) >= agent.batch_size:
            agent.update()
    else:  # DQN
        if len(agent.memory) >= agent.batch_size:
            agent.update()

    # Track metrics; an episode counts as a success when its total reward is positive
    episode_rewards.append(episode_reward)
    episode_lengths.append(step)
    success_rates.append(1 if episode_reward > 0 else 0)

    # Live plotting every 10 episodes
    if (episode + 1) % 10 == 0:
        clear_output(wait=True)
        _plot_progress(fig, axes, episode_rewards, episode_lengths, success_rates)

        print(f"Episode {episode + 1}/{CONFIG['n_episodes']}")
        print(f"  Reward: {episode_reward:.2f}")
        print(f"  Steps: {step}")
        print(f"  Avg Reward (last 100): {np.mean(episode_rewards[-100:]):.2f}")
        print(f"  Success Rate (last 100): {np.mean(success_rates[-100:]):.1%}")

    # Save checkpoint
    if (episode + 1) % CONFIG['save_interval'] == 0:
        ts = datetime.now().strftime('%Y%m%d_%H%M%S')
        path = f"{MODEL_DIR}/{CONFIG['agent_type']}_ep{episode+1}_{ts}.pt"

        checkpoint = {
            'episode': episode + 1,
            'config': CONFIG,
            'episode_rewards': episode_rewards,
            'episode_lengths': episode_lengths,
            'success_rates': success_rates
        }

        if use_ppo:
            checkpoint['network_state_dict'] = agent.network.state_dict()
            checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()
        else:  # DQN
            checkpoint['q_network_state_dict'] = agent.q_network.state_dict()
            checkpoint['target_network_state_dict'] = agent.target_network.state_dict()
            checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()

        torch.save(checkpoint, path)
        print(f"\n💾 Checkpoint: {path}\n")

print("\n" + "="*80)
print("✅ TRAINING COMPLETE")
print("="*80)
print(f"Avg Reward (last 100): {np.mean(episode_rewards[-100:]):.2f}")
print(f"Success Rate (last 100): {np.mean(success_rates[-100:]):.1%}")
print(f"Best Reward: {max(episode_rewards):.2f}")

## 4️⃣ Save Final Model

In [None]:
# Save final model
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
final_path = f"{MODEL_DIR}/{CONFIG['agent_type']}_final_{ts}.pt"

checkpoint = {
    # Episodes actually completed; equals CONFIG['n_episodes'] after a full
    # run but stays truthful if training was interrupted early.
    'episode': len(episode_rewards),
    'config': CONFIG,
    # Plain Python floats keep the checkpoint free of NumPy scalar objects,
    # which torch.load(..., weights_only=True) rejects unless allow-listed.
    'episode_rewards': [float(r) for r in episode_rewards],
    'episode_lengths': episode_lengths,
    'success_rates': success_rates,
    'final_stats': {
        'avg_reward': float(np.mean(episode_rewards[-100:])),
        'success_rate': float(np.mean(success_rates[-100:])),
        'best_reward': float(max(episode_rewards))
    }
}

# The two agent types expose different networks, hence different keys
if CONFIG['agent_type'] == 'ppo':
    checkpoint['network_state_dict'] = agent.network.state_dict()
    checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()
else:  # DQN
    checkpoint['q_network_state_dict'] = agent.q_network.state_dict()
    checkpoint['target_network_state_dict'] = agent.target_network.state_dict()
    checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()

torch.save(checkpoint, final_path)

print(f"✅ Final model: {final_path}")
print(f"\nTo visualize locally:")
print(f"1. Download from Google Drive: {MODEL_DIR}")
print(f"2. Run: python visualize.py --model models/{CONFIG['agent_type']}_final_{ts}.pt")

## 5️⃣ Download Models

In [None]:
# List saved models
import os

# Every *.pt file in MODEL_DIR, in name order
models = sorted(f for f in os.listdir(MODEL_DIR) if f.endswith('.pt'))

print(f"Saved Models ({len(models)}):")
print("="*60)
for i, model in enumerate(models, 1):
    path = os.path.join(MODEL_DIR, model)
    size = os.path.getsize(path) / (1024 * 1024)  # bytes -> MiB
    print(f"{i}. {model} ({size:.2f} MB)")

print(f"\n📥 Download from: {MODEL_DIR}")

In [None]:
# Zip and download all models
import shutil
from datetime import datetime  # imported here so this cell does not depend on the training cell

from google.colab import files

zip_name = f"colony_models_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
zip_path = f"/content/{zip_name}"

# make_archive appends the ".zip" suffix and returns the path of the file it wrote
archive_path = shutil.make_archive(zip_path, 'zip', MODEL_DIR)
print(f"✅ Zipped: {archive_path}")

files.download(archive_path)

## 6️⃣ Test Agent

In [None]:
# Quick test: run 5 episodes with training=False and report total rewards
test_rewards = []

for ep in range(5):
    observations = env.reset()
    done = False
    step = 0
    ep_reward = 0

    # Use the same per-episode step cap as training instead of a separate
    # hard-coded 200, so the two cannot drift apart when CONFIG is edited.
    while not done and step < CONFIG['max_steps']:
        actions = []
        for obs in observations:
            if CONFIG['agent_type'] == 'ppo':
                # The PPO agent returns (action, log_prob, value); only the action is needed here
                action, _, _ = agent.select_action(obs, training=False)
            else:
                action = agent.select_action(obs, training=False)
            actions.append(action)

        next_observations, rewards, dones, truncated, _ = env.step(actions)
        ep_reward += sum(rewards)
        observations = next_observations
        done = truncated or all(dones)
        step += 1

    test_rewards.append(ep_reward)
    print(f"Test {ep + 1}: Reward={ep_reward:.2f}, Steps={step}")

print(f"\nTest Avg: {np.mean(test_rewards):.2f} ± {np.std(test_rewards):.2f}")