# 🤖 The Autonomous Colony - GPU Training

Train RL agents with GPU acceleration on Google Colab.

**Quick Start:**
1. Runtime → Change runtime type → **GPU (T4)**
2. Run all cells in order
3. Models auto-save to Google Drive
4. Download and visualize locally

## Setup

In [None]:
# Check GPU availability
!nvidia-smi

import torch
print(f"\nPyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Mount Google Drive for model persistence
from google.colab import drive
drive.mount('/content/drive')

import os

# Every checkpoint written by this notebook is stored under this Drive folder
MODEL_DIR = '/content/drive/MyDrive/autonomous_colony_models'
os.makedirs(MODEL_DIR, exist_ok=True)  # exist_ok keeps the cell re-runnable
print(f"✓ Models will be saved to: {MODEL_DIR}")

In [None]:
# Clone repository
!git clone https://github.com/ritikkumarv/autonomous-colony.git
%cd autonomous-colony

In [None]:
# Install dependencies
!pip install -q numpy matplotlib seaborn
print("âœ“ Dependencies installed")

## Training Configuration

In [None]:
# Run settings read by the environment, agent, training and test cells
CONFIG = dict(
    n_agents=2,
    grid_size=20,
    n_episodes=500,
    max_steps=200,
    save_interval=100,
    agent_type='ppo',  # 'ppo' or 'dqn'
)

# Echo the settings, one indented "key: value" line per entry
print("Training Configuration:")
print("\n".join(f"  {name}: {value}" for name, value in CONFIG.items()))

## Initialize Agent & Environment

In [None]:
import sys
sys.path.insert(0, '/content/autonomous-colony')

import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from IPython import display

from src.environment import ColonyEnvironment
from src.agents import PPOAgent, DQNAgent

# Create environment
env = ColonyEnvironment(
    n_agents=CONFIG['n_agents'],
    grid_size=CONFIG['grid_size']
)

# Constructor arguments that are identical for both agent types
shared_agent_kwargs = dict(
    grid_shape=(7, 7, 5),
    state_dim=5,
    action_dim=9,
    batch_size=64,
)

# Create agent. An unrecognised agent_type used to fall through to DQN
# silently (e.g. a typo such as 'PPO'); fail loudly instead so the run
# matches what the configuration says.
if CONFIG['agent_type'] == 'ppo':
    agent = PPOAgent(
        learning_rate=3e-4,
        n_epochs=10,
        **shared_agent_kwargs
    )
elif CONFIG['agent_type'] == 'dqn':
    agent = DQNAgent(
        learning_rate=1e-3,
        buffer_size=50000,
        **shared_agent_kwargs
    )
else:
    raise ValueError(
        f"Unknown agent_type {CONFIG['agent_type']!r}; expected 'ppo' or 'dqn'"
    )

print(f"\n{'='*80}")
print(f"✓ {CONFIG['agent_type'].upper()} Agent initialized")
print(f"✓ Device: {agent.device}")
print(f"✓ Environment: {CONFIG['grid_size']}x{CONFIG['grid_size']} grid, {CONFIG['n_agents']} agents")
print(f"{'='*80}")

## Training Loop

In [None]:
# Training metrics, one entry per completed episode
episode_rewards = []
episode_lengths = []
success_rates = []

# Shared by every checkpoint filename written during this run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Evaluated once: every branch below distinguishes only PPO from "not PPO"
is_ppo = CONFIG['agent_type'] == 'ppo'


def _moving_average(series, window):
    """Return the `window`-wide moving average of `series` (full windows only)."""
    return np.convolve(series, np.ones(window) / window, mode='valid')


def _plot_progress(reward_history, length_history, success_history, window=50, recent=100):
    """Draw the 2x2 training dashboard for the episodes finished so far.

    Args:
        reward_history: Total reward of each finished episode.
        length_history: Number of environment steps of each finished episode.
        success_history: 1/0 flag per episode (1 when its total reward was > 0).
        window: Width of the moving average on the reward and success plots.
        recent: How many of the latest episodes feed the reward histogram.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    (reward_ax, length_ax), (success_ax, hist_ax) = axes

    # Episode rewards, with a smoothed curve once a full window is available
    reward_ax.plot(reward_history, alpha=0.3, color='blue', label='Episode Reward')
    if len(reward_history) >= window:
        reward_ax.plot(
            range(window - 1, len(reward_history)),
            _moving_average(reward_history, window),
            'r-', linewidth=2, label=f'MA({window})'
        )
    reward_ax.set_title('Episode Rewards')
    reward_ax.set_xlabel('Episode')
    reward_ax.set_ylabel('Total Reward')
    reward_ax.legend()
    reward_ax.grid(alpha=0.3)

    # Episode lengths
    length_ax.plot(length_history, alpha=0.6, color='green')
    length_ax.set_title('Episode Lengths')
    length_ax.set_xlabel('Episode')
    length_ax.set_ylabel('Steps')
    length_ax.grid(alpha=0.3)

    # Success rate (stays empty until a full window of episodes exists)
    if len(success_history) >= window:
        success_ax.plot(
            range(window - 1, len(success_history)),
            _moving_average(success_history, window),
            'purple', linewidth=2
        )
    success_ax.set_title(f'Success Rate ({window}-ep moving avg)')
    success_ax.set_xlabel('Episode')
    success_ax.set_ylabel('Success Rate')
    success_ax.set_ylim([0, 1])
    success_ax.grid(alpha=0.3)

    # Reward distribution; slicing already copes with fewer than `recent` items
    hist_ax.hist(reward_history[-recent:], bins=20, alpha=0.7, color='blue', edgecolor='black')
    hist_ax.set_title(f'Recent Reward Distribution (last {recent})')
    hist_ax.set_xlabel('Reward')
    hist_ax.set_ylabel('Frequency')
    hist_ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()
    # A new figure is built on every refresh; release it explicitly so figures
    # cannot pile up in memory over a long run.
    plt.close(fig)


print("\nStarting Training...\n")

for episode in range(CONFIG['n_episodes']):
    observations = env.reset()
    done = False
    step = 0
    episode_reward = 0

    # Episode loop
    while not done and step < CONFIG['max_steps']:
        actions = []
        log_probs = []
        values = []

        # Select actions for all agents
        for obs in observations:
            if is_ppo:
                action, log_prob, value = agent.select_action(obs, training=True)
                log_probs.append(log_prob)
                values.append(value)
            else:  # DQN
                action = agent.select_action(obs, training=True)

            actions.append(action)

        # Step environment
        next_observations, rewards, dones, truncated, info = env.step(actions)

        # Store transitions (one per agent)
        if is_ppo:
            for obs, action, reward, log_prob, value, done_flag in zip(
                observations, actions, rewards, log_probs, values, dones
            ):
                agent.store_transition(
                    state=obs,
                    action=action,
                    reward=reward,
                    log_prob=log_prob,
                    value=value,
                    done=done_flag
                )
        else:  # DQN
            for obs, action, reward, next_obs, done_flag in zip(
                observations, actions, rewards, next_observations, dones
            ):
                agent.memory.push(obs, action, reward, next_obs, done_flag)

        episode_reward += sum(rewards)
        observations = next_observations
        # The episode ends when the first truncation flag is set or when every
        # agent reports done.
        done = truncated[0] or all(dones)
        step += 1

    # Update agent once per episode, provided enough samples were collected
    if is_ppo:
        if len(agent.rollout_buffer) >= agent.batch_size:
            loss = agent.update()
    else:  # DQN
        if len(agent.memory) >= agent.batch_size:
            loss = agent.update()

    # Track metrics. float() turns a possible NumPy scalar (if the environment
    # returns NumPy rewards) into a plain Python float, so the checkpointed
    # history contains only built-in types.
    episode_rewards.append(float(episode_reward))
    episode_lengths.append(step)
    success_rates.append(1 if episode_reward > 0 else 0)

    # Progress logging and visualization
    if (episode + 1) % 10 == 0:
        display.clear_output(wait=True)
        _plot_progress(episode_rewards, episode_lengths, success_rates)

        # Print stats
        avg_reward = np.mean(episode_rewards[-100:])
        avg_length = np.mean(episode_lengths[-100:])
        success_rate = np.mean(success_rates[-100:])

        print(f"Episode {episode + 1}/{CONFIG['n_episodes']}")
        print(f"  Current Reward: {episode_reward:.2f}")
        print(f"  Avg Reward (last 100): {avg_reward:.2f}")
        print(f"  Avg Length (last 100): {avg_length:.1f}")
        print(f"  Success Rate (last 100): {success_rate:.1%}")

    # Save checkpoint
    if (episode + 1) % CONFIG['save_interval'] == 0:
        checkpoint_path = f"{MODEL_DIR}/{CONFIG['agent_type']}_ep{episode+1}_{timestamp}.pt"

        checkpoint = {
            'episode': episode + 1,
            'config': CONFIG,
            'episode_rewards': episode_rewards,
            'episode_lengths': episode_lengths,
            'success_rates': success_rates,
        }

        if is_ppo:
            checkpoint['network_state_dict'] = agent.network.state_dict()
            checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()
        else:  # DQN
            checkpoint['q_network_state_dict'] = agent.q_network.state_dict()
            checkpoint['target_network_state_dict'] = agent.target_network.state_dict()
            checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()

        torch.save(checkpoint, checkpoint_path)
        print(f"\n💾 Checkpoint saved: {checkpoint_path}\n")

print("\n" + "="*80)
print("✅ TRAINING COMPLETE!")
print("="*80)
print(f"Final Avg Reward (last 100): {np.mean(episode_rewards[-100:]):.2f}")
print(f"Final Success Rate (last 100): {np.mean(success_rates[-100:]):.1%}")
print(f"Best Episode Reward: {max(episode_rewards):.2f}")
print(f"Models saved to: {MODEL_DIR}")
print("="*80)

## Save Final Model

In [None]:
# Save final trained model
final_path = f"{MODEL_DIR}/{CONFIG['agent_type']}_final_{timestamp}.pt"

# np.mean returns NumPy scalar objects, and the per-episode totals may be
# NumPy scalars too if the environment returns NumPy rewards. The restricted
# unpickler behind torch.load(weights_only=True) does not accept those by
# default, so store plain Python floats to keep the file loadable.
checkpoint = {
    'episode': CONFIG['n_episodes'],
    'config': CONFIG,
    'episode_rewards': [float(r) for r in episode_rewards],
    'episode_lengths': episode_lengths,
    'success_rates': success_rates,
    'final_stats': {
        'avg_reward': float(np.mean(episode_rewards[-100:])),
        'success_rate': float(np.mean(success_rates[-100:])),
        'best_reward': float(max(episode_rewards)),
    }
}

# Attach the weights and optimizer state for the configured agent type
if CONFIG['agent_type'] == 'ppo':
    checkpoint['network_state_dict'] = agent.network.state_dict()
    checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()
else:  # DQN
    checkpoint['q_network_state_dict'] = agent.q_network.state_dict()
    checkpoint['target_network_state_dict'] = agent.target_network.state_dict()
    checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()

torch.save(checkpoint, final_path)

print(f"✅ Final model saved: {final_path}")
print("\nTo visualize locally:")
print(f"1. Download model from Google Drive: {MODEL_DIR}")
print("2. Place in local models/ directory")
print(f"3. Run: python visualize.py --model models/{CONFIG['agent_type']}_final_{timestamp}.pt --episodes 10")

## Download Models

In [None]:
# List all saved models (every *.pt file in the Drive folder, sorted by name)
models = sorted(f for f in os.listdir(MODEL_DIR) if f.endswith('.pt'))

print(f"Saved Models ({len(models)}):")
print("="*80)
for i, model in enumerate(models, 1):
    path = os.path.join(MODEL_DIR, model)
    size = os.path.getsize(path) / (1024 * 1024)  # bytes -> MB
    print(f"{i}. {model} ({size:.2f} MB)")

print(f"\n📥 Download from: {MODEL_DIR}")

In [None]:
# Create zip file for easy download
import shutil
from google.colab import files

zip_name = f"colony_models_{timestamp}"
zip_path = f"/content/{zip_name}"

# make_archive appends the ".zip" suffix itself and returns the path of the
# archive it wrote; reuse that instead of rebuilding the name by hand.
archive_path = shutil.make_archive(zip_path, 'zip', MODEL_DIR)
print(f"✅ Created: {archive_path}")
print("\nDownloading...")

files.download(archive_path)

## Test Trained Agent

In [None]:
# Quick test of trained agent
print("Testing trained agent...\n")

N_TEST_EPISODES = 5
is_ppo = CONFIG['agent_type'] == 'ppo'

test_rewards = []
test_lengths = []

for test_ep in range(N_TEST_EPISODES):
    observations = env.reset()
    done = False
    step = 0
    ep_reward = 0

    # Use the configured step cap (previously a hard-coded 200) so evaluation
    # episodes are bounded exactly like training episodes.
    while not done and step < CONFIG['max_steps']:
        actions = []

        for obs in observations:
            if is_ppo:
                # PPO returns (action, log_prob, value); only the action is needed
                action, _, _ = agent.select_action(obs, training=False)
            else:  # DQN
                action = agent.select_action(obs, training=False)
            actions.append(action)

        next_observations, rewards, dones, truncated, _ = env.step(actions)
        ep_reward += sum(rewards)
        observations = next_observations
        done = truncated[0] or all(dones)
        step += 1

    test_rewards.append(ep_reward)
    test_lengths.append(step)
    print(f"Test Episode {test_ep + 1}: Reward={ep_reward:.2f}, Steps={step}")

print("\nTest Performance:")
print(f"  Avg Reward: {np.mean(test_rewards):.2f} ± {np.std(test_rewards):.2f}")
print(f"  Avg Length: {np.mean(test_lengths):.1f} ± {np.std(test_lengths):.1f}")