# 🤖 The Autonomous Colony - GPU Training on Google Colab

This notebook trains RL agents with GPU/TPU acceleration on Google Colab.

**Features:**
- 🚀 GPU/TPU acceleration (T4, A100, V100)
- 📊 Multiple agents (PPO, DQN, MAPPO)
- 🧠 Advanced RL techniques (Curiosity, Hierarchical, World Models)
- 💾 Automatic model saving to Google Drive
- 📈 Real-time training visualization

**Setup Instructions:**
1. Runtime → Change runtime type → GPU (T4 recommended)
2. Run all cells in order
3. Models will be saved to your Google Drive
4. Download trained models for local visualization

## 1️⃣ Setup Environment

In [None]:
# Check GPU availability
!nvidia-smi

import torch
print(f"\n{'='*80}")
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
print(f"{'='*80}")

In [None]:
# Mount Google Drive for model persistence
import os

from google.colab import drive

drive.mount('/content/drive')

# Checkpoint folder inside the mounted Drive; created if it does not exist yet
MODEL_DIR = os.path.join('/content/drive', 'MyDrive', 'autonomous_colony_models')
os.makedirs(MODEL_DIR, exist_ok=True)
print(f"‚úì Models will be saved to: {MODEL_DIR}")

In [None]:
# Clone the repository
!git clone https://github.com/ritikkumarv/autonomous-colony.git
%cd autonomous-colony

In [None]:
# Install dependencies
# `-q` asks pip for quiet output. The packages listed are installed into the
# runtime via a shell command.
# NOTE: the confirmation below is printed unconditionally; it does not inspect
# pip's exit status.
!pip install -q torch torchvision numpy matplotlib seaborn
print("‚úì Dependencies installed")

## 2️⃣ Training Configuration

In [None]:
# Single source of truth for the run: environment size, training schedule,
# agent choice and optional feature switches read by the cells below.
TRAINING_CONFIG = dict(
    # --- Environment ---
    n_agents=4,
    grid_size=30,

    # --- Training schedule ---
    n_episodes=1000,  # More episodes with GPU
    max_steps=500,
    save_interval=100,

    # --- Agent selection: 'ppo', 'dqn' or 'mappo' ---
    agent_type='ppo',

    # --- Advanced features (optional) ---
    use_curiosity=True,
    curiosity_type='icm',  # 'icm' or 'rnd'
    use_hierarchical=False,
    use_world_model=False,
    use_curriculum=True,
)

# Echo the configuration, one indented "name: value" line per setting
print("Training Configuration:")
print("\n".join(f"  {name}: {setting}" for name, setting in TRAINING_CONFIG.items()))

## 3️⃣ Train Agent

In [None]:
import sys
sys.path.insert(0, '/content/autonomous-colony')

import torch
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from IPython.display import clear_output

from src.environment import ColonyEnvironment
from src.agents import PPOAgent, DQNAgent
from src.multiagent import MultiAgentPPO
from src.advanced import ICM, RND, CurriculumScheduler

# Setup device
# NOTE(review): `device` is only reported here; this cell never passes it to
# the agent or curiosity constructors. Confirm those classes select CUDA on
# their own, otherwise training runs on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}\n")

# Create environment
env = ColonyEnvironment(
    n_agents=TRAINING_CONFIG['n_agents'],
    grid_size=TRAINING_CONFIG['grid_size']
)

# Dimensions shared by every agent and curiosity module built below.
# Defined once so the constructors cannot drift apart.
GRID_SHAPE = (7, 7, 5)
STATE_DIM = 5
ACTION_DIM = 9

# Create agent based on configuration
agent_type = TRAINING_CONFIG['agent_type'].lower()

if agent_type == 'ppo':
    agent = PPOAgent(
        grid_shape=GRID_SHAPE,
        state_dim=STATE_DIM,
        action_dim=ACTION_DIM,
        learning_rate=3e-4,
        n_epochs=10,
        batch_size=128
    )
    print("✓ Created PPO Agent")

elif agent_type == 'dqn':
    agent = DQNAgent(
        grid_shape=GRID_SHAPE,
        state_dim=STATE_DIM,
        action_dim=ACTION_DIM,
        learning_rate=1e-3,
        batch_size=128,
        buffer_size=100000
    )
    print("✓ Created DQN Agent")

elif agent_type == 'mappo':
    agent = MultiAgentPPO(
        n_agents=TRAINING_CONFIG['n_agents'],
        grid_shape=GRID_SHAPE,
        state_dim=STATE_DIM,
        action_dim=ACTION_DIM,
        use_communication=True
    )
    print("✓ Created Multi-Agent PPO")

else:
    # Fail here with a clear message instead of a NameError on `agent` later.
    raise ValueError(
        f"Unknown agent_type {TRAINING_CONFIG['agent_type']!r}; "
        "expected 'ppo', 'dqn' or 'mappo'"
    )

# Add curiosity module if enabled
curiosity = None
if TRAINING_CONFIG['use_curiosity']:
    # Normalised the same way as agent_type so 'ICM' and 'icm' both work.
    curiosity_type = TRAINING_CONFIG['curiosity_type'].lower()
    if curiosity_type == 'icm':
        curiosity = ICM(
            grid_shape=GRID_SHAPE,
            state_dim=STATE_DIM,
            action_dim=ACTION_DIM
        )
        print("✓ Added ICM Curiosity Module")
    elif curiosity_type == 'rnd':
        curiosity = RND(
            grid_shape=GRID_SHAPE,
            state_dim=STATE_DIM
        )
        print("✓ Added RND Curiosity Module")
    else:
        # Curiosity was requested, so an unrecognised type must not silently
        # fall back to training without it.
        raise ValueError(
            f"Unknown curiosity_type {TRAINING_CONFIG['curiosity_type']!r}; "
            "expected 'icm' or 'rnd'"
        )

# Add curriculum learning if enabled
curriculum = None
if TRAINING_CONFIG['use_curriculum']:
    curriculum = CurriculumScheduler(
        initial_difficulty=0.3,
        target_difficulty=1.0,
        adaptation_rate=0.1
    )
    print("✓ Added Curriculum Learning")

print("\n" + "="*80)
print("STARTING TRAINING")
print("="*80)

In [None]:
# Training loop with live visualization.
#
# Per episode: roll out the environment, optionally add a curiosity bonus to
# the rewards, store transitions in the agent's buffer, run one agent update,
# record metrics, refresh the plots every 10 episodes and write a checkpoint
# every `save_interval` episodes.
from IPython.display import display

episode_rewards = []
episode_lengths = []
success_rates = []
curiosity_bonuses = []

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# The inline backend closes pyplot-managed figures once they have been shown,
# so repeated plt.show() calls stop drawing this figure after the first
# refresh. Detach it from pyplot up front and render it explicitly with
# display(fig) on every refresh instead.
plt.close(fig)

# Explicit None checks: truthiness of the project objects is not defined here.
has_curiosity = curiosity is not None
has_curriculum = curriculum is not None

for episode in range(TRAINING_CONFIG['n_episodes']):
    observations = env.reset()
    done = False
    step = 0
    episode_reward = 0
    episode_curiosity = 0

    while not done and step < TRAINING_CONFIG['max_steps']:
        # Select actions
        if agent_type == 'mappo':
            actions, log_probs, values = agent.select_actions(observations, training=True)
        else:
            actions = []
            log_probs = []
            values = []

            for obs in observations:
                if agent_type == 'ppo':
                    action, log_prob, value = agent.select_action(obs, training=True)
                else:  # DQN
                    action = agent.select_action(obs, training=True)
                    # DQN has no policy log-prob / value estimate; placeholders
                    # keep the three lists aligned.
                    log_prob = 0
                    value = 0

                actions.append(action)
                log_probs.append(log_prob)
                values.append(value)

        # Step environment
        next_observations, rewards, dones, truncated, info = env.step(actions)

        # Add curiosity bonus (mutates `rewards` in place, so everything below
        # sees extrinsic + intrinsic reward)
        if has_curiosity:
            for i, (obs, action, next_obs) in enumerate(zip(observations, actions, next_observations)):
                bonus = curiosity.compute_bonus(obs, action, next_obs)
                rewards[i] += bonus
                episode_curiosity += bonus

        # Store transitions
        if agent_type == 'mappo':
            agent.store_transition(observations, actions, rewards, next_observations, dones, log_probs, values)
        elif agent_type == 'ppo':
            for i, (obs, action, reward, next_obs, log_prob, value) in enumerate(
                zip(observations, actions, rewards, next_observations, log_probs, values)
            ):
                agent.rollout_buffer.append({
                    'observation': obs,
                    'action': action,
                    'reward': reward,
                    'next_observation': next_obs,
                    'done': dones[i],
                    'log_prob': log_prob,
                    'value': value
                })
        else:  # DQN
            for i, (obs, action, reward, next_obs) in enumerate(
                zip(observations, actions, rewards, next_observations)
            ):
                agent.memory.push(obs, action, reward, next_obs, dones[i])

        episode_reward += sum(rewards)
        observations = next_observations
        done = truncated or all(dones)
        step += 1

    # Update agent (only once enough experience has been collected)
    if agent_type == 'ppo':
        if len(agent.rollout_buffer) > agent.batch_size:
            losses = agent.update()
    elif agent_type == 'dqn':
        if len(agent.memory) > agent.batch_size:
            loss = agent.update()
    elif agent_type == 'mappo':
        if agent.is_ready_to_update():
            losses = agent.update()

    # Update curiosity
    if has_curiosity and hasattr(curiosity, 'update'):
        curiosity.update()

    # Update curriculum
    if has_curriculum:
        curriculum.update(episode_reward, step)

    # Track metrics. Values are stored as plain Python floats so the history
    # lists plot cleanly and serialise without numpy/tensor scalars.
    # NOTE(review): episode_reward includes the curiosity bonus, so the
    # "success" flag below is not a purely extrinsic measure — confirm intent.
    episode_rewards.append(float(episode_reward))
    episode_lengths.append(step)
    success = episode_reward > 0
    success_rates.append(1 if success else 0)
    if has_curiosity:
        curiosity_bonuses.append(float(episode_curiosity))

    # Live plotting every 10 episodes
    if (episode + 1) % 10 == 0:
        clear_output(wait=True)

        # Plot rewards (raw series plus 50-episode moving average)
        axes[0, 0].clear()
        axes[0, 0].plot(episode_rewards, alpha=0.3, color='blue')
        if len(episode_rewards) >= 50:
            moving_avg = np.convolve(episode_rewards, np.ones(50)/50, mode='valid')
            axes[0, 0].plot(range(49, len(episode_rewards)), moving_avg, color='red', linewidth=2)
        axes[0, 0].set_title('Episode Rewards')
        axes[0, 0].set_xlabel('Episode')
        axes[0, 0].set_ylabel('Total Reward')
        axes[0, 0].grid(True, alpha=0.3)

        # Plot episode lengths
        axes[0, 1].clear()
        axes[0, 1].plot(episode_lengths, alpha=0.6, color='green')
        axes[0, 1].set_title('Episode Lengths')
        axes[0, 1].set_xlabel('Episode')
        axes[0, 1].set_ylabel('Steps')
        axes[0, 1].grid(True, alpha=0.3)

        # Plot success rate
        axes[1, 0].clear()
        if len(success_rates) >= 100:
            success_avg = np.convolve(success_rates, np.ones(100)/100, mode='valid')
            axes[1, 0].plot(range(99, len(success_rates)), success_avg, color='purple', linewidth=2)
        axes[1, 0].set_title('Success Rate (100-ep moving avg)')
        axes[1, 0].set_xlabel('Episode')
        axes[1, 0].set_ylabel('Success Rate')
        axes[1, 0].set_ylim([0, 1])
        axes[1, 0].grid(True, alpha=0.3)

        # Plot curiosity bonuses, or the recent reward histogram when
        # curiosity is disabled
        axes[1, 1].clear()
        if has_curiosity and len(curiosity_bonuses) > 0:
            axes[1, 1].plot(curiosity_bonuses, alpha=0.5, color='orange')
            axes[1, 1].set_title('Curiosity Bonuses')
        else:
            recent_rewards = episode_rewards[-100:] if len(episode_rewards) >= 100 else episode_rewards
            axes[1, 1].hist(recent_rewards, bins=20, alpha=0.7, color='blue')
            axes[1, 1].set_title('Recent Reward Distribution')
        axes[1, 1].set_xlabel('Episode' if has_curiosity else 'Reward')
        axes[1, 1].set_ylabel('Bonus' if has_curiosity else 'Frequency')
        axes[1, 1].grid(True, alpha=0.3)

        # Operate on `fig` directly: it is no longer pyplot's current figure.
        fig.tight_layout()
        display(fig)

        print(f"Episode {episode + 1}/{TRAINING_CONFIG['n_episodes']}")
        print(f"  Reward: {episode_reward:.2f}")
        print(f"  Steps: {step}")
        print(f"  Avg Reward (last 100): {np.mean(episode_rewards[-100:]):.2f}")
        print(f"  Success Rate (last 100): {np.mean(success_rates[-100:]):.1%}")
        if has_curriculum:
            print(f"  Curriculum Difficulty: {curriculum.current_difficulty:.2f}")

    # Save checkpoint
    if (episode + 1) % TRAINING_CONFIG['save_interval'] == 0:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        save_path = f"{MODEL_DIR}/{agent_type}_ep{episode+1}_{timestamp}.pt"

        checkpoint = {
            'episode': episode + 1,
            'config': TRAINING_CONFIG,
            'episode_rewards': episode_rewards,
            'episode_lengths': episode_lengths,
            'success_rates': success_rates
        }

        # Each agent type exposes its networks under different attributes.
        if agent_type == 'ppo':
            checkpoint['network_state_dict'] = agent.network.state_dict()
            checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()
        elif agent_type == 'dqn':
            checkpoint['q_network_state_dict'] = agent.q_network.state_dict()
            checkpoint['target_network_state_dict'] = agent.target_network.state_dict()
            checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()
        elif agent_type == 'mappo':
            checkpoint['actor_critic_state_dict'] = agent.actor_critic.state_dict()
            checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()

        torch.save(checkpoint, save_path)
        print(f"\n✓ Checkpoint saved: {save_path}\n")

print("\n" + "="*80)
print("✅ TRAINING COMPLETE!")
print("="*80)
print(f"Final Average Reward (last 100): {np.mean(episode_rewards[-100:]):.2f}")
print(f"Final Success Rate (last 100): {np.mean(success_rates[-100:]):.1%}")
print(f"Best Episode Reward: {max(episode_rewards):.2f}")
print(f"\nModels saved to: {MODEL_DIR}")

## 4️⃣ Save Final Model

In [None]:
# Save final trained model
#
# Writes one checkpoint holding the run configuration, the metric history,
# summary statistics and the network/optimizer state for the active agent type.
import os

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
final_path = f"{MODEL_DIR}/{agent_type}_final_{timestamp}.pt"

# Metrics are converted to plain Python floats: numpy scalars (np.mean returns
# np.float64) are rejected by torch.load when weights_only=True, which is the
# default in recent PyTorch releases.
checkpoint = {
    'episode': TRAINING_CONFIG['n_episodes'],
    'config': TRAINING_CONFIG,
    'episode_rewards': [float(r) for r in episode_rewards],
    'episode_lengths': episode_lengths,
    'success_rates': success_rates,
    'final_stats': {
        'avg_reward': float(np.mean(episode_rewards[-100:])),
        'success_rate': float(np.mean(success_rates[-100:])),
        'best_reward': float(max(episode_rewards))
    }
}

# Each agent type exposes its networks under different attributes.
if agent_type == 'ppo':
    checkpoint['network_state_dict'] = agent.network.state_dict()
    checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()
elif agent_type == 'dqn':
    checkpoint['q_network_state_dict'] = agent.q_network.state_dict()
    checkpoint['target_network_state_dict'] = agent.target_network.state_dict()
    checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()
elif agent_type == 'mappo':
    checkpoint['actor_critic_state_dict'] = agent.actor_critic.state_dict()
    checkpoint['optimizer_state_dict'] = agent.optimizer.state_dict()

torch.save(checkpoint, final_path)

print(f"✅ Final model saved to: {final_path}")
print(f"\nTo use this model locally:")
print(f"1. Download from Google Drive: {MODEL_DIR}")
print(f"2. Place in your local models/ directory")
# Derive the file name from final_path so the hint cannot drift from the file
# that was actually written.
print(f"3. Run: python visualize.py --model models/{os.path.basename(final_path)}")

## 5️⃣ Download Models

In [None]:
# Print a numbered inventory of the `.pt` files in MODEL_DIR with their sizes
import os

BYTES_PER_MB = 1024 * 1024

# Alphabetical listing of checkpoint file names
models = sorted(entry for entry in os.listdir(MODEL_DIR) if entry.endswith('.pt'))

print(f"Saved Models ({len(models)}):")
print("=" * 80)
for position, filename in enumerate(models, start=1):
    size_mb = os.path.getsize(os.path.join(MODEL_DIR, filename)) / BYTES_PER_MB
    print(f"{position}. {filename} ({size_mb:.2f} MB)")

print(f"\nüíæ Access models at: {MODEL_DIR}")
print("üì• Download from Google Drive to use locally")

In [None]:
# Optional: bundle the whole MODEL_DIR folder into one timestamped zip under
# /content and hand it to the browser for download.
import shutil
from datetime import datetime

zip_name = "autonomous_colony_models_" + datetime.now().strftime('%Y%m%d_%H%M%S')
zip_path = "/content/" + zip_name  # make_archive appends the ".zip" suffix
archive_file = zip_path + ".zip"

shutil.make_archive(zip_path, 'zip', MODEL_DIR)

print(f"‚úÖ All models zipped to: {archive_file}")
print(f"üì• Download this file to get all trained models")

# Imported here, after the archive exists, matching the original cell order
from google.colab import files
files.download(archive_file)

## 6️⃣ Quick Visualization (Optional)

In [None]:
# Quick test of trained agent
#
# Runs a handful of evaluation episodes in a fresh environment with
# training=False and reports the total reward of each, then mean and std.
print("Testing trained agent...\n")

N_TEST_EPISODES = 5    # number of evaluation rollouts
TEST_MAX_STEPS = 200   # per-episode step cap during evaluation

test_env = ColonyEnvironment(n_agents=TRAINING_CONFIG['n_agents'], grid_size=TRAINING_CONFIG['grid_size'])
test_rewards = []

# Evaluation needs no gradients; no_grad avoids building autograd graphs for
# every forward pass.
with torch.no_grad():
    for ep in range(N_TEST_EPISODES):
        observations = test_env.reset()
        done = False
        step = 0
        episode_reward = 0

        while not done and step < TEST_MAX_STEPS:
            if agent_type == 'mappo':
                actions, _, _ = agent.select_actions(observations, training=False)
            else:
                actions = []
                for obs in observations:
                    if agent_type == 'ppo':
                        # PPO also returns log-prob and value; unused here
                        action, _, _ = agent.select_action(obs, training=False)
                    else:
                        action = agent.select_action(obs, training=False)
                    actions.append(action)

            next_observations, rewards, dones, truncated, _ = test_env.step(actions)
            episode_reward += sum(rewards)
            observations = next_observations
            done = truncated or all(dones)
            step += 1

        # Plain float keeps the numpy statistics below independent of the
        # reward's original scalar type.
        test_rewards.append(float(episode_reward))
        print(f"Test Episode {ep + 1}: Reward = {episode_reward:.2f}, Steps = {step}")

print(f"\nAverage Test Reward: {np.mean(test_rewards):.2f} ± {np.std(test_rewards):.2f}")