In [1]:
import os
import sys
import numpy as np
import torch
import wandb
from pathlib import Path
import time
from typing import Dict, List, Tuple, Optional
import matplotlib as plt

from mlagents_envs.environment import UnityEnvironment as UE
from mlagents_envs.envs.unity_parallel_env import UnityParallelEnv as UPZBE

from MPC.MPC_Agent import MultiAgentMPC, MPCConfig

# Utility functions

In [2]:
def get_agent_obs(obs, agent, cam_key=1, vec_keys=[0, 2]):
    """
    Extract observation data for an agent from the environment.
    """
    if agent not in obs:
        raise KeyError(f"Agent {agent!r} not found in observations")
    
    data = obs[agent]
    if isinstance(data, dict) and "observation" in data:
        data = data["observation"]
    
    # Extract camera and vector observations
    if isinstance(data, dict) and ("camera_obs" in data and "vector_obs" in data):
        cam = np.asarray(data["camera_obs"])
        vec = np.asarray(data["vector_obs"])
        if vec.ndim > 1:
            vec = vec.reshape(-1)
    else:
        cam = np.asarray(data[cam_key])
        v0 = np.asarray(data[vec_keys[0]]).reshape(-1)
        v1 = np.asarray(data[vec_keys[1]]).reshape(-1)
        vec = np.concatenate([v0, v1], axis=0)
    
    # Process camera
    if cam.ndim != 3:
        raise AssertionError(f"Camera must be 3D, got shape {cam.shape}")
    
    if cam.shape[-1] in (1, 3, 4):
        cam = np.transpose(cam, (2, 0, 1))
    
    cam = cam.astype(np.float32, copy=False)
    if cam.max() > 1.5:
        cam = cam / 255.0
    
    vec = vec.astype(np.float32, copy=False)
    
    return cam, vec


def relocate_agents(env):
    """Get sorted list of agent IDs"""
    return sorted(list(env.agents))


def extract_state_from_obs(vector_obs: np.ndarray, state_dim: int = 12) -> np.ndarray:
    """
    Extract state representation for MPC from vector observations.
    
    Assuming vector_obs contains:
    - Position (3D)
    - Velocity (3D)
    - Orientation (3D)
    - Angular velocity (3D)
    - Additional features...
    """
    # Take first state_dim elements
    state = vector_obs[:state_dim]
    
    # Ensure proper dimensionality
    if len(state) < state_dim:
        state = np.pad(state, (0, state_dim - len(state)))
    
    return state

# Train

In [3]:
def train_dynamics_model(mpc_agent, env, config, args):
    """
    Train MPC dynamics model through environment interaction.
    
    Phase 1: Random exploration to collect initial data
    Phase 2: MPC-guided exploration with dynamics learning
    Phase 3: Evaluation and comparison
    """
    print("="*60)
    print("MPC Dynamics Training")
    print("="*60)

    episode_rewards = []
    episode_lengths = []
    solve_times = []
    dynamics_losses = []

    current_episode_reward = np.zeros(mpc_agent.num_agents)
    current_episode_length = 0

    obs = env.reset()
    agents = relocate_agents(env)
    num_agents = len(agents)

    cam_shape = env.observation_space(agents[0])[1].shape
    vec_dim = (env.observation_space(agents[0])[0].shape[0] + 
            env.observation_space(agents[0])[2].shape[0])
    vec_shape = (vec_dim,)
    action_shape = env.action_space(agents[0]).shape

    blank_cam = np.zeros(cam_shape, dtype=np.float32)
    blank_vec = np.zeros(vec_shape, dtype=np.float32)

    phases = [
        ("Random Exploration", 0, args.exploration_steps),
        ("MPC Training", args.exploration_steps, args.exploration_steps + args.training_steps),
        ("MPC Refinement", args.exploration_steps + args.training_steps, args.max_steps)
    ]

    total_steps = 0

    for phase_name, start_step, end_step in phases:
        print(f"\n{'='*40}")
        print(f"Phase: {phase_name}")
        print(f"Steps: {start_step:,} - {end_step:,}")
        print(f"{'='*40}\n")

        while total_steps < end_step:
            if not obs or len(obs) == 0:
                obs = env.reset()
                agents = relocate_agents(env)
                current_episode_reward = np.zeros(num_agents)
                current_episode_length = 0

            camera_obs = np.zeros((num_agents, *cam_shape), dtype=np.float32)
            vector_obs = np.zeros((num_agents, *vec_shape), dtype=np.float32)
            states = np.zeros((num_agents, config.state_dim), dtype=np.float32)

            for i, agent_id in enumerate(agents):
                if agent_id in obs:
                    cam, vec = get_agent_obs(obs, agent_id)
                else:
                    cam, vec = blank_cam, blank_vec
                camera_obs[i] = cam
                vector_obs[i] = vec
                states[i] = extract_state_from_obs(vec, config.state_dim)

            if phase_name == "Random Exploration":
                actions = np.random.uniform(-1, 1, (num_agents, config.action_dim))
            else:
                goals = generate_goals(vector_obs)
                # Note: We pass full vector_obs, agent extracts kinematics internally
                actions, info = mpc_agent.get_actions(vector_obs, goals)

            action_dict = {agent_id: action for agent_id, action in zip(agents, actions)}
            next_obs, reward_dict, done_dict, info_dict = env.step(action_dict)

            next_states = np.zeros((num_agents, config.state_dim), dtype=np.float32)

            for i, agent_id in enumerate(agents):
                if agent_id in next_obs:
                    _, next_vec = get_agent_obs(next_obs, agent_id)
                    next_states[i] = extract_state_from_obs(next_vec, config.state_dim)
            
            transitions = []
            transition_dict = {} 
            for i in range(num_agents):
                transition_dict[i] = {
                    'state': vector_obs[i],      # Pass full obs
                    'action': actions[i],
                    'next_state': next_vec[i] # Need to get next_vec from step
                }
            transitions.append(transition_dict)

            if total_steps % args.dynamics_update_freq == 0 and total_steps > 0:
                update_stats = mpc_agent.update_dynamics_models(transitions)
                if update_stats:
                    avg_loss = np.mean([s['dynamics_loss'] for s in update_stats])
                    dynamics_losses.append(avg_loss)
            else:
                # Just store transitions
                mpc_agent.update_dynamics_models(transitions)

            rewards = np.array([reward_dict.get(a, 0.0) for a in agents])
            dones = np.array([done_dict.get(a, False) for a in agents])
            
            current_episode_reward += rewards
            current_episode_length += 1
            total_steps += 1

            if any(dones) or all(done_dict.values()):
                mean_reward = current_episode_reward.mean()
                episode_rewards.append(mean_reward)
                episode_lengths.append(current_episode_length)

                obs = env.reset()
                agents = relocate_agents(env)
                current_episode_reward = np.zeros(num_agents)
                current_episode_length = 0
            else:
                obs = next_obs

            if total_steps % args.log_every == 0:
                if len(episode_rewards) > 0:
                    mean_reward = np.mean(episode_rewards[-100:])
                    mean_length = np.mean(episode_lengths[-100:])
                else:
                    mean_reward = 0.0
                    mean_length = 0.0

            print(f"\nStep {total_steps:,} | Phase: {phase_name}")
            print(f"  Reward (100ep): {mean_reward:.2f}")
            print(f"  Episode length: {mean_length:.0f}")
            
            if len(solve_times) > 0:
                print(f"  Avg solve time: {np.mean(solve_times[-100:])*1000:.1f} ms")
            
            if len(dynamics_losses) > 0:
                print(f"  Dynamics loss:  {dynamics_losses[-1]:.4f}")

            if args.use_wandb:
                log_dict = {
                    'train/reward_mean': mean_reward,
                        'train/episode_length': mean_length,
                        'train/phase': phases.index((phase_name, start_step, end_step)),
                        'train/total_steps': total_steps,
                }
                
                if len(solve_times) > 0:
                    log_dict['mpc/solve_time_ms'] = np.mean(solve_times[-100:]) * 1000
                
                if len(dynamics_losses) > 0:
                    log_dict['mpc/dynamics_loss'] = dynamics_losses[-1]
                
                wandb.log(log_dict, step=total_steps)

    return episode_rewards, solve_times, dynamics_losses

def generate_goals(vector_obs):
    """
    Simple goal logic: Move 10m forward relative to current position.
    """
    positions = vector_obs[:, :3]
    goals = positions.copy()
    goals[:, 0] += 10.0 # Move 10 units in X
    return goals

def evaluate_mpc(mpc_agent, env, num_episodes=100):
    """
    Evaluate trained MPC agent.
    """
    print("\n" + "="*60)
    print("MPC Evaluation")
    print("="*60)
    
    episode_rewards = []
    episode_lengths = []
    solve_times_all = []
    successes = 0
    
    for episode in range(num_episodes):
        obs = env.reset()
        agents = relocate_agents(env)
        num_agents = len(agents)
        
        # Get shapes
        cam_shape = env.observation_space(agents[0])[1].shape
        vec_shape = (env.observation_space(agents[0])[0].shape[0] + 
                     env.observation_space(agents[0])[2].shape[0],)
        
        episode_reward = 0
        episode_length = 0
        done = False
        
        while not done and episode_length < 500:
            # Collect observations
            camera_obs = []
            vector_obs = []
            
            for agent_id in agents:
                if agent_id in obs:
                    cam, vec = get_agent_obs(obs, agent_id)
                else:
                    cam = np.zeros(cam_shape, dtype=np.float32)
                    vec = np.zeros(vec_shape, dtype=np.float32)
                camera_obs.append(cam)
                vector_obs.append(vec)
            
            camera_obs = np.array(camera_obs)
            vector_obs = np.array(vector_obs)
            
            # Get MPC actions
            states = np.array([extract_state_from_obs(v, mpc_agent.config.state_dim) 
                              for v in vector_obs])
            goals = generate_goals(states, mpc_agent.config)
            
            actions, info = mpc_agent.get_action(
                camera_obs,
                vector_obs,
                goals=goals
            )
            
            solve_times_all.append(info['solve_time'])
            
            # Step environment
            action_dict = {agent_id: action for agent_id, action in zip(agents, actions)}
            next_obs, reward_dict, done_dict, _ = env.step(action_dict)
            
            # Accumulate rewards
            rewards = np.array([reward_dict.get(a, 0.0) for a in agents])
            episode_reward += rewards.sum()
            episode_length += 1
            
            # Check done
            done = all(done_dict.values())
            obs = next_obs
        
        episode_rewards.append(episode_reward)
        episode_lengths.append(episode_length)
        
        if episode_reward > 40:  # Success threshold
            successes += 1
        
        if (episode + 1) % 10 == 0:
            print(f"Episode {episode+1}/{num_episodes}: "
                  f"Reward={episode_reward:.2f}, "
                  f"Length={episode_length}, "
                  f"Solve={np.mean(solve_times_all[-episode_length:])*1000:.1f}ms")
            
    # Print results
    print("\n" + "="*60)
    print("Evaluation Results")
    print("="*60)
    print(f"Mean Reward:     {np.mean(episode_rewards):.2f} ± {np.std(episode_rewards):.2f}")
    print(f"Success Rate:    {successes/num_episodes:.1%}")
    print(f"Mean Length:     {np.mean(episode_lengths):.1f}")
    print(f"Avg Solve Time:  {np.mean(solve_times_all)*1000:.1f} ms")
    print("="*60)
    
    return {
        'rewards': episode_rewards,
        'success_rate': successes / num_episodes,
        'solve_times': solve_times_all,
    }

In [4]:
class Args:
    def __init__(self,d):
        self.__dict__=d

params = {
    'env_path': 'Env/FinalLevel/DroneFlightv1',
    'exploration_steps': 5000,
    'training_steps': 20000,
    'max_steps': 50000,
    'log_every': 1000,
    'dynamics_update_freq': 100,
    'use_wandb': True
}

args = Args(params)

In [5]:
print("="*60)
print("MPC Training for Multi-Agent Drone Navigation")
print("="*60)

print("\nLoading Unity environment...")
env = UE(file_name=args.env_path)
env = UPZBE(env)

# Get environment info
obs = env.reset()
agents = relocate_agents(env)
num_agents = len(agents)

cam_shape = env.observation_space(agents[0])[1].shape
vec_dim = (env.observation_space(agents[0])[0].shape[0] + 
            env.observation_space(agents[0])[2].shape[0])
vec_shape = (vec_dim,)
action_dim = env.action_space(agents[0]).shape[0]

print(f"✓ Environment loaded")
print(f"  Agents: {num_agents}")
print(f"  Camera shape: {cam_shape}")
print(f"  Vector dim: {vec_dim}")
print(f"  Action dim: {action_dim}")

# Create MPC configuration
config = MPCConfig(
    horizon=20,
    dynamics_lr=1e-3,
    action_dim=action_dim,
)


# Initialize MPC
print("\nInitializing MPC...")
mpc_agent = MultiAgentMPC(
    num_agents=num_agents,
    config=config
)

print(f"✓ MPC initialized")
print(f"  Horizon: {config.horizon}")
print(f"  Dynamics LR: {config.dynamics_lr}")

# Train dynamics model
print("\nStarting training...")
episode_rewards, solve_times, dynamics_losses = train_dynamics_model(
    mpc_agent, env, config, args
)

# Save models
save_dir = Path(args.save_dir)
save_dir.mkdir(parents=True, exist_ok=True)

save_path = save_dir / "mpc_final.pth"
mpc_agent.save(save_path)
print(f"\n✓ MPC models saved to {save_path}")

# Evaluate
print("\nEvaluating MPC...")
eval_results = evaluate_mpc(mpc_agent, env, args.eval_episodes)

# Close environment
env.close()

if args.use_wandb:
    wandb.finish()

print("\n" + "="*60)
print("Training Complete!")
print("="*60)

MPC Training for Multi-Agent Drone Navigation

Loading Unity environment...
✓ Environment loaded
  Agents: 4
  Camera shape: (4, 84, 84)
  Vector dim: 92
  Action dim: 4

Initializing MPC...
✓ MPC initialized
  Horizon: 20
  Dynamics LR: 0.001

Starting training...
MPC Dynamics Training

Phase: Random Exploration
Steps: 0 - 5,000



AttributeError: 'MPCConfig' object has no attribute 'state_dim'