# MAPPO Training for Multi-Agent Drone Navigation

This notebook trains a MAPPO agent on your Unity ML-Agents drone environment.

## 1. Imports and Setup

In [1]:
import os
import sys
import random
import datetime as dt
import numpy as np
import torch
import wandb
from pathlib import Path

# Unity ML-Agents
from mlagents_envs.environment import UnityEnvironment as UE
from mlagents_envs.envs.unity_parallel_env import UnityParallelEnv as UPZBE

# MAPPO components
from MAPPO.mappo_agent import MAPPOAgent
from MAPPO.rollout_buffer import RolloutBuffer
# --- Reproducibility and device selection ---
# One shared seed is pushed into every RNG this notebook touches: Python's
# `random`, NumPy's legacy global generator, and PyTorch (CPU + all CUDA devices).
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(SEED)

# cuDNN autotuning is enabled and deterministic kernels are NOT forced. This
# favours throughput; per the PyTorch reproducibility notes, GPU results may
# therefore differ between runs even though the seeds above are fixed.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("high")

# Prefer the GPU when one is visible; everything downstream uses `device`.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
print(f"Using device: {device}")
if cuda_available:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")

Using device: cuda
GPU: NVIDIA GeForce RTX 4080 SUPER
GPU Memory: 17.17 GB


In [2]:
# Quiet the Unity ML-Agents Python package: raise its logger threshold to
# ERROR and ignore Python warnings that originate from its modules.
import logging
import warnings

UNITY_PACKAGE = 'mlagents_envs'
logging.getLogger(UNITY_PACKAGE).setLevel(logging.ERROR)
warnings.filterwarnings('ignore', module=UNITY_PACKAGE)
print("✓ Unity warnings suppressed")



## 2. Configuration

In [3]:
# MAPPO hyperparameters. This dict is handed to MAPPOAgent and, when W&B is
# enabled, recorded as the run configuration.
config = {
    # Learning
    'learning_rate': 3e-4,           # Slightly higher for faster adaptation
    'clip_param': 0.2,
    'value_loss_coef': 0.5,
    'entropy_coef': 0.1,             # Raised to encourage more exploration
    'curiosity_coef': 0.1,           # Weight applied to the intrinsic (curiosity) reward
    'max_grad_norm': 5.0,

    # GAE
    'gamma': 0.99,
    'gae_lambda': 0.95,

    # Training
    'rollout_length': 2048,          # Environment steps collected before each update
    'num_minibatches': 8,
    'ppo_epochs': 3,
    'max_steps': 3_000_000,          # Training stops once this many steps were collected
    'reward_clip': 10.0,             # Extrinsic rewards are clipped to [-reward_clip, +reward_clip]
}

# Training settings
SAVE_DIR = Path("./saved_models_mappo_no_curriculum")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

LOG_EVERY = 1          # Log every N updates
SAVE_EVERY = 10        # Save checkpoint every N updates

USE_WANDB = True           # Enable Weights & Biases logging
NORMALIZE_REWARDS = False  # Set to True to normalize rewards with a running mean/std

# Echo the key settings so they are captured in the cell output.
print("Configuration:")
print(f"  Max steps: {config['max_steps']:,}")
print(f"  Rollout length: {config['rollout_length']}")
print(f"  Learning rate: {config['learning_rate']}")
# Label fixed: it previously read "calcuEntropy coefficient" (stray paste).
print(f"  Entropy coefficient: {config['entropy_coef']}")
print(f"  Curiosity coefficient: {config['curiosity_coef']}")
print(f"  Save directory: {SAVE_DIR}")

Configuration:
  Max steps: 3,000,000
  Rollout length: 2048
  Learning rate: 0.0003
  calcuEntropy coefficient: 0.1
  Curiosity coefficient: 0.1
  Save directory: saved_models_mappo_no_curriculum


## 3. Utility Functions

In [4]:
def get_agent_obs(obs, agent, cam_key=1, vec_keys=(0, 2)):
    """
    Extract one agent's camera and vector observation from an observation dict.

    Two layouts are understood for ``obs[agent]`` (optionally wrapped in a dict
    under the key ``"observation"``):

    * a dict with ``"camera_obs"`` and ``"vector_obs"`` entries, or
    * an indexable container where ``cam_key`` selects the camera image and
      every key in ``vec_keys`` selects a vector that is flattened and
      concatenated in the given order.

    Args:
        obs: Mapping from agent ID to that agent's observation.
        agent: ID of the agent to extract.
        cam_key: Index/key of the camera image for the indexed layout.
        vec_keys: Indices/keys of the vector parts for the indexed layout.
            Any number of keys (one or more) is accepted.

    Returns:
        (cam, vec): ``cam`` is a 3D float32 array in CHW order scaled to
        [0, 1]; ``vec`` is a 1D float32 array.

    Raises:
        KeyError: If ``agent`` is not present in ``obs``.
        AssertionError: If the camera observation is not 3-dimensional.
    """
    if agent not in obs:
        raise KeyError(f"Agent {agent!r} not found in observations")

    data = obs[agent]
    if isinstance(data, dict) and "observation" in data:
        data = data["observation"]

    # Extract camera and vector observations
    if isinstance(data, dict) and "camera_obs" in data and "vector_obs" in data:
        cam = np.asarray(data["camera_obs"])
        # Always flatten so the documented "1D" contract holds for any rank.
        vec = np.asarray(data["vector_obs"]).reshape(-1)
    else:
        # Indexed access: concatenate however many vector parts were requested.
        cam = np.asarray(data[cam_key])
        vec = np.concatenate(
            [np.asarray(data[key]).reshape(-1) for key in vec_keys], axis=0
        )

    if cam.ndim != 3:
        raise AssertionError(f"Camera must be 3D, got {cam.shape}")

    # Heuristic: a trailing axis of size 1, 3 or 4 is treated as the channel
    # axis of an HWC image and moved to the front.
    if cam.shape[-1] in (1, 3, 4):
        cam = np.transpose(cam, (2, 0, 1))  # Convert to CHW

    # Integer images are byte-valued, so they are always rescaled; the old
    # max-only check left very dark uint8 frames (max <= 1) unscaled.
    # Float images keep the original heuristic (max > 1.5 => 0..255 range).
    is_integer_image = np.issubdtype(cam.dtype, np.integer)
    cam = cam.astype(np.float32, copy=False)
    # `cam.size` guard: max() of an empty array raises ValueError.
    if cam.size and (is_integer_image or cam.max() > 1.5):
        cam = cam / 255.0

    vec = vec.astype(np.float32, copy=False)

    return cam, vec


def relocate_agents(env):
    """Return the IDs in ``env.agents`` as a new list in ascending order.

    Despite the name, nothing is moved: the environment is only read.
    """
    agent_ids = list(env.agents)
    agent_ids.sort()
    return agent_ids


class RunningMeanStd:
    """Track a running mean and variance, e.g. for reward normalization.

    The statistics start from a tiny pseudo-count (mean 0, variance 1,
    count 1e-4) and are merged with each new batch using the parallel
    mean/variance combination formula, so feeding data in several batches
    gives the same result as feeding it all at once.
    """
    def __init__(self):
        self.mean = 0.0
        self.var = 1.0
        self.count = 1e-4  # small prior weight; avoids division by zero on the first update

    def update(self, x):
        """Fold a batch of values into the running statistics.

        Args:
            x: A scalar or array-like of any shape. Every element counts as
               one sample. An empty batch is ignored.
        """
        batch = np.asarray(x, dtype=np.float64)
        # Count elements, not len(): mean/var are taken over every element, so
        # len() under-counted multi-dimensional batches and failed on 0-d arrays.
        batch_count = batch.size
        if batch_count == 0:
            # mean/var of an empty batch are NaN and would poison the state.
            return

        batch_mean = batch.mean()
        batch_var = batch.var()

        delta = batch_mean - self.mean
        tot_count = self.count + batch_count

        self.mean += delta * batch_count / tot_count
        # Combine the two sums of squared deviations plus the between-group term.
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta**2 * self.count * batch_count / tot_count
        self.var = m2 / tot_count
        self.count = tot_count

    @property
    def std(self):
        """Standard deviation derived from the running variance."""
        return np.sqrt(self.var)


# Visible confirmation that the helper definitions in this cell were executed.
print("✓ Utility functions loaded")

✓ Utility functions loaded


## 4. Initialize Unity Environment

In [5]:
# --- Launch the Unity build and wrap it in the parallel multi-agent API ---
# NO_GRAPHICS is forwarded to UnityEnvironment(no_graphics=...): False keeps
# the Unity window/rendering on.
# NOTE(review): the agents consume camera observations; confirm those still
# render before switching this to True.
NO_GRAPHICS = False
ENV_PATH = './Env/FinalLevel/DroneFlightv1'

print("Loading Unity environment...")
env = UPZBE(UE(file_name=ENV_PATH, seed=SEED, no_graphics=NO_GRAPHICS))

# First reset; the sorted agent list fixes the row order used for all
# per-agent arrays later in the notebook.
obs = env.reset()
agents = relocate_agents(env)
num_agents = len(agents)

# Every agent is assumed to share the first agent's spaces. The observation
# space is indexed positionally: entry 1 is the camera image, entries 0 and 2
# are the vector parts (same indices as get_agent_obs' defaults).
first_obs_space = env.observation_space(agents[0])
cam_shape = first_obs_space[1].shape
vec_dim = first_obs_space[0].shape[0] + first_obs_space[2].shape[0]
vec_shape = (vec_dim,)
action_shape = env.action_space(agents[0]).shape

print("\n✓ Environment initialized")
print(f"  Number of agents: {num_agents}")
print(f"  Camera shape: {cam_shape}")
print(f"  Vector dim: {vec_dim}")
print(f"  Action dim: {action_shape[0]}")

# All-zero placeholders substituted for agents that are missing from an
# observation dict.
blank_cam = np.zeros(cam_shape, dtype=np.float32)
blank_vec = np.zeros(vec_shape, dtype=np.float32)

Loading Unity environment...

✓ Environment initialized
  Number of agents: 4
  Camera shape: (4, 84, 84)
  Vector dim: 92
  Action dim: 4


## 5. Initialize MAPPO Agent

In [6]:
# Build the MAPPO agent from the shapes discovered in the environment cell.
print("Initializing MAPPO agent...")

agent = MAPPOAgent(
    camera_shape=cam_shape,
    vector_shape=vec_shape,
    action_dim=action_shape[0],
    num_agents=num_agents,
    config=config
)

# Load pretrained feature extractor if available
PRETRAINED_PATH = "SavedModels/feature_extractor_contrastive_init.pth"
if os.path.exists(PRETRAINED_PATH):
    print(f"\n✓ Loading pretrained features from {PRETRAINED_PATH}")
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code. Only load checkpoints you trust; consider weights_only=True if the
    # file holds a plain state dict and your PyTorch version supports it.
    state_dict = torch.load(PRETRAINED_PATH, map_location=device)
    # strict=False tolerates key mismatches, so inspect the load report instead
    # of assuming success: a checkpoint whose keys do not match would otherwise
    # leave the encoder at its random initialisation without any warning.
    load_result = agent.vision_encoder.load_state_dict(state_dict, strict=False)
    missing_keys = list(getattr(load_result, "missing_keys", ()))
    unexpected_keys = list(getattr(load_result, "unexpected_keys", ()))
    if missing_keys or unexpected_keys:
        print(f"⚠️  Pretrained features only partially loaded: "
              f"{len(missing_keys)} encoder keys missing from the file, "
              f"{len(unexpected_keys)} file keys unused")
        if missing_keys:
            print(f"   Missing (first 5): {missing_keys[:5]}")
        if unexpected_keys:
            print(f"   Unused (first 5): {unexpected_keys[:5]}")
    else:
        print("✓ Pretrained features loaded successfully")
else:
    print(f"\n Pretrained features not found at {PRETRAINED_PATH}")
    print("   Training from scratch (will take longer)")

# Count parameters
def count_parameters(model):
    """Return the total element count of the model's parameters that have ``requires_grad`` set."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report the trainable-parameter count of each sub-network and their sum.
print("\n✓ Agent initialized")
actor_params = count_parameters(agent.actor)
critic_params = count_parameters(agent.critic)
encoder_params = count_parameters(agent.vision_encoder)
print(f"  Actor parameters: {actor_params:,}")
print(f"  Critic parameters: {critic_params:,}")
print(f"  Vision encoder parameters: {encoder_params:,}")
# NOTE(review): plain sum of the three counts; it would double-count if these
# modules share parameters — confirm against MAPPOAgent.
print(f"  Total trainable parameters: {actor_params + critic_params + encoder_params:,}")

Initializing MAPPO agent...

✓ Loading pretrained features from SavedModels/feature_extractor_contrastive_init.pth
✓ Pretrained features loaded successfully

✓ Agent initialized
  Actor parameters: 167,432
  Critic parameters: 1,053,700
  Vision encoder parameters: 389,664
  Total trainable parameters: 1,610,796


## 5.1 Resume Training (Optional)

If you want to resume training from a checkpoint, set `RESUME_TRAINING = True` and specify the checkpoint path.

In [7]:
# ==================== RESUME TRAINING CONFIGURATION ====================
# Flip RESUME_TRAINING to True to restore agent weights from CHECKPOINT_PATH.
# NOTE(review): this notebook writes checkpoints under SAVE_DIR
# (./saved_models_mappo_no_curriculum); the default path below points at a
# different folder, so update it before resuming a run from this notebook.
RESUME_TRAINING = False
CHECKPOINT_PATH = "saved_models_mappo/mappo_final.pth"
# ========================================================================

if not RESUME_TRAINING:
    print("\n✓ Starting fresh training (no checkpoint loaded)\n")
elif not os.path.exists(CHECKPOINT_PATH):
    # Requested a resume but the file is absent: warn and fall back to a fresh run.
    print(f"\n⚠️  WARNING: Checkpoint not found at {CHECKPOINT_PATH}")
    print("   Starting training from scratch\n")
    RESUME_TRAINING = False
else:
    print(f"\n{'='*60}")
    print("  RESUMING FROM CHECKPOINT")
    print(f"{'='*60}")
    print(f"Loading checkpoint: {CHECKPOINT_PATH}")

    agent.load(CHECKPOINT_PATH)

    # Only the agent is restored here; the training cell re-initialises its
    # own counters, hence the reminder below.
    print("✓ Checkpoint loaded successfully")
    print("\nNote: You may want to adjust the following in the training loop:")
    print("  - total_steps (to continue from where you left off)")
    print("  - num_updates (checkpoint_steps / rollout_length)")
    print("  - best_reward (to your best known reward)")
    print(f"{'='*60}\n")


✓ Starting fresh training (no checkpoint loaded)



## 6. Initialize Rollout Buffer

In [8]:
# Allocate the on-policy rollout buffer. It stores encoded observations
# (length agent.encoded_obs_dim per agent), not raw camera frames.
print("Initializing rollout buffer...")

rollout_len = config['rollout_length']
encoded_dim = agent.encoded_obs_dim

buffer = RolloutBuffer(
    num_steps=rollout_len,
    num_agents=num_agents,
    obs_shape=(encoded_dim,),
    action_dim=action_shape[0],
    gamma=config['gamma'],
    gae_lambda=config['gae_lambda']
)

# Rough size of one observation array only, at 4 bytes per element
# (assumes float32 storage — TODO confirm against RolloutBuffer).
approx_obs_mb = (rollout_len * num_agents * encoded_dim * 4) / 1e6

print(f"✓ Buffer created (capacity: {rollout_len} steps)")
print(f"  Memory per rollout: ~{approx_obs_mb:.1f} MB")

Initializing rollout buffer...
✓ Buffer created (capacity: 2048 steps)
  Memory per rollout: ~12.6 MB


## 7. Initialize Logging (Weights & Biases)

In [None]:
# Start a Weights & Biases run (unless disabled in the configuration cell).
# Project and entity can be overridden through the WANDB_PROJECT /
# WANDB_ENTITY environment variables; the run name carries a start timestamp.
if not USE_WANDB:
    print("W&B logging disabled")
else:
    started_at = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"mappo_no_curriculum_{started_at}"
    wandb.init(
        project=os.getenv("WANDB_PROJECT", "MAPPO_Drones"),
        entity=os.getenv("WANDB_ENTITY", "fede-"),
        name=run_name,
        config=config
    )
    print(f"✓ W&B initialized: {run_name}")
    print(f"  View at: https://wandb.ai/{wandb.run.entity}/{wandb.run.project}/runs/{wandb.run.id}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrullofederico16[0m ([33mfede-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


✓ W&B initialized: mappo_no_curriculum_20251121_120751
  View at: https://wandb.ai/fede-/MAPPO_Drones/runs/xl5ci91p


## 8. Training Loop

This cell runs the main training loop:
1. **Collection Phase**: Collect `rollout_length` steps (2048)
2. **Advantage Computation**: Calculate advantages using GAE
3. **Update Phase**: Train policy for `ppo_epochs` (3) epochs
4. **Logging**: Track metrics and save checkpoints

**Note:** This will run for ~24 hours if training to 3M steps. You can:
- Stop anytime with Interrupt Kernel
- Resume later using saved checkpoints
- Reduce `config['max_steps']` for shorter training

In [None]:
# ----------------------------------------------------------------------
# Main MAPPO training cell.
# Repeats until config['max_steps'] environment steps were collected:
#   1. collect config['rollout_length'] transitions into `buffer`,
#   2. compute returns/advantages and run one agent.train(buffer) update,
#   3. log rolling statistics and save checkpoints.
# Relies on objects created by earlier cells: env, agent, buffer, config,
# device, num_agents, cam_shape, vec_shape, blank_cam, blank_vec.
# ----------------------------------------------------------------------

STATS_WINDOW = 100                # Number of most recent episodes in the rolling statistics
SUCCESS_REWARD_THRESHOLD = 15.0   # An episode is a success if any agent's final raw step reward exceeds this


def _stack_observations(obs_dict, agent_ids):
    """Stack per-agent observations into (num_agents, ...) camera and vector batches.

    Row i belongs to agent_ids[i]. Agents that are absent from obs_dict get
    the all-zero blank_cam / blank_vec placeholders.
    """
    cam_batch = np.zeros((num_agents, *cam_shape), dtype=np.float32)
    vec_batch = np.zeros((num_agents, *vec_shape), dtype=np.float32)
    for row, agent_id in enumerate(agent_ids):
        if agent_id in obs_dict:
            cam, vec = get_agent_obs(obs_dict, agent_id)
        else:
            cam, vec = blank_cam, blank_vec
        cam_batch[row] = cam
        vec_batch[row] = vec
    return cam_batch, vec_batch


# Training state
total_steps = 0
num_updates = 0
episode_rewards = []
episode_lengths = []
episode_successes = []

# Current episode tracking
current_episode_reward = np.zeros(num_agents)
current_episode_length = 0

# Reward normalization (optional)
reward_normalizer = RunningMeanStd() if NORMALIZE_REWARDS else None

# Best model tracking
best_reward = -float('inf')
success_rate = 0.0

# Reset environment
obs = env.reset()
agents = relocate_agents(env)

print("="*60)
print("STARTING TRAINING")
print("="*60)
print(f"Target steps: {config['max_steps']:,}")
print(f"Rollout length: {config['rollout_length']}")
print(f"Update every: {config['rollout_length']} steps")
print(f"PPO epochs per update: {config['ppo_epochs']}")
print(f"\nExpected updates: {config['max_steps'] // config['rollout_length']:,}")
# 125_000 steps/hour is a hard-coded throughput guess used only for this message.
print(f"Estimated time: ~{config['max_steps'] / 125_000:.1f} hours")
print("\nPress 'Interrupt Kernel' to stop training at any time.")
print(f"Training will save checkpoints every {SAVE_EVERY} updates.")
print("="*60)

try:
    while total_steps < config['max_steps']:

        # ================================================================
        # COLLECTION PHASE: Gather trajectories
        # ================================================================

        for _ in range(config['rollout_length']):
            # Check if environment needs reset
            if not obs:
                obs = env.reset()
                agents = relocate_agents(env)
                current_episode_reward = np.zeros(num_agents)
                current_episode_length = 0

            # Collect observations for all agents
            camera_obs, vector_obs = _stack_observations(obs, agents)

            # Get actions from policy
            actions, log_probs, values = agent.get_action(
                camera_obs,
                vector_obs,
                deterministic=False
            )

            # Step environment
            action_dict = {agent_id: action for agent_id, action in zip(agents, actions)}
            next_obs, reward_dict, done_dict, info_dict = env.step(action_dict)

            # Collect rewards and dones (agents missing from the dicts get 0 / not done)
            rewards = np.array([reward_dict.get(a, 0.0) for a in agents])
            dones = np.array([done_dict.get(a, False) for a in agents], dtype=np.float32)

            # Episode statistics use the raw, unclipped rewards
            current_episode_reward += rewards

            # Clip rewards
            train_rewards = np.clip(rewards, -config['reward_clip'], config['reward_clip'])

            # Compute intrinsic curiosity rewards
            camera_obs_next, vector_obs_next = _stack_observations(next_obs, agents)

            # Encode current and next observations once each. (The current
            # observations used to be encoded a second time before
            # get_action; that result was always overwritten here.)
            encoded_obs = agent.encode_observations(camera_obs, vector_obs)
            encoded_obs_next = agent.encode_observations(camera_obs_next, vector_obs_next)

            intrinsic_rewards = agent.compute_intrinsic_rewards(
                encoded_obs,
                encoded_obs_next,
                torch.from_numpy(actions).float().to(device)
            )

            total_rewards = train_rewards + intrinsic_rewards * config['curiosity_coef']

            # Normalize rewards (optional)
            if NORMALIZE_REWARDS:
                reward_normalizer.update(total_rewards)
                total_rewards = (total_rewards - reward_normalizer.mean) / (reward_normalizer.std + 1e-8)

            # Store transition in buffer
            buffer.store(
                obs=encoded_obs.detach().cpu().numpy(),
                next_obs=encoded_obs_next.detach().cpu().numpy(),
                action=actions,
                reward=total_rewards,
                done=dones,
                value=values,
                log_prob=log_probs
            )

            # Update episode statistics
            current_episode_length += 1
            total_steps += 1

            # Check for episode end: any agent done ends the episode for all
            if any(dones) or all(done_dict.values()):
                # Kept in its own name so it cannot be mistaken for the
                # rolling `mean_reward` used by logging and best-model tracking.
                episode_mean_reward = current_episode_reward.mean()
                episode_rewards.append(episode_mean_reward)
                episode_lengths.append(current_episode_length)

                # Check success (adjust threshold based on your task)
                success = np.any(rewards > SUCCESS_REWARD_THRESHOLD)
                episode_successes.append(float(success))

                # Reset
                obs = env.reset()
                agents = relocate_agents(env)
                current_episode_reward = np.zeros(num_agents)
                current_episode_length = 0
            else:
                obs = next_obs

        # ================================================================
        # UPDATE PHASE: Train policy with collected data
        # ================================================================

        # Get final value estimates for GAE
        camera_obs_final, vector_obs_final = _stack_observations(obs, agents)
        _, _, last_values = agent.get_action(camera_obs_final, vector_obs_final)

        # Compute returns and advantages using GAE
        buffer.compute_returns_and_advantages(last_values)

        # Update policy
        train_stats = agent.train(buffer)
        num_updates += 1

        # ================================================================
        # LOGGING PHASE: Record metrics
        # ================================================================

        # Rolling statistics are refreshed on every update (not only on
        # logging updates) because best-model tracking below depends on them.
        mean_reward = np.mean(episode_rewards[-STATS_WINDOW:]) if episode_rewards else 0.0
        mean_length = np.mean(episode_lengths[-STATS_WINDOW:]) if episode_lengths else 0.0
        success_rate = np.mean(episode_successes[-STATS_WINDOW:]) if episode_successes else 0.0

        if num_updates % LOG_EVERY == 0:
            # Console logging
            print(f"\nStep {total_steps:,} | Update {num_updates}")
            print(f"  Reward (100ep):   {mean_reward:8.2f}")
            print(f"  Success rate:     {success_rate:8.1%}")
            print(f"  Episode length:   {mean_length:8.1f}")
            print(f"  Policy loss:      {train_stats['policy_loss']:8.4f}")
            print(f"  Value loss:       {train_stats['value_loss']:8.4f}")
            print(f"  Entropy:          {train_stats['entropy']:8.4f}")
            print(f"  KL divergence:    {train_stats['approx_kl']:8.4f}")
            print(f"  Clip fraction:    {train_stats['clip_fraction']:8.1%}")
            print(f"  Explained var:    {train_stats['explained_variance']:8.1%}")

            # W&B logging
            if USE_WANDB:
                wandb.log({
                    'train/reward_mean': mean_reward,
                    'train/success_rate': success_rate,
                    'train/episode_length': mean_length,
                    'train/policy_loss': train_stats['policy_loss'],
                    'train/value_loss': train_stats['value_loss'],
                    'train/entropy': train_stats['entropy'],
                    'train/approx_kl': train_stats['approx_kl'],
                    'train/clip_fraction': train_stats['clip_fraction'],
                    'train/explained_variance': train_stats['explained_variance'],
                    'train/total_steps': total_steps,
                }, step=total_steps)

        # ================================================================
        # CHECKPOINT PHASE: Save model
        # ================================================================

        if num_updates % SAVE_EVERY == 0:
            save_path = SAVE_DIR / f"mappo_checkpoint_{total_steps:08d}.pth"
            agent.save(save_path)
            print(f"  ✓ Checkpoint saved: {save_path.name}")

        # Save best model (compared on the rolling mean computed above)
        if episode_rewards and mean_reward > best_reward:
            best_reward = mean_reward
            save_path = SAVE_DIR / "mappo_best.pth"
            agent.save(save_path)
            print(f"  ✓ New best model saved: {mean_reward:.2f}")


except KeyboardInterrupt:
    # Interrupt Kernel lands here; fall through to the final save below.
    print("\n" + "="*60)
    print("  TRAINING INTERRUPTED")
    print("="*60)
    print(f"Completed {total_steps:,} steps ({num_updates} updates)")
    print("Saving final checkpoint...")

# Save final model (runs after normal completion and after an interrupt)
final_path = SAVE_DIR / "mappo_final.pth"
agent.save(final_path)
print(f"\n✓ Final model saved: {final_path}")

print("\n" + "="*60)
print("  TRAINING COMPLETE")
print("="*60)
print(f"Total steps: {total_steps:,}")
print(f"Total updates: {num_updates}")
if episode_rewards:
    print(f"Final reward: {np.mean(episode_rewards[-STATS_WINDOW:]):.2f}")
if episode_successes:
    print(f"Final success rate: {np.mean(episode_successes[-STATS_WINDOW:]):.1%}")
print("="*60)

STARTING TRAINING
Target steps: 3,000,000
Rollout length: 2048
Update every: 2048 steps
PPO epochs per update: 3

Expected updates: 1,464
Estimated time: ~24.0 hours

Press 'Interrupt Kernel' to stop training at any time.
Training will save checkpoints every 10 updates.

DEBUG: Buffer returned shapes:
  obs:           torch.Size([2048, 4, 384])
  actions:       torch.Size([2048, 4, 4])
  returns:       torch.Size([2048, 4])
  advantages:    torch.Size([2048, 4])
  old_log_probs: torch.Size([2048, 4])

Expected shapes:
  obs:           (2048, 4, 384)
  actions:       (2048, 4, 4)
  returns:       (2048, 4)


Step 2,048 | Update 1
  Reward (100ep):      88.50
  Success rate:         0.0%
  Episode length:      110.7
  Policy loss:        0.0039
  Value loss:        79.1156
  Entropy:            5.6757
  KL divergence:      0.0059
  Clip fraction:        2.3%
  Explained var:       55.5%
Model saved to saved_models_mappo\mappo_best.pth
  ✓ New best model saved: 88.50

DEBUG: Buffer return

## 9. Close Environment

In [None]:
# Shut down the Unity environment, then finish the W&B run.
# try/finally ensures the W&B run is still finished if closing the
# environment raises (for example because the Unity process already exited).
try:
    env.close()
    print("✓ Environment closed")
finally:
    if USE_WANDB:
        wandb.finish()
        print("✓ W&B run finished")

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


✓ Environment closed


0,1
train/approx_kl,▂▁▂▁▂▂▁▂▁▂▂▂▂▁▂█▂▂▂▂▁▂▁▁▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁
train/clip_fraction,█▃▆▃▄▅▄▃▅▅▃▄▄▃▃▃▄▇▄▃▄▄▃▄▃▃▁▁▃▁▂▁▂▂▁▂▂▂▁▂
train/entropy,▁▆███▇▆▆▅▅█▇▇▆▆▆▆▅▄▄▄▄▄▄▃▃▃▄▅▅▅▅▅▄▄▄▄▄▄▂
train/episode_length,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▆███▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/explained_variance,▁▂▃▄▄▃▃▄▅▄▄▇▁▃███▆▅▅▅▄▅▅▄▄▄▇▇▇▇▇▇▇▇▇▇▇██
train/policy_loss,▅▇▁▁▃▅▃▅█▃▃▅▂▅▄▃▃▃▂▃▃▃▃▂▃▂▂▂▂▁▂▂▃▃▂▂▁▁▁▂
train/reward_mean,▁▁▁▁▂▂▂▂▃▃▄▆▆▇█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/success_rate,▅██▇▇███████████████████▁▂▂▄▃▅▆▇▇▆▆▆▇█▆▇
train/total_steps,▁▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇█████
train/value_loss,▄▅▄▅▅▆▆▇▆▇▇▆▃▁▁▁▁▂▁▁▆▆▆▇█▇▇▇▇▆▅▆▆▅▅▄▄▅▅▅

0,1
train/approx_kl,0.00199
train/clip_fraction,0.01721
train/entropy,2.37577
train/episode_length,85.23
train/explained_variance,0.91888
train/policy_loss,-0.00076
train/reward_mean,42.58429
train/success_rate,0.96
train/total_steps,3000320.0
train/value_loss,52.80027


✓ W&B run finished
