Copyright (c) 2021, salesforce.com, inc.\
All rights reserved.\
SPDX-License-Identifier: BSD-3-Clause\
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

Get started quickly with end-to-end multi-agent RL using WarpDrive! This notebook walks through a basic example: creating a simple multi-agent Tag environment and training agents in it.

# Dependencies

We will install WarpDrive (version 1.6.5 or later), together with compatible pinned versions of PyTorch, torchvision, and torchtext, using the pip package manager.

In [1]:
# Use the %pip magic (rather than !pip) so the packages are installed into the
# environment of the running kernel, not whichever `pip` is first on the shell PATH.
%pip install --quiet "rl-warp-drive>=1.6.5" "torch==1.10.*" "torchvision==0.11.*" "torchtext==0.11.*"

In [2]:
import torch

# This notebook requires a CUDA-capable GPU: stop right away if PyTorch cannot see one.
assert torch.cuda.device_count() >= 1, "This notebook needs a GPU to run!"

In [3]:
from example_envs.tag_continuous.tag_continuous import TagContinuous
from warp_drive.env_wrapper import EnvWrapper
from warp_drive.training.trainer import Trainer

# Allocate a small 8-element float tensor through torch.cuda.
# NOTE(review): judging by the variable name, this presumably exists only to make
# PyTorch initialize CUDA up front, before the environment and trainer are
# created -- confirm before changing or removing it.
pytorch_cuda_init_success = torch.cuda.FloatTensor(8)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  reg.get_or_register_dtype("bool", np.bool)


In [4]:
# Choose how verbose the logs should be (e.g., DEBUG, INFO, WARNING, ERROR).
import logging

# Only messages at ERROR severity or above pass through the root logger.
logging.root.setLevel(logging.ERROR)

# Environment, Training, and Model Hyperparameters

In [5]:
# Run configuration for this experiment.
# Note: the values given here override some of the default configurations in
# 'warp_drive/training/run_configs/default_configs.yaml'.
run_config = {
    "name": "tag_continuous",
    # Environment settings
    "env": {
        "num_taggers": 5,
        "num_runners": 20,
        "episode_length": 100,
        "seed": 1234,
        "use_full_observation": False,
        "num_other_agents_observed": 10,
        "tagging_distance": 0.02,
    },
    # Trainer settings
    "trainer": {
        # number of environment replicas (number of GPU blocks used)
        "num_envs": 100,
        # total batch size used for training per iteration (across all the environments)
        "train_batch_size": 10000,
        # total number of episodes to run the training for (can be arbitrarily high!)
        "num_episodes": 5000,
    },
    # Policy network settings (one entry per policy; each has its own model settings)
    "policy": {
        "runner": {
            "to_train": True,  # flag indicating whether the model needs to be trained
            "algorithm": "A2C",  # algorithm used to train the policy
            "gamma": 0.98,  # discount rate
            "lr": 0.005,  # learning rate
            # policy model settings
            "model": {
                "type": "fully_connected",
                "fc_dims": [256, 256],
                "model_ckpt_filepath": "",
            },
        },
        "tagger": {
            "to_train": True,
            "algorithm": "A2C",
            "gamma": 0.98,
            "lr": 0.002,
            "model": {
                "type": "fully_connected",
                "fc_dims": [256, 256],
                "model_ckpt_filepath": "",
            },
        },
    },
    # Checkpoint saving setting
    "saving": {
        "metrics_log_freq": 10,  # how often (in iterations) to print the metrics
        "model_params_save_freq": 5000,  # how often (in iterations) to save the model parameters
        "basedir": "/tmp",  # base folder used for saving
        "name": "continuous_tag",  # experiment name
        "tag": "example",  # experiment tag
    },
}

# End-to-End Training Loop

In [6]:
# Create a wrapped environment object via the EnvWrapper.
# Ensure that use_cuda is set to True (in order to run on the GPU).
env_wrapper = EnvWrapper(
    TagContinuous(**run_config["env"]),
    num_envs=run_config["trainer"]["num_envs"],
    use_cuda=True,
)

# Agents can share policy models: this dictionary maps policy model names to agent ids.
# The keys must match the policy names used under run_config["policy"].
policy_tag_to_agent_id_map = {
    "tagger": list(env_wrapper.env.taggers),
    "runner": list(env_wrapper.env.runners),
}

# Create the trainer object
trainer = Trainer(
    env_wrapper=env_wrapper,
    config=run_config,
    policy_tag_to_agent_id_map=policy_tag_to_agent_id_map,
)

# Perform training!
# The try/finally guarantees that the trainer is shut down even when training
# raises or the cell is interrupted; otherwise whatever the trainer holds
# (presumably GPU-side state -- confirm against Trainer.graceful_close) would
# linger in the kernel until it is restarted.
try:
    trainer.train()
finally:
    # Shut off gracefully
    trainer.graceful_close()

  deprecation(




Device: 0
Iterations Completed                    : 1 / 50
Speed performance stats
Mean policy eval time per iter (ms)     :     736.01
Mean action sample time per iter (ms)   :      20.47
Mean env. step time per iter (ms)       :      49.93
Mean training time per iter (ms)        :      52.36
Mean total time per iter (ms)           :     869.70
Mean steps per sec (policy eval)        :   13586.76
Mean steps per sec (action sample)      :  488403.35
Mean steps per sec (env. step)          :  200300.23
Mean steps per sec (training time)      :  190969.84
Mean steps per sec (total)              :   11498.16
Metrics for policy 'runner'
VF loss coefficient                     :    0.01000
Entropy coefficient                     :    0.05000
Total loss                              :    0.82353
Policy loss                             :    1.06061
Value function loss                     :    0.25759
Mean rewards                            :    0.00146
Max. rewards                           