Copyright (c) 2021, salesforce.com, inc.\
All rights reserved.\
SPDX-License-Identifier: BSD-3-Clause\
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

Get started quickly with end-to-end multi-agent RL using WarpDrive! This notebook shows a basic example of how to create a simple multi-agent Tag environment and start training. For more configuration options and in-depth explanations, check out the other tutorials and the source code.

**Try this notebook on [Colab](http://colab.research.google.com/github/salesforce/warp-drive/blob/master/tutorials/simple-end-to-end-example.ipynb)!**

## ⚠️ PLEASE NOTE:
This notebook runs on a GPU runtime.\
If running on Colab, choose Runtime > Change runtime type from the menu, then select 'GPU' in the dropdown.

### Dependencies

You can install the warp_drive package either

- using the pip package manager, OR
- by cloning the warp_drive repository and installing the requirements.

On Colab, we will do the latter.

In [None]:
# Install WarpDrive. This cell relies on IPython syntax (`!` shell escapes and the
# `%cd` magic), so it only runs inside a notebook, not as a plain Python script.
import sys
# True when the 'google.colab' module has already been loaded into this session.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    # Clone the repository, change into it, and install it in editable mode.
    ! git clone https://github.com/salesforce/warp-drive.git 
    % cd warp-drive
    ! pip install -e .
else:
    # Otherwise install the rl_warp_drive distribution with pip.
    ! pip install rl_warp_drive

In [None]:
import torch

from example_envs.tag_continuous.tag_continuous import TagContinuous
from warp_drive.env_wrapper import EnvWrapper
from warp_drive.training.trainer import Trainer
from warp_drive.training.utils.data_loader import create_and_push_data_placeholders

# Allocate an 8-element float tensor through torch.cuda.
# NOTE(review): judging by the variable name, this presumably exists to make PyTorch
# initialize CUDA up front (and to fail early when no GPU is available) — confirm.
pytorch_cuda_init_success = torch.cuda.FloatTensor(8)

# Environment, Training, and Model Hyperparameters

In [None]:
# Run configuration for the continuous Tag example, expressed as one nested
# dictionary with environment, trainer, per-policy network, and saving sections.
run_config = {
    "name": "tag_continuous",
    # ---- Environment settings ----
    "env": {
        "num_taggers": 5,
        "num_runners": 20,
        "episode_length": 100,
        "seed": 1234,
        "use_full_observation": False,
        "num_other_agents_observed": 10,
        "tagging_distance": 0.02,
    },
    # ---- Trainer settings ----
    "trainer": {
        # Number of environment replicas (number of GPU blocks used).
        "num_envs": 100,
        # Total batch size used for training per iteration (across all the environments).
        "train_batch_size": 10000,
        # Total number of episodes to run the training for (can be arbitrarily high!).
        "num_episodes": 5000,
        # Trainer algorithm.
        "algorithm": "A2C",
        # Loss coefficient for the value function loss.
        "vf_loss_coeff": 1,
        # Coefficient for the entropy component of the loss.
        "entropy_coeff": 0.05,
        # Flag indicating whether to clip the gradient norm or not.
        "clip_grad_norm": True,
        # The clip level, used when clip_grad_norm is True.
        "max_grad_norm": 0.5,
        # Flag indicating whether to normalize the advantage or not.
        "normalize_advantage": False,
        # Flag indicating whether to normalize the return or not.
        "normalize_return": False,
    },
    # ---- Policy network settings (one entry per policy) ----
    "policy": {
        "runner": {
            "to_train": True,
            "name": "fully_connected",
            # Discount rate (gamma).
            "gamma": 0.98,
            # Learning rate.
            "lr": 0.005,
            "model": {
                # Dimension(s) of the fully connected layers, as a list.
                "fc_dims": [256, 256],
                # Load model parameters from a saved checkpoint (if specified).
                "model_ckpt_filepath": "",
            },
        },
        "tagger": {
            "to_train": True,
            "name": "fully_connected",
            "gamma": 0.98,
            "lr": 0.002,
            "model": {
                "fc_dims": [256, 256],
                "model_ckpt_filepath": "",
            },
        },
    },
    # ---- Checkpoint saving settings ----
    "saving": {
        # How often (in iterations) to print the metrics.
        "print_metrics_freq": 10,
        # How often (in iterations) to save the model parameters.
        "save_model_params_freq": 5000,
        # Base folder used for saving.
        "basedir": "/tmp",
        "tag": "continuous-tag-experiment",
    },
}

# End-to-End Training Loop

In [None]:
# Create a wrapped environment object via the EnvWrapper.
# use_cuda must be set to True in order to run the environment on the GPU.
env_wrapper = EnvWrapper(
    TagContinuous(**run_config["env"]),
    num_envs=run_config["trainer"]["num_envs"],
    use_cuda=True,
)

# Agents can share policy models: this dictionary maps policy model names to agent ids.
policy_tag_to_agent_id_map = {
    "tagger": list(env_wrapper.env.taggers),
    "runner": list(env_wrapper.env.runners),
}

# Create the trainer object from the wrapped environment and the run configuration.
trainer = Trainer(
    env_wrapper=env_wrapper,
    config=run_config,
    policy_tag_to_agent_id_map=policy_tag_to_agent_id_map,
)

# Create and push data placeholders to the device.
create_and_push_data_placeholders(
    env_wrapper,
    policy_tag_to_agent_id_map,
    training_batch_size_per_env=trainer.training_batch_size_per_env,
)

# Perform training! The try/finally guarantees that the trainer is shut down even when
# training raises or the cell is interrupted; any exception still propagates afterwards.
try:
    trainer.train()
finally:
    # Shut off gracefully.
    trainer.graceful_close()

### Learn more and explore our tutorials

To learn more about WarpDrive, take a look at these tutorials:
- [WarpDrive basics](https://www.github.com/salesforce/warp-drive/blob/master/tutorials/tutorial-1-warp_drive_basics.ipynb)
- [WarpDrive sampler](https://www.github.com/salesforce/warp-drive/blob/master/tutorials/tutorial-2-warp_drive_sampler.ipynb)
- [WarpDrive reset and log](https://www.github.com/salesforce/warp-drive/blob/master/tutorials/tutorial-3-warp_drive_reset_and_log.ipynb)
- [Creating custom environments](https://www.github.com/salesforce/warp-drive/blob/master/tutorials/tutorial-4-create_custom_environments.ipynb)
- [Training with WarpDrive](https://www.github.com/salesforce/warp-drive/blob/master/tutorials/tutorial-5-training_with_warp_drive.ipynb)