# Multi-Agent Reinforcement Learning.

Multi-agent reinforcement learning (MARL) is a deep learning paradigm that uses two or more agents to train reinforcement learning models. MARL essentially uses several standard reinforcement learning agents with connected reward functions in order to train each model.

MARL is primarily used in cooperative and competitive environments such as chess, Go, and tic-tac-toe.

In this project, we are going to train a MARL model, based on the Deep Q-Network models we made in class, to play chess using the environment provided by `PettingZoo`, a library of environments similar to the `gym` environments we used in class, but with support for multiplayer games.

This project will be based on [this tutorial](https://pettingzoo.farama.org/tutorials/tianshou/advanced/) from PettingZoo. 

We will also be using the `Tianshou` framework to create and train our model. This makes creating our model architecture and training functions easier, and allows us to focus on how the multiple agents interact with each other.

# Setup

We need to install PettingZoo and Tianshou, along with some other packages, to get started. We will also import a lot of libraries as usual.

In [1]:
import sys
# True when the `google.colab` module has already been loaded into this
# interpreter, which is how this notebook detects that it is running on Colab.
IN_COLAB = "google.colab" in sys.modules

# The `!` lines below are IPython shell escapes, so this cell only runs inside
# a notebook. Nothing is installed outside Colab; there the packages are
# expected to be available already.
if IN_COLAB:
  # Python packages (pip) and system packages (apt) are installed one by one.
  # NOTE(review): the opengl/ffmpeg/xvfb/pyvirtualdisplay entries look like
  # support for rendering on a machine without a display - confirm they are
  # all still needed.
  !pip install pettingzoo[classic]
  !pip install stable-baselines3[extra]
  !pip install supersuit
  !apt install python-opengl
  !apt install ffmpeg
  !apt install xvfb
  !pip install pyvirtualdisplay
  !apt autoremove
  !pip install tianshou
  !pip install pygame

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pettingzoo[classic]
  Downloading PettingZoo-1.22.3-py3-none-any.whl (816 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m816.1/816.1 KB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
Collecting gymnasium>=0.26.0
  Downloading gymnasium-0.27.1-py3-none-any.whl (883 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 KB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rlcard==1.0.5
  Downloading rlcard-1.0.5.tar.gz (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.1/251.1 KB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hanabi-learning-environment==0.0.4
  Downloading hanabi_learning_environment-0.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.5/1

In [1]:
import argparse
import os
from copy import deepcopy
from typing import Optional, Tuple

import gym
import numpy as np
import torch
import pygame
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import Net
from torch.utils.tensorboard.writer import SummaryWriter

from pettingzoo.classic import chess_v5

  from .autonotebook import tqdm as notebook_tqdm


We will use the `argparse` library to format and collect the parameters we need for our model.

In [6]:
def get_parser() -> argparse.ArgumentParser:
    """Build the argument parser that carries every hyperparameter of a run.

    Options are declared in a table and registered in table order, so the
    order shown by ``--help`` follows the table. Option names use dashes on
    the command line and become underscore attributes on the parsed namespace
    (``--eps-test`` -> ``args.eps_test``).

    Returns:
        A fresh parser; each call builds new default objects, so the default
        ``hidden_sizes`` list is never shared between parsers.
    """
    default_device = "cuda" if torch.cuda.is_available() else "cpu"

    # (flag, keyword arguments for add_argument), in registration order.
    option_table = [
        ("--seed", {"type": int, "default": 1626}),
        ("--eps-test", {"type": float, "default": 0.05}),
        ("--eps-train", {"type": float, "default": 0.1}),
        ("--buffer-size", {"type": int, "default": 20000}),
        ("--lr", {"type": float, "default": 1e-4}),
        (
            "--gamma",
            {
                "type": float,
                "default": 0.9,
                "help": "a smaller gamma favors earlier win",
            },
        ),
        ("--n-step", {"type": int, "default": 3}),
        ("--target-update-freq", {"type": int, "default": 320}),
        # we only need to train for a few epochs
        ("--epoch", {"type": int, "default": 10}),
        ("--step-per-epoch", {"type": int, "default": 1000}),
        ("--step-per-collect", {"type": int, "default": 10}),
        ("--update-per-step", {"type": float, "default": 0.1}),
        ("--batch-size", {"type": int, "default": 64}),
        (
            "--hidden-sizes",
            {"type": int, "nargs": "*", "default": [128, 128, 128, 128]},
        ),
        ("--training-num", {"type": int, "default": 10}),
        ("--test-num", {"type": int, "default": 10}),
        ("--logdir", {"type": str, "default": "log"}),
        ("--render", {"type": float, "default": 0.1}),
        (
            "--win-rate",
            {
                "type": float,
                "default": 0.6,
                "help": "the expected winning rate: Optimal policy can get 0.7",
            },
        ),
        (
            "--watch",
            {
                "action": "store_true",
                "default": False,
                "help": "no training, watch the play of pre-trained models",
            },
        ),
        (
            "--agent-id",
            {
                "type": int,
                "default": 2,
                "help": "the learned agent plays as the agent_id-th player. "
                "Choices are 1 and 2.",
            },
        ),
        (
            "--resume-path",
            {
                "type": str,
                "default": "",
                "help": "the path of agent pth file for resuming from "
                "a pre-trained agent",
            },
        ),
        (
            "--opponent-path",
            {
                "type": str,
                "default": "",
                "help": "the path of opponent agent pth file for resuming from "
                "a pre-trained agent",
            },
        ),
        ("--device", {"type": str, "default": default_device}),
    ]

    cli = argparse.ArgumentParser()
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli

In [7]:
def get_args() -> argparse.Namespace:
    """Parse the run's hyperparameters from the process command line.

    ``parse_known_args`` is used rather than ``parse_args`` so that command
    line entries the parser does not declare are ignored instead of aborting.

    Returns:
        The namespace of recognised options; unrecognised ones are discarded.
    """
    recognised, _ignored = get_parser().parse_known_args()
    return recognised

In [8]:
def get_env(render_mode=None):
    """Create a new chess environment wrapped in Tianshou's ``PettingZooEnv``.

    Args:
        render_mode: Forwarded unchanged to ``chess_v5.env``. ``watch`` in
            this file passes ``"human"``; the default ``None`` is used for
            training and testing environments.

    Returns:
        A freshly constructed ``PettingZooEnv`` around a new chess game.
    """
    chess_game = chess_v5.env(render_mode=render_mode)
    return PettingZooEnv(chess_game)

# Architecture

Each agent in our model will be a Deep Q-Network.

In [9]:
# This is a rewrite of Tianshou's Net class
# Original source at: https://tianshou.readthedocs.io/en/master/_modules/tianshou/utils/net/common.html
class Network(nn.Module):
    """Fully connected network mapping a flattened observation to action values.

    The layer stack is ``Linear -> ReLU`` once per entry of ``hidden_sizes``,
    followed by a final ``Linear`` with no activation. The modules are stored
    in ``self.layers`` in exactly that order, which fixes the parameter names
    used in saved state dicts.

    Args:
        state_shape: Shape of one observation; its product is the input width.
        action_shape: Shape (or count) of the action space; its product times
            ``num_atoms`` is the output width.
        hidden_sizes: Width of each hidden layer; may be empty, in which case
            the network is a single linear layer.
        device: Device that inputs are moved to in ``forward``. ``None`` leaves
            tensor inputs on their current device.
        num_atoms: Multiplier applied to the output width.
    """

    def __init__(self, state_shape, action_shape, hidden_sizes, device="cpu", num_atoms=1) -> None:
        super().__init__()
        # get input and output dimensions from state and action space
        input_dim = int(np.prod(state_shape))
        output_dim = int(np.prod(action_shape)) * num_atoms
        # the size of our Linear layers is determined by the hidden_sizes param
        # We use ReLU activations as usual
        layer_sizes = [input_dim] + list(hidden_sizes)
        model = []
        for in_dim, out_dim in zip(layer_sizes[:-1], layer_sizes[1:]):
            model += [nn.Linear(in_dim, out_dim), nn.ReLU()]
        model += [nn.Linear(layer_sizes[-1], output_dim)]
        self.layers = nn.Sequential(*model)
        self.device = device

    def forward(self, x, state=None, info=None):
        """Compute the network output for a batch of observations.

        Args:
            x: Batch of observations (array-like or tensor) whose first axis
                is the batch; all remaining axes are flattened together.
            state: Passed through untouched and returned as the second item.
            info: Accepted for call compatibility and ignored. The default is
                ``None`` rather than a shared ``{}`` so no mutable object is
                reused between calls.

        Returns:
            ``(logits, state)`` where ``logits`` has one row per batch entry.
        """
        # Always convert: with device=None a NumPy input previously reached
        # ``flatten(1)`` unconverted and failed. ``torch.as_tensor`` accepts
        # device=None, and float32 matches the Linear layers' parameters.
        x = torch.as_tensor(x, device=self.device, dtype=torch.float32)
        x = x.flatten(1)
        logits = self.layers(x)
        return logits, state

In [10]:
def get_agents(
    args: Optional[argparse.Namespace] = None,
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[BasePolicy, Optional[torch.optim.Optimizer], list]:
    """Build the two-player policy manager for the chess environment.

    Args:
        args: Hyperparameter namespace. ``state_shape`` and ``action_shape``
            are written onto it. When omitted, a fresh namespace is parsed on
            every call, so no namespace is shared between calls.
        agent_learn: Policy being trained. When ``None`` a ``DQNPolicy`` around
            a new ``Network`` is created and, if ``args.resume_path`` is set,
            its weights are loaded from that file.
        agent_opponent: Policy to play against. When ``None`` it is either a
            copy of ``agent_learn`` loaded from ``args.opponent_path`` or, if
            that path is empty, a ``RandomPolicy``.
        optim: Optimizer for a newly created learner; an Adam optimizer with
            ``args.lr`` is created when it is ``None``.

    Returns:
        ``(policy, optim, agents)``: the ``MultiAgentPolicyManager``, the
        optimizer (``None`` if ``agent_learn`` was supplied without one), and
        the environment's agent names.
    """
    if args is None:
        args = get_args()

    def load_weights(target: BasePolicy, path: str) -> None:
        # map_location puts the tensors on the configured device whatever
        # device the checkpoint was saved from.
        target.load_state_dict(torch.load(path, map_location=args.device))

    env = get_env()

    # The environment exposes a dict-style space whose "observation" entry is
    # what the network consumes. Duck-typing on ``.spaces`` unwraps it exactly
    # once. The previous code indexed ["observation"] a second time after an
    # isinstance check against gym.spaces.Dict, which would fail whenever that
    # check matched.
    # NOTE(review): presumably the check never matched because the spaces come
    # from gymnasium rather than gym - confirm.
    observation_space = env.observation_space
    sub_spaces = getattr(observation_space, "spaces", None)
    if isinstance(sub_spaces, dict) and "observation" in sub_spaces:
        observation_space = sub_spaces["observation"]
    args.state_shape = observation_space.shape or observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n

    if agent_learn is None:
        # model
        net = Network(
            args.state_shape,
            args.action_shape,
            hidden_sizes=args.hidden_sizes,
            device=args.device,
        ).to(args.device)
        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=args.lr)
        agent_learn = DQNPolicy(
            net,
            optim,
            args.gamma,
            args.n_step,
            target_update_freq=args.target_update_freq,
        )
        if args.resume_path:
            load_weights(agent_learn, args.resume_path)

    if agent_opponent is None:
        if args.opponent_path:
            # Copy the learner for its architecture, then load the opponent
            # checkpoint into the COPY. Loading it into agent_learn (as
            # before) overwrote the learner's weights and left the opponent
            # holding the learner's.
            agent_opponent = deepcopy(agent_learn)
            load_weights(agent_opponent, args.opponent_path)
        else:
            # change between RandomPolicy() and agent_learn to train against
            # random agent or self play
            agent_opponent = RandomPolicy()

    # agent_id is 1-based: 1 puts the learner first, anything else second.
    if args.agent_id == 1:
        agents = [agent_learn, agent_opponent]
    else:
        agents = [agent_opponent, agent_learn]
    policy = MultiAgentPolicyManager(agents, env)
    return policy, optim, env.agents

# Training

We will use Adam for optimization as usual.

In [11]:
def train_agent(
    args: Optional[argparse.Namespace] = None,
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[dict, BasePolicy]:
    """Train the learning agent with Tianshou's off-policy trainer.

    Args:
        args: Hyperparameter namespace. When omitted, a fresh one is parsed on
            every call instead of sharing one created at definition time.
        agent_learn: Existing policy to keep training; created by
            ``get_agents`` when ``None``.
        agent_opponent: Policy to play against; chosen by ``get_agents`` when
            ``None``.
        optim: Optimizer forwarded to ``get_agents``.

    Returns:
        ``(result, learner)``: the trainer's result and the learned agent's
        policy taken from the policy manager.
    """
    if args is None:
        args = get_args()

    # ======== environment setup =========
    train_envs = DummyVectorEnv([get_env for _ in range(args.training_num)])
    test_envs = DummyVectorEnv([get_env for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # ======== agent setup =========
    policy, optim, agents = get_agents(
        args, agent_learn=agent_learn, agent_opponent=agent_opponent, optim=optim
    )
    # Name of the learned agent inside the policy manager (agent_id is 1-based).
    learner_name = agents[args.agent_id - 1]

    # ======== collector setup =========
    train_collector = Collector(
        policy,
        train_envs,
        VectorReplayBuffer(args.buffer_size, len(train_envs)),
        exploration_noise=True,
    )
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # policy.set_eps(1)
    # Collect some transitions into the replay buffer before training starts.
    train_collector.collect(n_step=args.batch_size * args.training_num)

    # ======== callback functions used during training =========
    def save_best_fn(policy):
        if hasattr(args, "model_save_path"):
            model_save_path = args.model_save_path
        else:
            model_save_path = os.path.join(
                args.logdir, "chess", "dqn", "policy.pth"
            )
        # A custom save path may point into a directory that does not exist yet.
        os.makedirs(os.path.dirname(model_save_path) or ".", exist_ok=True)
        torch.save(policy.policies[learner_name].state_dict(), model_save_path)

    def stop_fn(mean_rewards):
        return mean_rewards >= args.win_rate

    def train_fn(epoch, env_step):
        policy.policies[learner_name].set_eps(args.eps_train)

    def test_fn(epoch, env_step):
        policy.policies[learner_name].set_eps(args.eps_test)

    def reward_metric(rews):
        # Keep only the learned agent's column of the per-agent rewards.
        return rews[:, args.agent_id - 1]

    # ======== tensorboard logging setup =========
    log_path = os.path.join(args.logdir, "chess", "dqn")
    writer = SummaryWriter(log_path)
    try:
        writer.add_text("args", str(args))
        logger = TensorboardLogger(writer)

        # trainer
        result = offpolicy_trainer(
            policy,
            train_collector,
            test_collector,
            args.epoch,
            args.step_per_epoch,
            args.step_per_collect,
            args.test_num,
            args.batch_size,
            train_fn=train_fn,
            test_fn=test_fn,
            stop_fn=stop_fn,
            save_best_fn=save_best_fn,
            update_per_step=args.update_per_step,
            logger=logger,
            test_in_train=False,
            reward_metric=reward_metric,
        )
    finally:
        # This function is called repeatedly; close the writer so each call
        # flushes its events and releases its file handle.
        writer.close()

    return result, policy.policies[learner_name]

In [12]:
# ======== a test function that tests a pre-trained agent ======
def watch(
    args: Optional[argparse.Namespace] = None,
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
) -> None:
    """Play one rendered episode and print the learned agent's result.

    Args:
        args: Hyperparameter namespace. When omitted, a fresh one is parsed on
            every call instead of sharing one created at definition time.
        agent_learn: Policy to watch; built by ``get_agents`` when ``None``.
        agent_opponent: Policy to play against; chosen by ``get_agents`` when
            ``None``.
    """
    if args is None:
        args = get_args()

    env = DummyVectorEnv([lambda: get_env(render_mode="human")])
    # The optimizer returned by get_agents is not needed for playback.
    policy, _, agents = get_agents(
        args, agent_learn=agent_learn, agent_opponent=agent_opponent
    )
    policy.eval()
    policy.policies[agents[args.agent_id - 1]].set_eps(args.eps_test)
    collector = Collector(policy, env, exploration_noise=True)
    result = collector.collect(n_episode=1, render=args.render)
    rews, lens = result["rews"], result["lens"]
    # Rewards have one column per agent; report the learned agent's column.
    print(f"Final reward: {rews[:, args.agent_id - 1].mean()}, length: {lens.mean()}")

Now we can actually train and run our model. 

In [13]:
# Parse the default hyperparameters once. The cells below modify this
# namespace (epoch count, checkpoint paths) and pass it to train_agent/watch.
args = get_args()

In [None]:
# basic trainer (single run against the default opponent chosen in get_agents)
# result, agent = train_agent(args)

# mixed random and self play trainer
self_epochs = 10 #number of epochs to self play
rand_epochs = 5 #number of epochs to play against a random player
rand_agent = RandomPolicy()

# Warm-up phase: train a new agent against the random player first.
# args.epoch is overwritten before every train_agent call because it sets how
# many epochs that call runs.
args.epoch = rand_epochs
result, agent = train_agent(args, agent_opponent=rand_agent)

# Five rounds, each alternating a self-play phase with a random-opponent phase.
# `agent` is rebound to the policy returned by each call and fed into the next.
for i in range(5):
  #self play: the same policy object is used as both learner and opponent
  args.epoch = self_epochs
  result, agent = train_agent(args, agent_learn=agent, agent_opponent=agent)

  #random agent
  args.epoch = rand_epochs
  result, agent = train_agent(args, agent_learn=agent, agent_opponent=rand_agent)

# Watching a Game

From this cell, we can either run our existing model against itself or a random agent, or input our own models to play against.

In [20]:
# Option 1: watch the agent trained above against the opponent that get_agents
# selects by default. Uncomment to use.
# watch(args, agent)

# Option 2 (active): watch agents loaded from checkpoint files. The two paths
# are stored on `args` and read by get_agents when no agent objects are passed.
args.resume_path = "mixed_train.pth"
args.opponent_path = "self_play.pth"
# args.agent_id = 2 # 1 for white, 2 for black
watch(args)


Final reward: 0.0, length: 25.0
