In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [7]:
!pip install git+https://github.com/Farama-Foundation/MAgent2

Collecting git+https://github.com/Farama-Foundation/MAgent2
  Cloning https://github.com/Farama-Foundation/MAgent2 to /tmp/pip-req-build-hwe7j8yk
  Running command git clone --filter=blob:none --quiet https://github.com/Farama-Foundation/MAgent2 /tmp/pip-req-build-hwe7j8yk
  Resolved https://github.com/Farama-Foundation/MAgent2 to commit b2ddd49445368cf85d4d4e1edcddae2e28aa1406
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting pygame>=2.1.0 (from magent2==0.3.3)
  Downloading pygame-2.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading pygame-2.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m83.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: magent2
  Building wheel f

In [8]:
import torch.nn as nn
import torch

class ConvBNAvgPool(nn.Module):
    """Stack of Conv -> BatchNorm -> ReLU blocks followed by global average pooling.

    Every convolution uses kernel_size=3, stride=1, padding=1, so the spatial
    size is preserved until the adaptive average pool collapses it to 1.

    Args:
        in_channels (int): number of channels of the input tensor.
        outs_channels (sequence of int): output channels of each conv block,
            in order. Any iterable (list, tuple, ...) is accepted.
        conv (int): 3 selects the 3-D layers (Conv3d / BatchNorm3d /
            AdaptiveAvgPool3d); any other value selects the 2-D layers.

    Forward:
        Input of shape (batch, in_channels, *spatial); output of shape
        (batch, outs_channels[-1]).
    """

    def __init__(self, in_channels, outs_channels, conv):
        super().__init__()

        if conv == 3:
            Conv = nn.Conv3d
            BatchNorm = nn.BatchNorm3d
            avg_pool = nn.AdaptiveAvgPool3d
            size_avg_pool = (1, 1, 1)
        else:
            Conv = nn.Conv2d
            BatchNorm = nn.BatchNorm2d
            avg_pool = nn.AdaptiveAvgPool2d
            size_avg_pool = (1, 1)

        # list(...) so that tuples (or any iterable) work, not only lists.
        channels = [in_channels] + list(outs_channels)

        # One nested Sequential per (in, out) channel pair; the nesting is kept
        # so that state_dict keys stay "model.<i>.<j>.*".
        self.model = nn.Sequential(
            *[
                nn.Sequential(
                    Conv(c_in, c_out, kernel_size=3, stride=1, padding=1),
                    BatchNorm(c_out),
                    nn.ReLU(),
                )
                for c_in, c_out in zip(channels[:-1], channels[1:])
            ]
        )

        # Global average pooling: every spatial dimension is reduced to size 1.
        self.avgpool = avg_pool(size_avg_pool)

    def forward(self, x):
        # Conv / BatchNorm / ReLU blocks (2-D or 3-D depending on `conv`).
        x = self.model(x)

        # Collapse the spatial dimensions to 1 ...
        x = self.avgpool(x)

        # ... and drop them, leaving (batch_size, out_channels).
        return x.flatten(start_dim=1)


In [9]:
class Mlp(nn.Module):
    """Fully connected network built from Linear (+ ReLU) blocks.

    Args:
        in_features (int): size of the input feature vector.
        outs_features (sequence of int): output size of each Linear layer, in
            order. Any iterable (list, tuple, ...) is accepted.
        final_activation (bool): when True a ReLU is also applied after the
            last Linear layer (the previous behaviour). The default is False so
            the network can emit negative values; a trailing ReLU clamps the
            critic value / actor scores at zero and blocks gradients whenever
            the pre-activation is negative.

    Forward:
        Input of shape (..., in_features); output of shape
        (..., outs_features[-1]).
    """

    def __init__(self, in_features, outs_features, final_activation=False):
        super().__init__()

        sizes = [in_features] + list(outs_features)
        last_block = len(sizes) - 2

        blocks = []
        for idx, (fan_in, fan_out) in enumerate(zip(sizes[:-1], sizes[1:])):
            layers = [nn.Linear(fan_in, fan_out)]
            if idx < last_block or final_activation:
                layers.append(nn.ReLU())
            # Each block stays a nested Sequential so state_dict keys remain
            # "mlp.<i>.0.weight" / "mlp.<i>.0.bias".
            blocks.append(nn.Sequential(*layers))

        self.mlp = nn.Sequential(*blocks)

    def forward(self, x):
        return self.mlp(x)
        
        

In [10]:
class Critic(nn.Module):
    """Value network that scores a joint observation together with a joint action.

    The observation goes through a 3-D ConvBNAvgPool encoder, the action vector
    through a single Linear layer of the same output width; both embeddings are
    concatenated and fed to an Mlp whose last layer has one output.

    Shapes as documented by the original author:
        obs:    B x 5 x N x 13 x 13
        action: B x N
        return: B x 1

    Args:
        in_channels_obs: channel count of the observation tensor.
        outs_channels_obs: list of conv output channels for the encoder.
        in_channels_act: length of the action vector.
        num_layer_mlp: number of hidden layers placed before the final
            1-unit layer of the head.
    """

    def __init__(self, in_channels_obs, outs_channels_obs, in_channels_act, num_layer_mlp):
        super().__init__()

        embed_dim = outs_channels_obs[-1]

        # Observation encoder (3-D convolutions) -> (B, embed_dim)
        self.model_obs = ConvBNAvgPool(in_channels_obs, outs_channels_obs, 3)
        # Action encoder, projected to the same width as the observation embedding
        self.model_act = nn.Linear(in_channels_act, embed_dim)

        # Head input is the concatenation of both embeddings; each hidden
        # layer is twice as wide as that input.
        fused_dim = embed_dim * 2
        hidden_sizes = [fused_dim * 2 for _ in range(num_layer_mlp)]
        self.mlp = Mlp(fused_dim, hidden_sizes + [1])

    def forward(self, obs, act):
        obs_embedding = self.model_obs(obs)
        act_embedding = self.model_act(act)
        fused = torch.cat((obs_embedding, act_embedding), dim=-1)
        return self.mlp(fused)

In [11]:
class Actor(nn.Module):
    """Per-agent policy network: the agent only sees its own local observation.

    A 2-D ConvBNAvgPool encoder is followed by an Mlp head with one output
    per action.

    Shapes as documented by the original author:
        obs:    B x 5 x 13 x 13
        return: B x len_action_space

    Args:
        in_channels: channel count of the observation.
        outs_channels: list of conv output channels for the encoder.
        len_action_space: number of outputs of the head.
        num_layer_mlp: number of hidden layers placed before the output layer.
    """

    def __init__(self, in_channels, outs_channels, len_action_space, num_layer_mlp):
        super().__init__()

        feature_dim = outs_channels[-1]
        head_sizes = [feature_dim * 2 for _ in range(num_layer_mlp)]
        head_sizes.append(len_action_space)

        # The head is instantiated before the encoder, matching the original
        # construction order (and therefore the order of random initialisation).
        head = Mlp(feature_dim, head_sizes)
        encoder = ConvBNAvgPool(in_channels, outs_channels, 2)

        self.model = nn.Sequential(encoder, head)

    def forward(self, x):
        return self.model(x)

In [12]:
from magent2.environments import battle_v4

# Run on the GPU when one is visible, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Training schedule ---
NUM_EPISODES = 100
MAX_EPISODE_LENGTH = 100
# Three environment cycles per episode step; presumably because the sampling
# loop spends three cycles (observe / act / reward) per stored transition.
MAX_CYCLES = 3 * MAX_EPISODE_LENGTH

# --- Environment / agent dimensions ---
MAP_SIZE = 45
NUM_AGENTS = 81          # NOTE(review): presumably agents per team for MAP_SIZE=45 — confirm
LEN_ACTION_SPACE = 21    # NOTE(review): presumably the size of battle_v4's discrete action space — confirm

# --- Replay buffer / which team is trained ---
CAPACITY = 1000
TRAIN_AGENT = 'red'

env = battle_v4.env(
    map_size=MAP_SIZE,
    minimap_mode=False,
    extra_features=False,
    render_mode='rgb_array',
    max_cycles=MAX_CYCLES,
)

In [13]:
import numpy as np

class ReplayBuffer:
    """Fixed-capacity FIFO store of (obs, act, reward, obs1, act1) transitions.

    Transitions are appended until `capacity` is reached; after that each new
    transition overwrites the oldest one (ring buffer).

    Attributes:
        capacity: maximum number of stored transitions.
        buffer: dict mapping each field name to a list of stored values.
        size: number of transitions currently stored (an int, not a method).
    """

    def __init__(self, capacity):
        if capacity < 1:
            raise ValueError(f"capacity must be a positive integer, got {capacity}")
        self.capacity = capacity
        self.buffer = {'obs': [], 'act': [], 'reward': [], 'obs1': [], 'act1': []}
        self.size = 0
        # Slot that the next push writes to once the buffer is full.
        self._next_idx = 0

    def __len__(self):
        """Number of transitions currently stored (same value as `size`)."""
        return self.size

    def push(self, obs, act, reward, obs1, act1):
        """Store one transition, evicting the oldest one when the buffer is full."""
        values = (obs, act, reward, obs1, act1)
        if self.size < self.capacity:
            for key, value in zip(self.buffer, values):
                self.buffer[key].append(value)
            self.size += 1
        else:
            # Previously the write index was `size % capacity`, which is always
            # 0 once the buffer is full, so only slot 0 was ever replaced.
            for key, value in zip(self.buffer, values):
                self.buffer[key][self._next_idx] = value
        self._next_idx = (self._next_idx + 1) % self.capacity

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            dict mapping each field name to a numpy array stacked over the
            sampled transitions.

        Raises:
            ValueError: if fewer than `batch_size` transitions are stored.
        """
        if batch_size > self.size:
            raise ValueError(
                f"Cannot sample {batch_size} transitions from a buffer holding {self.size}"
            )
        indices = np.random.choice(self.size, batch_size, replace=False)
        return {
            key: np.array([items[i] for i in indices])
            for key, items in self.buffer.items()
        }


In [14]:
import torch.optim as optim
from tqdm import tqdm
@torch.no_grad()
def update_target_model(model_online, model_target, tau):
    """Soft-update (Polyak-average) a target network towards its online network.

    Parameters are blended in place:
        target <- (1 - tau) * target + tau * online
    Buffers (e.g. BatchNorm running_mean / running_var / num_batches_tracked)
    are copied verbatim; they are not returned by `parameters()`, so without
    this copy the target's normalisation statistics never follow the online
    network.

    Args:
        model_online: module providing the new values.
        model_target: module updated in place; must have the same structure.
        tau: interpolation weight in [0, 1]; 1 copies the online parameters.
    """
    for online, target in zip(model_online.parameters(), model_target.parameters()):
        target.lerp_(online, tau)

    # Integer buffers such as num_batches_tracked cannot be interpolated, so
    # all buffers are hard-copied.
    for online_buf, target_buf in zip(model_online.buffers(), model_target.buffers()):
        target_buf.copy_(online_buf)

def train(
    model_critics: list,
    model_actors: list,
    model_target_critics: list,
    model_target_actors: list,
    num_agents: int,
    replay_buffer: ReplayBuffer,
    batch_size: int,
    gamma: float,
    tau: float,
    optimizer_critics,
    optimizer_actors,
    device
):
    """Run one critic update and one actor update per agent, then soft-update the targets.

    For every agent a fresh batch is sampled from `replay_buffer`. The critic
    is regressed onto `r_i + gamma * Q_target(x', a')`, the actor is updated
    with the critic's gradient w.r.t. that agent's own action, and finally all
    target networks are moved towards their online networks with rate `tau`.

    Nothing is done (and nothing is printed) while the buffer holds fewer than
    `batch_size` transitions.

    Args:
        model_critics / model_actors: per-agent online networks.
        model_target_critics / model_target_actors: per-agent target networks.
        num_agents: number of agents to update.
        replay_buffer: source of transitions; its batches are expected to have
            keys 'obs', 'act', 'reward', 'obs1', 'act1'.
        batch_size: number of transitions per update.
        gamma: discount factor.
        tau: soft-update rate for the target networks.
        optimizer_critics / optimizer_actors: per-agent optimizers.
        device: torch device the batch tensors are created on.

    NOTE(review): the next action a' is read from the replay buffer; the target
    actors are only soft-updated here and never queried — confirm this is intended.
    """
    # Warm-up: sampling without replacement needs at least batch_size items.
    if replay_buffer.size < batch_size:
        return

    mse_loss = nn.MSELoss()
    for agent_i in range(num_agents):
        actor = model_actors[agent_i]
        critic = model_critics[agent_i]

        # Put the online models in training mode.
        actor.train()
        critic.train()

        batch = replay_buffer.sample(batch_size)
        # Convert x, a, r, x', a' to tensors. The permute moves the trailing
        # channel axis to position 1, giving (per the original notes):
        #   x, x': B x 5 x N x 13 x 13
        #   a, a': B x N
        #   r:     B x N
        x_j = torch.tensor(batch['obs'], dtype=torch.float32, device=device).permute(0, 4, 1, 2, 3)
        a_j = torch.tensor(batch['act'], dtype=torch.float32, device=device)
        r_j = torch.tensor(batch['reward'], dtype=torch.float32, device=device)
        x1_j = torch.tensor(batch['obs1'], dtype=torch.float32, device=device).permute(0, 4, 1, 2, 3)
        a1_j = torch.tensor(batch['act1'], dtype=torch.float32, device=device)

        # ---- Critic update -------------------------------------------------
        with torch.no_grad():
            # squeeze(-1) (not squeeze()) keeps the batch axis when B == 1.
            y_j = r_j[:, agent_i] + gamma * model_target_critics[agent_i](x1_j, a1_j).squeeze(-1)

        optimizer_critics[agent_i].zero_grad()
        critic_loss = mse_loss(critic(x_j, a_j).squeeze(-1), y_j)
        critic_loss.backward()
        optimizer_critics[agent_i].step()

        # ---- dQ/da_i, one sample at a time ---------------------------------
        # Kept per-sample on purpose: the critic contains BatchNorm layers in
        # training mode, so a batched forward would not give the same values.
        grad_a_i = []
        for idx_batch in range(batch_size):
            a_j_idx = a_j[idx_batch].unsqueeze(0).requires_grad_()
            x_j_idx = x_j[idx_batch].unsqueeze(0).detach()

            q_value = critic(x_j_idx, a_j_idx)
            (grad_actions,) = torch.autograd.grad(q_value.sum(), a_j_idx)

            # grad_actions is (1, N); keep only this agent's component.
            grad_a_i.append(grad_actions.squeeze(0)[agent_i])

        # Shape (B,). It is already specific to agent_i, so it must not be
        # indexed by agent_i again (doing so raised IndexError for agent_i >= 1).
        grad_a_i = torch.stack(grad_a_i).detach()

        # ---- Actor update --------------------------------------------------
        optimizer_actors[agent_i].zero_grad()

        # This agent's own observation: B x 5 x 13 x 13
        x_j_agent_i = x_j[:, :, agent_i, :, :]
        actor_score = actor(x_j_agent_i).max(dim=-1)[0]  # (B,)

        J = -(grad_a_i * actor_score).mean()
        J.backward()
        optimizer_actors[agent_i].step()

    # Soft-update the target networks of every agent.
    for agent_i in range(num_agents):
        update_target_model(model_critics[agent_i], model_target_critics[agent_i], tau)
        update_target_model(model_actors[agent_i], model_target_actors[agent_i], tau)

    print("completed!")
        

In [15]:
import numpy as np
def setup_config():
    """Build every object `train` needs and return them as a keyword dict.

    Creates one online and one target Critic/Actor per agent (targets start as
    exact copies of the online weights), one Adam optimizer per online model,
    and an empty ReplayBuffer of size CAPACITY.

    Returns:
        dict whose keys match the parameter names of `train`, so it can be
        passed as `train(**config)`.
    """
    # Must be bound before the models are built: it was previously assigned at
    # the end of the function, so the `.to(device)` calls raised UnboundLocalError.
    device = DEVICE

    num_agents = NUM_AGENTS
    len_action_space = LEN_ACTION_SPACE
    num_layer_mlp = 1

    def make_critic():
        return Critic(5, [32], num_agents, num_layer_mlp).to(device)

    def make_actor():
        return Actor(5, [32], len_action_space, num_layer_mlp).to(device)

    model_critics = [make_critic() for _ in range(num_agents)]
    model_actors = [make_actor() for _ in range(num_agents)]

    model_target_critics = [make_critic() for _ in model_critics]
    model_target_actors = [make_actor() for _ in model_actors]

    # Copy the online weights into the target models.
    for target, source in zip(model_target_critics, model_critics):
        target.load_state_dict(source.state_dict())

    for target, source in zip(model_target_actors, model_actors):
        target.load_state_dict(source.state_dict())

    replay_buffer = ReplayBuffer(CAPACITY)

    batch_size = 32
    gamma = 0.9
    tau = 1e-3
    lr = 1e-4

    optimizer_critics = [optim.Adam(model.parameters(), lr) for model in model_critics]
    optimizer_actors = [optim.Adam(model.parameters(), lr) for model in model_actors]

    config = {
        'num_agents': num_agents,
        'model_critics': model_critics,
        'model_actors': model_actors,
        'model_target_critics': model_target_critics,
        'model_target_actors': model_target_actors,
        'replay_buffer': replay_buffer,
        'batch_size': batch_size,
        'gamma': gamma,
        'tau': tau,
        'optimizer_critics': optimizer_critics,
        'optimizer_actors': optimizer_actors,
        'device': device
    }
    return config

In [10]:
import numpy as np
def test_train():
    """Smoke test: fill the replay buffer with random transitions and run `train` once.

    Each dummy transition holds, per agent, a random 13x13x5 observation and
    random scalar action / reward values.
    """
    config = setup_config()
    replay_buffer = config['replay_buffer']
    # `capacity` and `num_agents` were previously undefined names here; take
    # them from the objects that setup_config just built.
    num_agents = config['num_agents']

    for _ in range(replay_buffer.capacity):
        dummy_x = [np.random.rand(13, 13, 5) for _ in range(num_agents)]
        dummy_a = [np.random.rand() for _ in range(num_agents)]
        dummy_r = [np.random.rand() for _ in range(num_agents)]
        dummy_x1 = [np.random.rand(13, 13, 5) for _ in range(num_agents)]
        dummy_a1 = [np.random.rand() for _ in range(num_agents)]

        replay_buffer.push(dummy_x, dummy_a, dummy_r, dummy_x1, dummy_a1)
    train(**config)
# test_train()

100%|██████████| 81/81 [00:07<00:00, 11.19it/s]


completed!


In [26]:
import numpy as np
from tqdm import tqdm
class Transition:
    """Scratch storage for one transition (obs, act, reward, obs1, act1) of every agent.

    Each of the five fields is a dict of the form
        {'red': {'red_0': ..., ...}, 'blue': {'blue_0': ..., ...}}
    with one entry per agent, initialised to None.
    """

    # Field names, in the order get_transition returns them.
    _FIELDS = ('obs', 'act', 'reward', 'obs1', 'act1')

    def __init__(self, num_agents):
        self.num_agents = num_agents
        self.reset()

    def _setup_agents_data(self):
        """Return a fresh {team: {agent_name: None}} mapping for both teams."""
        teams = {}
        for team in ('red', 'blue'):
            teams[team] = {f'{team}_{i}': None for i in range(self.num_agents)}
        return teams

    def reset(self):
        """Replace every field with a fresh, all-None mapping."""
        for name in self._FIELDS:
            setattr(self, name, self._setup_agents_data())

    def is_all_none(self):
        """Return True when no agent has a value in any of the five fields."""
        for name in self._FIELDS:
            for group in getattr(self, name).values():
                for value in group.values():
                    if value is not None:
                        return False
        return True

    def _padding(self, lst):
        """Replace None entries of `lst` with zeros.

        The first non-None entry decides the filler: a zero array of the same
        shape when it is a numpy array, otherwise the integer 0. Raises
        IndexError when every entry is None.
        """
        present = [item for item in lst if item is not None]
        template = present[0]

        if isinstance(template, np.ndarray):
            filled = []
            for item in lst:
                filled.append(np.zeros(template.shape) if item is None else item)
            return filled

        return [0 if item is None else item for item in lst]

    def get_transition(self, handle):
        """Return (obs, act, reward, obs1, act1) for team `handle` as padded per-agent lists."""
        return tuple(
            self._padding(list(getattr(self, name)[handle].values()))
            for name in self._FIELDS
        )

# Per-run objects shared by the sampling/training loop below.
transition = Transition(NUM_AGENTS)
config = setup_config()
replay = config['replay_buffer']

# Exactly one of the three sampling steps is active at a time. Step 3 starts
# active so that the first rotation lands on step 1.
is_step = {step: step == 3 for step in (1, 2, 3)}
def rotate_steps(is_step):
    """Move the single active flag in `is_step` to the next step, wrapping around.

    Args:
        is_step: dict mapping integer step keys to flags; at most one flag may
            be set. The dict is modified in place.

    Returns:
        The same dict, with the previously active step cleared and the next
        step (key + 1, or the smallest key when key + 1 does not exist) set.

    Raises:
        ValueError: if more than one flag is set, or if none is.
    """
    # Reject ambiguous state: more than one active step.
    if sum(is_step.values()) > 1:
        raise ValueError("is_step contains more than one True value")

    # Locate the currently active step.
    active = None
    for step, flag in is_step.items():
        if flag:
            active = step
            break

    if active is None:
        raise ValueError("No active step found in is_step")

    # Successor is key + 1 when it exists, otherwise wrap to the smallest key.
    successor = active + 1
    if successor not in is_step:
        successor = min(is_step)

    # Clear first, then set, so a single-key dict ends up active again.
    is_step[active] = False
    is_step[successor] = True

    return is_step
    
# Sampling + training loop.
#
# One stored transition (x, a, r, x', a') is collected over three environment
# cycles, selected by `is_step`:
#   step 1 - record every agent's observation x
#   step 2 - every agent acts (a)
#   step 3 - record reward r, next observation x' and next action a'
#
# NOTE(review): the cycle counting assumes agents are iterated as one
# contiguous block per team (a handle switch marks half a cycle) — confirm this
# still holds once agents die.
# NOTE(review): current_handle / current_cycle / is_step / transition are not
# reset between episodes, so a transition can straddle an episode boundary —
# confirm whether that is intended.
current_handle = None
current_cycle = -1  # becomes 0 on the first handle switch

reward_episodes = []
for episode in tqdm(range(NUM_EPISODES)):
    rewards = []
    env.reset()
    for agent in env.agent_iter():
        agent_handle, agent_indice = agent.split("_")
        observation, reward, termination, truncation, info = env.last()
        done = termination or truncation

        if agent_handle != current_handle:
            current_handle = agent_handle
            current_cycle += 1

            if current_cycle % 2 == 0:
                # Advance the step after every full cycle (two handle
                # switches); the very first cycle is step 1.
                is_step = rotate_steps(is_step)

                if current_cycle % 3 == 0:
                    # End of a 3-step sampling round: store the finished transition.
                    if not transition.is_all_none():
                        replay.push(*transition.get_transition(TRAIN_AGENT))

                    # Train only once a full batch can be drawn; sampling from
                    # a smaller buffer raises ValueError.
                    if replay.size >= config['batch_size']:
                        train(**config)

                    # Start the next round with an empty transition.
                    transition.reset()

        # Step 1: record the observations of both teams (cycle 1).
        if is_step[1]:
            transition.obs[agent_handle][agent] = observation
            # NOTE(review): action 0 is used as a filler action here — confirm
            # it is harmless in battle_v4.
            action = None if done else 0
            env.step(action)

        # Step 2: every agent performs its action (cycle 2).
        elif is_step[2]:
            if done:
                action = None

            elif agent_handle == 'red':
                actor = config['model_actors'][int(agent_indice)]
                # (H, W, C) -> (1, C, H, W): add the batch axis in front, then
                # move the channel axis next to it.
                obs = torch.tensor(observation, dtype=torch.float32, device=config['device'])
                obs = obs.unsqueeze(0).permute(0, 3, 1, 2)
                # Inference only: eval mode + no autograd. train() switches the
                # actor back to training mode before each update.
                actor.eval()
                with torch.no_grad():
                    # The environment expects a discrete action index, so take
                    # the argmax (not the max value) over the actor outputs.
                    action = int(actor(obs).argmax(dim=-1).item())

            else:
                # Opponent policy: random by default.
                action = env.action_space(agent).sample()

            transition.act[agent_handle][agent] = action
            env.step(action)

        # Step 3: record reward and current observation of each agent (cycle 3).
        elif is_step[3]:
            transition.reward[agent_handle][agent] = reward
            transition.obs1[agent_handle][agent] = observation

            # Next action a': random for both teams for now (a target model
            # could be used here instead).
            act1 = env.action_space(agent).sample()
            if agent_handle == 'red':
                rewards.append(reward)

            transition.act1[agent_handle][agent] = act1

            action = None if done else 0
            env.step(action)

    # Total reward seen by the red team during this episode.
    reward_episodes.append(sum(rewards))

print(env.agent_selection)

  4%|▍         | 435/10000 [31:44<11:37:49,  4.38s/it]


KeyboardInterrupt: 

In [12]:
# `size` is an integer attribute of ReplayBuffer, not a method.
replay.size

400

In [20]:
import torch
import torch.nn.functional as F

def sample_gumbel(shape, eps=1e-20, device='cpu'):
    """Draw Gumbel(0, 1) noise.

    Uniform samples U from torch.rand are transformed as -log(-log(U)); `eps`
    is added inside both logarithms to keep their arguments away from zero.

    Args:
        shape: shape of the returned tensor.
        eps: small constant added before each log.
        device: device the samples are created on.

    Returns:
        Tensor of the requested shape.
    """
    uniform = torch.rand(shape, device=device)
    neg_log_uniform = -torch.log(uniform + eps)
    return -torch.log(neg_log_uniform + eps)

def gumbel_softmax(logits, tau=1.0, hard=False):
    """Sample from the Gumbel-Softmax distribution over the last dimension.

    Args:
        logits (torch.Tensor): unnormalised scores (pre-softmax).
        tau (float): temperature the perturbed logits are divided by.
        hard (bool): when True the result is one-hot in the forward pass while
            gradients flow through the soft sample (straight-through);
            otherwise the soft probabilities are returned.

    Returns:
        torch.Tensor: tensor with the same shape as `logits`.
    """
    # Perturb the logits with Gumbel noise, then apply a tempered softmax.
    noise = sample_gumbel(logits.shape, device=logits.device)
    perturbed = (logits + noise) / tau
    y = F.softmax(perturbed, dim=-1)

    if not hard:
        return y

    # One-hot at the argmax of the soft sample.
    winners = y.argmax(dim=-1, keepdim=True)
    one_hot = torch.zeros_like(y)
    one_hot.scatter_(-1, winners, 1.0)

    # Straight-through: the detached difference carries the one-hot value,
    # the trailing `+ y` carries the gradient of the soft sample.
    return (one_hot - y).detach() + y


In [37]:
logits = torch.tensor([[2.0, 2.1, 2.2]])

# One soft and one hard (one-hot) Gumbel-Softmax sample, both at tau=0.01.
# Each call draws its own noise, so the two samples may favour different classes.
y_soft, y_hard = (
    gumbel_softmax(logits, tau=0.01, hard=use_hard) for use_hard in (False, True)
)

print(f"Logits: {logits}")
print(f"Soft Gumbel Softmax: {y_soft}")
print(f"Hard Gumbel Softmax (One-Hot): {y_hard}")


Logits: tensor([[2.0000, 2.1000, 2.2000]])
Soft Gumbel Softmax: tensor([[8.2677e-44, 0.0000e+00, 1.0000e+00]])
Hard Gumbel Softmax (One-Hot): tensor([[0., 1., 0.]])
