In [1]:
import torch
import torch.nn as nn
import torch.multiprocessing as mp
from multiprocessing import shared_memory, Lock, Value
import numpy as np
import time

# Define the agent network
class AgentNet(nn.Module):
    def __init__(self, input_size=4, hidden_size=64, output_size=2):
        super(AgentNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        return self.fc2(x)

    def get_flat_weights(self):
        return torch.cat([p.data.view(-1) for p in self.parameters()])

    def load_flat_weights(self, flat_tensor):
        pointer = 0
        for param in self.parameters():
            numel = param.numel()
            param.data.copy_(flat_tensor[pointer:pointer + numel].view_as(param))
            pointer += numel

# Shared dummy environment
class SharedEnv:
    def reset(self):
        return [np.random.randn(4).astype(np.float32) for _ in range(6)]

    def step(self, actions):
        next_obs = [np.random.randn(4).astype(np.float32) for _ in range(6)]
        rewards = [np.random.rand() for _ in range(6)]
        dones = [np.random.rand() > 0.95 for _ in range(6)]
        return next_obs, rewards, dones, {}

# Learner process that executes exactly once per epoch
def learner_process(agent_id, shared_name, shape, lock, shared_epoch, done_epoch):
    shm = shared_memory.SharedMemory(name=shared_name)
    weight_np = np.ndarray(shape, dtype=np.float32, buffer=shm.buf)
    model = AgentNet()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    dummy_input = torch.randn((32, 4))
    dummy_target = torch.randint(0, 2, (32,))
    last_epoch = 0  # No training at epoch 0

    while True:
        current_epoch = shared_epoch.value
        if current_epoch > last_epoch:
            output = model(dummy_input)
            loss = nn.CrossEntropyLoss()(output, dummy_target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            with lock:
                weight_np[:] = model.get_flat_weights().numpy()
            print(f"[Learner {agent_id}] Trained for epoch {current_epoch}")

            done_epoch[agent_id] = current_epoch
            last_epoch = current_epoch
        time.sleep(0.05)

# Main actor loop (runs in main process)
def actor_loop(num_agents, shared_names, shapes, locks, shared_epoch, done_epoch):
    models = [AgentNet() for _ in range(num_agents)]
    shms = [shared_memory.SharedMemory(name=shared_names[i]) for i in range(num_agents)]
    weight_nps = [np.ndarray(shapes[i], dtype=np.float32, buffer=shms[i].buf) for i in range(num_agents)]

    env = SharedEnv()
    obs_list = env.reset()
    epoch = 0
    steps_per_epoch = 10
    step_count = 0

    while True:
        actions = []
        for i in range(num_agents):
            with locks[i]:
                weights_tensor = torch.tensor(weight_nps[i].copy(), dtype=torch.float32)
            models[i].load_flat_weights(weights_tensor)
            obs_tensor = torch.tensor(obs_list[i], dtype=torch.float32)
            with torch.no_grad():
                action = models[i](obs_tensor).argmax().item()
            actions.append(action)

        next_obs, rewards, dones, _ = env.step(actions)
        obs_list = next_obs
        step_count += 1

        for i in range(num_agents):
            print(f"[Agent {i}] Action: {actions[i]} Reward: {rewards[i]:.2f} Done: {dones[i]}")

        if step_count >= steps_per_epoch:
            step_count = 0
            epoch += 1
            shared_epoch.value = epoch
            print(f"[Actor] Epoch {epoch} completed. Waiting for learners...")

            while not all(done_epoch[i] == epoch for i in range(num_agents)):
                time.sleep(0.1)
            print(f"[Actor] All learners completed training for epoch {epoch}")

        time.sleep(0.1)

# Setup function
def main():
    num_agents = 6
    shared_names = []
    shapes = []
    locks = []
    learners = []
    shared_epoch = Value('i', 0)
    done_epoch = mp.Array('i', [0] * num_agents)

    for i in range(num_agents):
        dummy_model = AgentNet()
        flat_weights = dummy_model.get_flat_weights()
        shape = flat_weights.shape
        shm = shared_memory.SharedMemory(create=True, size=flat_weights.numel() * 4)
        weight_np = np.ndarray(shape, dtype=np.float32, buffer=shm.buf)
        weight_np[:] = flat_weights.numpy()

        lock = Lock()
        learner = mp.Process(target=learner_process, args=(i, shm.name, shape, lock, shared_epoch, done_epoch))
        learner.start()

        shared_names.append(shm.name)
        shapes.append(shape)
        locks.append(lock)
        learners.append(learner)

    actor_loop(num_agents, shared_names, shapes, locks, shared_epoch, done_epoch)

    for learner in learners:
        learner.join()

if __name__ == "__main__":
    mp.set_start_method("spawn")
    main()


FileNotFoundError: [WinError 2] The system cannot find the file specified: 'wnsm_a8559af3'

In [1]:
import torch
import torch.nn as nn
import torch.multiprocessing as mp
from multiprocessing import shared_memory, Lock, Value
import numpy as np
import time

class AgentNet(nn.Module):
    def __init__(self, input_size=4, hidden_size=64, output_size=2):
        super(AgentNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        return self.fc2(x)

    def get_flat_weights(self):
        return torch.cat([p.data.view(-1) for p in self.parameters()])

    def load_flat_weights(self, flat_tensor):
        pointer = 0
        for param in self.parameters():
            numel = param.numel()
            param.data.copy_(flat_tensor[pointer:pointer + numel].view_as(param))
            pointer += numel

class SharedEnv:
    def reset(self):
        return [np.random.randn(4).astype(np.float32) for _ in range(6)]

    def step(self, actions):
        next_obs = [np.random.randn(4).astype(np.float32) for _ in range(6)]
        rewards = [np.random.rand() for _ in range(6)]
        dones = [np.random.rand() > 0.95 for _ in range(6)]
        return next_obs, rewards, dones, {}

def learner_process(agent_id, shape, weight_np, lock, shared_epoch, done_epoch):
    model = AgentNet()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    dummy_input = torch.randn((32, 4))
    dummy_target = torch.randint(0, 2, (32,))
    last_epoch = 0

    while True:
        current_epoch = shared_epoch.value
        if current_epoch > last_epoch:
            output = model(dummy_input)
            loss = nn.CrossEntropyLoss()(output, dummy_target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            with lock:
                weight_np[:] = model.get_flat_weights().numpy()
            print(f"[Learner {agent_id}] Trained for epoch {current_epoch}")

            done_epoch[agent_id] = current_epoch
            last_epoch = current_epoch
        time.sleep(0.05)

def actor_loop(num_agents, locks, shared_epoch, done_epoch, weight_nps):
    models = [AgentNet() for _ in range(num_agents)]
    env = SharedEnv()
    obs_list = env.reset()
    epoch = 0
    steps_per_epoch = 10
    step_count = 0

    while True:
        actions = []
        for i in range(num_agents):
            with locks[i]:
                weights_tensor = torch.tensor(weight_nps[i].copy(), dtype=torch.float32)
            models[i].load_flat_weights(weights_tensor)
            obs_tensor = torch.tensor(obs_list[i], dtype=torch.float32)
            with torch.no_grad():
                action = models[i](obs_tensor).argmax().item()
            actions.append(action)

        next_obs, rewards, dones, _ = env.step(actions)
        obs_list = next_obs
        step_count += 1

        for i in range(num_agents):
            print(f"[Agent {i}] Action: {actions[i]} Reward: {rewards[i]:.2f} Done: {dones[i]}")

        if step_count >= steps_per_epoch:
            step_count = 0
            epoch += 1
            shared_epoch.value = epoch
            print(f"[Actor] Epoch {epoch} completed. Waiting for learners...")

            while not all(done_epoch[i] == epoch for i in range(num_agents)):
                time.sleep(0.1)
            print(f"[Actor] All learners completed training for epoch {epoch}")

        time.sleep(0.1)

def setup():
    num_agents = 6
    shared_epoch = Value('i', 0)
    done_epoch = mp.Array('i', [0] * num_agents)
    locks = []
    learners = []
    weight_nps = []

    for i in range(num_agents):
        dummy_model = AgentNet()
        flat_weights = dummy_model.get_flat_weights()
        shape = flat_weights.shape
        shm = shared_memory.SharedMemory(create=True, size=flat_weights.numel() * 4)
        weight_np = np.ndarray(shape, dtype=np.float32, buffer=shm.buf)
        weight_np[:] = flat_weights.numpy()

        lock = Lock()
        learner = mp.Process(target=learner_process, args=(i, shape, weight_np, lock, shared_epoch, done_epoch))
        learner.start()

        locks.append(lock)
        learners.append(learner)
        weight_nps.append(weight_np)

    return num_agents, locks, shared_epoch, done_epoch, learners, weight_nps

# if __name__ == "__main__":
#     mp.set_start_method("spawn")
#     num_agents, locks, shared_epoch, done_epoch, learners, weight_nps = setup()
#     actor_loop(num_agents, locks, shared_epoch, done_epoch, weight_nps)
#     for learner in learners:
#         learner.join()

In [3]:

# mp.set_start_method("spawn")
num_agents, locks, shared_epoch, done_epoch, learners, weight_nps = setup()
actor_loop(num_agents, locks, shared_epoch, done_epoch, weight_nps)
for learner in learners:
    learner.join()

: 

In [None]:
import torch
import torch.nn as nn
import torch.multiprocessing as mp
from multiprocessing import shared_memory, Lock, Value
import numpy as np
import time
from typing import Sequence

# ==== Shared Replay Buffer ====
class SharedReplayBuffer:
    def __init__(self, capacity: int, obs_shape: Sequence[int], name_prefix="rb"):
        self.capacity = capacity
        self.obs_shape = obs_shape
        self.lock = Lock()

        obs_bytes = np.prod((capacity, *obs_shape)) * np.dtype(np.float32).itemsize
        act_bytes = capacity * np.dtype(np.int64).itemsize
        rew_bytes = capacity * np.dtype(np.float32).itemsize
        done_bytes = capacity * np.dtype(np.float32).itemsize

        self.obs_shm = shared_memory.SharedMemory(create=True, size=obs_bytes, name=f"{name_prefix}_obs")
        self.act_shm = shared_memory.SharedMemory(create=True, size=act_bytes, name=f"{name_prefix}_act")
        self.rew_shm = shared_memory.SharedMemory(create=True, size=rew_bytes, name=f"{name_prefix}_rew")
        self.done_shm = shared_memory.SharedMemory(create=True, size=done_bytes, name=f"{name_prefix}_done")

        self.buffer = np.ndarray((capacity, *obs_shape), dtype=np.float32, buffer=self.obs_shm.buf)
        self.actions = np.ndarray(capacity, dtype=np.int64, buffer=self.act_shm.buf)
        self.rewards = np.ndarray(capacity, dtype=np.float32, buffer=self.rew_shm.buf)
        self.dones = np.ndarray(capacity, dtype=np.float32, buffer=self.done_shm.buf)

        self.idx = Value('i', 0)
        self.size = Value('i', 0)

    def add(self, obs, action, reward, done):
        with self.lock:
            i = self.idx.value
            self.buffer[i] = obs
            self.actions[i] = action
            self.rewards[i] = reward
            self.dones[i] = done
            self.idx.value = (i + 1) % self.capacity
            self.size.value = min(self.size.value + 1, self.capacity)

    def sample(self, batch_size: int, stacked_frames: int = 1):
        with self.lock:
            size = self.size.value

        indices = np.random.choice(max(1, size - stacked_frames - 1), batch_size, replace=False)
        indices = indices[:, np.newaxis]
        indices = (indices + np.arange(stacked_frames))

        states = torch.from_numpy(self.buffer[indices]).view(batch_size, -1)
        next_states = torch.from_numpy(self.buffer[indices + 1]).view(batch_size, -1)
        actions = torch.from_numpy(self.actions[indices[:, -1]]).view(batch_size, -1)
        rewards = torch.from_numpy(self.rewards[indices[:, -1]]).view(batch_size, -1)
        dones = torch.from_numpy(self.dones[indices[:, -1]]).view(batch_size, -1)
        valid = torch.from_numpy(1. - np.any(self.dones[indices[:, :-1]], axis=-1)).view(batch_size, -1)

        return states, actions, rewards, next_states, dones, valid

# ==== Dummy Environment ====
class SharedEnv:
    def reset(self):
        return [np.random.randn(4).astype(np.float32) for _ in range(6)]

    def step(self, actions):
        next_obs = [np.random.randn(4).astype(np.float32) for _ in range(6)]
        rewards = [np.random.rand() for _ in range(6)]
        dones = [np.random.rand() > 0.95 for _ in range(6)]
        return next_obs, rewards, dones, {}

# ==== Agent Network ====
class AgentNet(nn.Module):
    def __init__(self, input_size=4, hidden_size=64, output_size=2):
        super(AgentNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        return self.fc2(x)

    def get_flat_weights(self):
        return torch.cat([p.data.view(-1) for p in self.parameters()])

    def load_flat_weights(self, flat_tensor):
        pointer = 0
        for param in self.parameters():
            numel = param.numel()
            param.data.copy_(flat_tensor[pointer:pointer + numel].view_as(param))
            pointer += numel

# ==== Learner Process ====
def learner_process(agent_id, shared_name, shape, lock, shared_epoch, done_epoch, buffer: SharedReplayBuffer):
    shm = shared_memory.SharedMemory(name=shared_name)
    weight_np = np.ndarray(shape, dtype=np.float32, buffer=shm.buf)
    model = AgentNet()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    last_epoch = 0

    while True:
        current_epoch = shared_epoch.value
        if current_epoch > last_epoch:
            try:
                states, actions, rewards, next_states, dones, valid = buffer.sample(batch_size=32)
                output = model(states)
                loss = output.mean()  # Replace with real loss
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                with lock:
                    weight_np[:] = model.get_flat_weights().numpy()
                print(f"[Learner {agent_id}] Trained for epoch {current_epoch}")

                done_epoch[agent_id] = current_epoch
                last_epoch = current_epoch
            except Exception as e:
                print(f"[Learner {agent_id}] Error during training: {e}")
        time.sleep(0.05)

# ==== Main Actor Loop ====
def actor_loop(num_agents, shared_names, shapes, locks, shared_epoch, done_epoch, buffers):
    models = [AgentNet() for _ in range(num_agents)]
    shms = [shared_memory.SharedMemory(name=shared_names[i]) for i in range(num_agents)]
    weight_nps = [np.ndarray(shapes[i], dtype=np.float32, buffer=shms[i].buf) for i in range(num_agents)]

    env = SharedEnv()
    obs_list = env.reset()
    epoch = 0
    steps_per_epoch = 10
    step_count = 0

    while True:
        actions = []
        for i in range(num_agents):
            with locks[i]:
                weights_tensor = torch.tensor(weight_nps[i].copy(), dtype=torch.float32)
            models[i].load_flat_weights(weights_tensor)
            obs_tensor = torch.tensor(obs_list[i], dtype=torch.float32)
            with torch.no_grad():
                action = models[i](obs_tensor).argmax().item()
            actions.append(action)

        next_obs, rewards, dones, _ = env.step(actions)

        for i in range(num_agents):
            buffers[i].add(obs_list[i], actions[i], rewards[i], float(dones[i]))

        obs_list = next_obs
        step_count += 1

        for i in range(num_agents):
            print(f"[Agent {i}] Action: {actions[i]} Reward: {rewards[i]:.2f} Done: {dones[i]}")

        if step_count >= steps_per_epoch:
            step_count = 0
            epoch += 1
            shared_epoch.value = epoch
            print(f"[Actor] Epoch {epoch} completed. Waiting for learners...")

            while not all(done_epoch[i] == epoch for i in range(num_agents)):
                time.sleep(0.1)
            print(f"[Actor] All learners completed training for epoch {epoch}")

        time.sleep(0.1)

# ==== Setup ====
def main():
    num_agents = 6
    shared_names = []
    shapes = []
    locks = []
    learners = []
    shared_epoch = Value('i', 0)
    done_epoch = mp.Array('i', [0] * num_agents)
    buffers = []

    for i in range(num_agents):
        dummy_model = AgentNet()
        flat_weights = dummy_model.get_flat_weights()
        shape = flat_weights.shape
        shm = shared_memory.SharedMemory(create=True, size=flat_weights.numel() * 4)
        weight_np = np.ndarray(shape, dtype=np.float32, buffer=shm.buf)
        weight_np[:] = flat_weights.numpy()

        buffer = SharedReplayBuffer(capacity=500, obs_shape=(4,), name_prefix=f"rb{i}")
        learner = mp.Process(target=learner_process, args=(i, shm.name, shape, Lock(), shared_epoch, done_epoch, buffer))
        learner.start()

        shared_names.append(shm.name)
        shapes.append(shape)
        locks.append(Lock())
        learners.append(learner)
        buffers.append(buffer)

    actor_loop(num_agents, shared_names, shapes, locks, shared_epoch, done_epoch, buffers)

    for learner in learners:
        learner.join()

if __name__ == "__main__":
    mp.set_start_method("spawn")
    main()


In [1]:
import torch
import torch.nn as nn
import torch.multiprocessing as mp
from multiprocessing import shared_memory, Lock, Value
import numpy as np
import time
from typing import Sequence
import timeit

In [2]:


# Simple model reused for both learners
class AgentNet(nn.Module):
    def __init__(self, input_size=4, hidden_size=64, output_size=2):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

    def get_flat_weights(self):
        return torch.cat([p.data.view(-1) for p in self.parameters()])

    def load_flat_weights(self, flat_tensor):
        pointer = 0
        for param in self.parameters():
            numel = param.numel()
            param.data.copy_(flat_tensor[pointer:pointer + numel].view_as(param))
            pointer += numel

# Dummy buffer for benchmarking
class DummyBuffer:
    def sample(self, batch_size):
        return torch.randn(batch_size, 4), torch.randint(0, 2, (batch_size,)), torch.rand(batch_size)

# Multiprocessing learner
def mp_learner(agent_id, shared_name, shape, lock, shared_epoch, done_epoch):
    shm = shared_memory.SharedMemory(name=shared_name)
    weight_np = np.ndarray(shape, dtype=np.float32, buffer=shm.buf)
    model = AgentNet()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    buffer = DummyBuffer()
    last_epoch = 0

    while True:
        if shared_epoch.value > last_epoch:
            x, y, _ = buffer.sample(32)
            output = model(x)
            loss = nn.CrossEntropyLoss()(output, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            with lock:
                weight_np[:] = model.get_flat_weights().numpy()
            done_epoch[agent_id] = shared_epoch.value
            last_epoch = shared_epoch.value
        time.sleep(0.01)

# Sequential training loop (inside main process)
def sequential_learners(agent_models, buffers):
    for model, buffer in zip(agent_models, buffers):
        x, y, _ = buffer.sample(32)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        output = model(x)
        loss = nn.CrossEntropyLoss()(output, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Benchmarking setup
def main():
    num_agents = 6
    dummy_model = AgentNet()
    flat_weights = dummy_model.get_flat_weights()
    shape = flat_weights.shape

    # === Multiprocessing Setup ===
    shared_epoch = Value('i', 0)
    done_epoch = mp.Array('i', [0] * num_agents)
    locks = []
    learners = []
    shm_names = []
    for i in range(num_agents):
        shm = shared_memory.SharedMemory(create=True, size=flat_weights.numel() * 4)
        shm_np = np.ndarray(shape, dtype=np.float32, buffer=shm.buf)
        shm_np[:] = flat_weights.numpy()
        shm_names.append(shm.name)
        lock = Lock()
        p = mp.Process(target=mp_learner, args=(i, shm.name, shape, lock, shared_epoch, done_epoch))
        p.start()
        learners.append(p)
        locks.append(lock)

    # === Benchmark MP update time ===
    start = time.time()
    for epoch in range(5):
        shared_epoch.value = epoch + 1
        while not all(done_epoch[i] == epoch + 1 for i in range(num_agents)):
            time.sleep(0.01)
    mp_time = time.time() - start

    for p in learners:
        p.terminate()

    # === Sequential Setup ===
    agent_models = [AgentNet() for _ in range(num_agents)]
    buffers = [DummyBuffer() for _ in range(num_agents)]

    # === Benchmark Sequential update time ===
    start = time.time()
    for epoch in range(5):
        sequential_learners(agent_models, buffers)
    seq_time = time.time() - start

    print(f"Multiprocessing time: {mp_time:.4f} s")
    print(f"Sequential time:     {seq_time:.4f} s")

if __name__ == "__main__":
    mp.set_start_method("spawn")
    main()


KeyboardInterrupt: 

In [4]:
# new buffer

import numpy as np
import torch
from multiprocessing import Array, Value, Lock
from ctypes import c_float, c_int, c_int64
from typing import Sequence

class ClaasyReplayBuffer:

    def __init__(self, capacity: int, obs_shape: Sequence[int]):
        self.capacity = capacity
        self.obs_shape = obs_shape
        self.lock = Lock()

        total_obs = int(np.prod((capacity, *obs_shape)))
        self.buffer_raw = Array(c_float, total_obs, lock=False)
        self.actions_raw = Array(c_int64, capacity, lock=False)
        self.rewards_raw = Array(c_float, capacity, lock=False)
        self.dones_raw = Array(c_float, capacity, lock=False)

        self.buffer = np.frombuffer(self.buffer_raw, dtype=np.float32).reshape((capacity, *obs_shape))
        self.actions = np.frombuffer(self.actions_raw, dtype=np.int64)
        self.rewards = np.frombuffer(self.rewards_raw, dtype=np.float32)
        self.dones = np.frombuffer(self.dones_raw, dtype=np.float32)

        self.idx = Value(c_int, 0)
        self.size = Value(c_int, 0)

    def add(self, obs, action, reward, done):
        with self.lock:
            i = self.idx.value
            self.buffer[i] = obs
            self.actions[i] = action
            self.rewards[i] = reward
            self.dones[i] = done
            self.idx.value = (i + 1) % self.capacity
            self.size.value = min(self.size.value + 1, self.capacity)

    def sample(self, batch_size: int, stacked_frames: int = 1):
        with self.lock:
            size = self.size.value

        if size <= stacked_frames + 1:
            raise ValueError("Not enough transitions to sample")

        indices = np.random.choice(size - stacked_frames - 1, batch_size, replace=False)
        indices = indices[:, np.newaxis]
        indices = (indices + np.arange(stacked_frames))

        states = torch.from_numpy(self.buffer[indices]).view(batch_size, -1)
        next_states = torch.from_numpy(self.buffer[indices + 1]).view(batch_size, -1)
        actions = torch.from_numpy(self.actions[indices[:, -1]]).view(batch_size, -1)
        rewards  = torch.from_numpy(self.rewards[indices[:, -1]]).view(batch_size, -1)
        dones = torch.from_numpy(self.dones[indices[:, -1]]).view(batch_size, -1)
        valid = torch.from_numpy(1. - np.any(self.dones[indices[:, :-1]], axis=-1)).view(batch_size, -1)

        return states, actions, rewards, next_states, dones, valid

    def clear(self):
        with self.lock:
            self.buffer.fill(0)
            self.actions.fill(0)
            self.rewards.fill(0)
            self.dones.fill(0)
            self.idx.value = 0
            self.size.value = 0

    def getidx(self):
        return self.idx.value

    def current_state(self, stacked_frames=1):
        i = self.idx.value
        if i < stacked_frames:
            diff = i - stacked_frames
            return np.concatenate((self.buffer[diff % len(self):], self.buffer[:i]))
        return self.buffer[i - stacked_frames:i]

    def __repr__(self):
        return f"Buffer(capacity={self.capacity}, obs_shape={self.obs_shape})"

    def __str__(self):
        return repr(self)

    def __len__(self):
        return self.capacity


In [5]:
import torch
import torch.nn as nn
import time

class AgentNet(nn.Module):
    def __init__(self, input_size=4, hidden_size=64, output_size=2):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

def sequential_main():
    num_agents = 6
    epochs = 5
    obs_shape = (4,)
    capacity = 500
    batch_size = 32

    # Setup
    buffers = [ClaasyReplayBuffer(capacity, obs_shape) for _ in range(num_agents)]
    models = [AgentNet() for _ in range(num_agents)]
    optimizers = [torch.optim.Adam(model.parameters(), lr=0.001) for model in models]

    # Fill buffers with dummy data
    for buffer in buffers:
        for _ in range(capacity):
            obs = torch.randn(obs_shape).numpy()
            action = int(torch.randint(0, 2, (1,)).item())
            reward = float(torch.rand(1).item())
            done = float(torch.rand(1).item() > 0.95)
            buffer.add(obs, action, reward, done)

    # Benchmark
    start_time = time.time()
    for epoch in range(epochs):
        for agent_id in range(num_agents):
            model = models[agent_id]
            optimizer = optimizers[agent_id]
            buffer = buffers[agent_id]

            states, actions, rewards, next_states, dones, valid = buffer.sample(batch_size)
            output = model(states)
            loss = output.mean()  # dummy loss for speed comparison
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    elapsed = time.time() - start_time
    print(f"[Sequential] Training time for {epochs} epochs and {num_agents} agents: {elapsed:.4f} seconds")

if __name__ == "__main__":
    sequential_main()

[Sequential] Training time for 5 epochs and 6 agents: 0.0958 seconds
