In [None]:
import numpy as np
from soccer_env import soccerenv

# Smoke test: play a handful of episodes with uniformly sampled actions and
# report the final score and per-agent return of each one.
env = soccerenv(render_mode='human')  # set "human" to visualize
num_episodes = 5

for ep in range(1, num_episodes + 1):
    obs, infos = env.reset(options={"use_full_random_positions": True})
    final_score = {"blue": 0, "red": 0}
    episode_returns = dict.fromkeys(env.possible_agents, 0.0)

    # The loop ends once the environment has emptied `env.agents`.
    while env.agents:
        actions = {}
        for agent in env.agents:
            actions[agent] = env.action_space(agent).sample()
        obs, rewards, terminations, truncations, infos = env.step(actions)

        # All agents share identical info each step, so the first entry is enough.
        any_agent = next(iter(infos))
        step_info = infos[any_agent]
        if "score" in step_info:
            final_score = step_info["score"]

        # Accumulate this step's reward into each agent's running return.
        for agent in rewards:
            r = rewards[agent]
            episode_returns[agent] = episode_returns[agent] + float(r)
        env.render()

    print(f"Episode {ep} final score: blue={final_score.get('blue',0)}, red={final_score.get('red',0)}")
    print(f"Episode {ep} returns:", episode_returns)
    print("--------------")

# env.close()

In [None]:
# Release the rollout environment that the previous cell left open.
env.close()

In [None]:
from soccer_env import make_env

# Smoke-test the factory that the vectorized training wrapper will call, then
# release the instance right away instead of leaving it open.
# NOTE(review): assumes make_env() returns an env object exposing close() — confirm.
dummy_env = make_env()
dummy_env.close()

env = soccerenv(render_mode=None)
# Iterate the static `possible_agents` list: no reset() has happened yet, and
# under the PettingZoo convention `env.agents` may be empty until then.
for agent in env.possible_agents:
    obs_space = env.observation_space(agent)
    print(f"Observation space for {agent}: {obs_space}")
env.close()

In [None]:
import os
import random
import time
from dataclasses import dataclass

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal
from torch.utils.tensorboard import SummaryWriter as writer
from soccer_env import make_env

# Imports for PettingZoo and SuperSuit
from soccer_env import soccerenv # Make sure soccer_env.py is in the same directory
from marl_vecenv import SyncMultiAgentVecEnv

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Orthogonally initialize a layer's weight and set its bias to a constant.

    Args:
        layer: module exposing ``weight`` and ``bias`` tensors (e.g. ``nn.Linear``).
        std: gain passed to the orthogonal initializer.
        bias_const: value written into every bias entry.

    Returns:
        The same ``layer`` object, modified in place.
    """
    weight, bias = layer.weight, layer.bias
    torch.nn.init.orthogonal_(weight, gain=std)
    torch.nn.init.constant_(bias, val=bias_const)
    return layer


class Agent(nn.Module):
    """Actor-critic network with a Gaussian policy and optional RPO noise.

    The actor outputs the mean of a diagonal Normal distribution; the log
    standard deviation is a state-independent learned parameter. The critic is
    a separate MLP producing one scalar value per observation.

    Args:
        envs: either a vectorized wrapper exposing ``single_observation_space``
            and ``single_action_space``, or a raw multi-agent env exposing
            ``possible_agents``, ``observation_space(agent)`` and
            ``action_space(agent)``.
        rpo_alpha: half-width of the uniform noise added to the action mean
            when log-probs of given actions are re-evaluated. Defaults to 0.0
            (no perturbation), so evaluation code can omit it.
        device: stored on the instance for backward compatibility; the noise
            itself is now created on the same device as the network output.
    """

    def __init__(self, envs, rpo_alpha=0.0, device='cpu'):
        super().__init__()

        # Check if the env object is the vectorized wrapper or a raw env
        if hasattr(envs, 'single_observation_space'):
            # It's the vectorized wrapper from training
            obs_space = envs.single_observation_space
            act_space = envs.single_action_space
        else:
            # It's a raw PettingZoo env from evaluation
            any_agent = envs.possible_agents[0]
            obs_space = envs.observation_space(any_agent)
            act_space = envs.action_space(any_agent)

        # Plain Python ints: np.prod returns a numpy scalar, which is not a
        # documented size type for nn.Linear / torch.zeros.
        obs_shape = int(np.prod(obs_space.shape))
        act_shape = int(np.prod(act_space.shape))

        # NOTE(review): the 512->256 and 256->128 layers below keep PyTorch's
        # default initialization (they are not wrapped in layer_init); left
        # unchanged here — confirm whether that is intentional.
        self.critic = nn.Sequential(
            layer_init(nn.Linear(obs_shape, 512)),
            nn.Tanh(),
            nn.Linear(512,256),
            nn.Tanh(),
            nn.Linear(256,128),
            nn.Tanh(),
            layer_init(nn.Linear(128, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 1), std=1.0),
        )
        self.actor_mean = nn.Sequential(
            layer_init(nn.Linear(obs_shape, 512)),
            nn.Tanh(),
            nn.Linear(512,256),
            nn.Tanh(),
            nn.Linear(256,128),
            nn.Tanh(),
            layer_init(nn.Linear(128, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, act_shape), std=0.01),
        )
        self.actor_logstd = nn.Parameter(torch.zeros(1, act_shape))
        self.rpo_alpha = rpo_alpha
        self.device=device

    def get_value(self, x):
        """Return the critic's value estimate for a batch of observations."""
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Sample (or re-evaluate) actions and return policy/critic outputs.

        Args:
            x: batch of observations, one row per sample.
            action: if ``None`` an action is sampled from the policy; otherwise
                the given actions are evaluated under a policy whose mean has
                been perturbed by uniform noise in ``[-rpo_alpha, rpo_alpha]``.

        Returns:
            Tuple ``(action, log_prob, entropy, value)`` where ``log_prob`` and
            ``entropy`` are summed over the action dimension (dim 1).
        """
        action_mean = self.actor_mean(x)
        action_logstd = self.actor_logstd.expand_as(action_mean)
        action_std = torch.exp(action_logstd)
        probs = Normal(action_mean, action_std)
        if action is None:
            action = probs.sample()
        else:  # new to RPO
            # Perturb the mean to add stochasticity to the policy. The noise is
            # allocated like `action_mean`, so it always lives on the same
            # device/dtype as the network output, whatever `self.device` says.
            z = torch.empty_like(action_mean).uniform_(-self.rpo_alpha, self.rpo_alpha)
            action_mean = action_mean + z
            probs = Normal(action_mean, action_std)
        return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(x)

    def get_deterministic_action(self, x):
        """Return the policy mean for ``x`` (no sampling, no RPO noise)."""
        action_mean = self.actor_mean(x)
        return action_mean # Return the mean directly

@dataclass
class Config:
    """Hyperparameters and run settings read by `train()` and the setup cells.

    Field order is part of the constructor interface, so it is left as is.
    """
    # General parameters
    exp_name: str = "ppo_pettingzoo_soccer"  # used in the checkpoint file name
    seed: int = 19  # passed to envs.reset(seed=...)
    # NOTE(review): `torch_deterministic` and `cuda` are not referenced by the
    # training code visible in this notebook.
    torch_deterministic: bool = True
    cuda: bool = True
    
    # Algorithm specific arguments
    total_timesteps: int = int(20e6)  # divided by num_envs * num_steps to get the iteration count
    learning_rate: float = 2e-4  # initial Adam learning rate
    num_steps: int = 4096 # The number of steps to run in each environment per policy rollout
    anneal_lr: bool = True  # linearly decay the learning rate over iterations (train() floors it at 3e-5)
    gamma: float = 0.995  # discount factor used in the GAE recursion
    gae_lambda: float = 0.95  # GAE lambda
    num_minibatches: int = 16  # minibatch_size = batch_size // num_minibatches
    update_epochs: int = 8  # passes over the rollout batch per iteration
    norm_adv: bool = True  # standardize advantages within each minibatch
    clip_coef: float = 0.2  # clip range for the policy ratio and for the clipped value loss
    clip_vloss: bool = True  # use the clipped value-loss variant
    ent_coef: float = 0.01  # weight of the entropy bonus in the loss
    vf_coef: float = 0.7  # weight of the value loss in the loss
    max_grad_norm: float = 0.5  # gradient-norm clipping threshold
    # NOTE(review): annotated `float` but defaults to None, and not referenced
    # by the training code visible in this notebook.
    target_kl: float = None
    num_envs: int = 8  # number of environment copies in the vectorized wrapper
    rpo_alpha: float = 0.0  # half-width of the uniform noise Agent adds to the action mean

    # These are computed in runtime
    # (train() overwrites all three from the fields above)
    batch_size: int = 0
    minibatch_size: int = 0
    num_iterations: int = 0

    save_model: bool = True  # write the model and normalizer stats after every iteration
    env_id: str = 'SoccerTwos'  # NOTE(review): not referenced by the code visible in this notebook

# Instantiate the config
config = Config()

# Seed every RNG the notebook draws from. Training samples opponent actions
# with numpy and policy actions with torch, so without this `config.seed` only
# reaches the environments and runs are not repeatable.
random.seed(config.seed)
np.random.seed(config.seed)
torch.manual_seed(config.seed)
torch.backends.cudnn.deterministic = config.torch_deterministic

# One constructor per parallel environment copy
env_fns = [make_env for _ in range(config.num_envs)]

# Synchronous vectorized wrapper. It exposes `single_observation_space` /
# `single_action_space`, which the Agent and the rollout buffers rely on.
envs = SyncMultiAgentVecEnv(env_fns)

# Observations are indexed as (env, agent, ...) by the training loop.
observations = envs.reset(seed=config.seed)
print("Shape of observations:", observations.shape)

In [None]:
 # Display the per-agent observation shape reported by the vectorized wrapper.
 envs.single_observation_space.shape

In [None]:
class RunningMeanStd:
    """
    Tracks the running mean and (population) variance of a data stream.

    Each call to `update` merges the mean/variance of a whole batch into the
    running statistics using the parallel (batched) form of the online
    variance update, so the result equals the statistics of all rows seen.
    """
    def __init__(self, shape):
        """Start with zero mean, unit variance and no samples seen."""
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        self.count = 0

    def update(self, x):
        """Merge a batch into the running statistics.

        Args:
            x: array-like whose first axis indexes samples; the remaining
               axes must match the `shape` given at construction.

        An empty batch is ignored: taking its mean would yield NaN and
        permanently poison the running statistics.
        """
        x = np.asarray(x)
        batch_count = x.shape[0]
        if batch_count == 0:
            return

        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)

        delta = batch_mean - self.mean
        tot_count = self.count + batch_count

        # Update the mean (out of place, so an array assigned to `self.mean`
        # from outside, e.g. loaded statistics, is never mutated)
        self.mean = self.mean + delta * batch_count / tot_count

        # Update the variance: combine the two sums of squared deviations and
        # add the correction term for the shift between the two means
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        M2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.var = M2 / tot_count

        # Update the count
        self.count = tot_count

    @property
    def std(self):
        """Standard deviation derived from the running variance."""
        return np.sqrt(self.var)

In [None]:
import torch
import torch.optim as optim
import numpy as np
import time

# Assume these are defined elsewhere:
# from cleanrl_utils.experimental import Agent
# writer = SummaryWriter(f"runs/{run_name}")

import torch
import torch.optim as optim
import numpy as np
import time
from torch.utils.tensorboard import SummaryWriter

def train(args, envs, device='cpu', run_name="run5",saved_normalizer=None,model_path=None):
    """Train one shared PPO/RPO policy for "agent_0" and "agent_1".

    The remaining agents act uniformly at random in [-1, 1]. Observations are
    standardized with a running mean/std before they reach the network.

    Args:
        args: config object (see `Config`). Its `batch_size`,
            `minibatch_size` and `num_iterations` fields are overwritten here.
        envs: vectorized multi-agent env exposing `possible_agents`,
            `single_observation_space`, `single_action_space`,
            `reset(seed=...)`, `step(actions)` and `close()`. Arrays returned
            by it are indexed as (env, agent, ...).
        device: torch device for the network and the rollout buffers.
        run_name: sub-directory of `runs/` for TensorBoard logs, the model
            checkpoint and the normalizer statistics.
        saved_normalizer: optional path to an `.npz` with `mean` and `var`
            (and optionally `count`) to initialize the observation normalizer.
        model_path: optional path to a saved `state_dict` to resume from.

    Returns:
        None. Side effects: writes logs/checkpoints under `runs/{run_name}`
        and closes `envs` when training finishes.
    """
    # --- SETUP: Define agent configuration and batch sizes ---
    writer = SummaryWriter(f"runs/{run_name}")

    trainable_agent_ids = ["agent_0", "agent_1"]
    possible_agents = envs.possible_agents
    trainable_agent_indices = [possible_agents.index(agent_id) for agent_id in trainable_agent_ids]
    random_agent_indices = [i for i in range(len(possible_agents)) if i not in trainable_agent_indices]
    num_trainable_agents = len(trainable_agent_ids)

    normalizer_path = f"runs/{run_name}/latest_normalizer_stats.npz"

    args.batch_size = int(args.num_envs * args.num_steps * num_trainable_agents)
    args.minibatch_size = int(args.batch_size // args.num_minibatches)
    args.num_iterations = args.total_timesteps // (args.num_envs * args.num_steps)

    # Forward `device` so the agent's RPO noise is created where the model lives.
    agent = Agent(envs, args.rpo_alpha, device=device).to(device)
    if model_path:
        agent.load_state_dict(torch.load(model_path, map_location=device))
    optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)

    # --- STORAGE: Sized for trainable agents only ---
    obs = torch.zeros((args.num_steps, args.num_envs, num_trainable_agents) + envs.single_observation_space.shape).to(device)
    actions = torch.zeros((args.num_steps, args.num_envs, num_trainable_agents) + envs.single_action_space.shape).to(device)
    logprobs = torch.zeros((args.num_steps, args.num_envs, num_trainable_agents)).to(device)
    rewards = torch.zeros((args.num_steps, args.num_envs, num_trainable_agents)).to(device)
    dones = torch.zeros((args.num_steps, args.num_envs, num_trainable_agents)).to(device)
    values = torch.zeros((args.num_steps, args.num_envs, num_trainable_agents)).to(device)

    # --- INITIALIZATION ---
    global_step = 0
    start_time = time.time()
    next_obs_all_agents = torch.Tensor(envs.reset(seed=args.seed)).to(device)
    next_obs = next_obs_all_agents[:, trainable_agent_indices]
    next_done = torch.zeros(args.num_envs, num_trainable_agents).to(device)

    obs_normalizer = RunningMeanStd(shape=envs.single_observation_space.shape)
    if saved_normalizer:
        stats = np.load(saved_normalizer)
        # Load into the normalizer itself (the stats used to be assigned to the
        # `obs` buffer tensor, so they were silently ignored).
        obs_normalizer.mean = stats['mean'].astype('float64')
        obs_normalizer.var = stats['var'].astype('float64')
        # Without a sample count the first update would replace the loaded
        # stats outright. Files written before `count` was stored are given
        # the weight of one rollout batch.
        if 'count' in stats.files:
            obs_normalizer.count = float(stats['count'])
        else:
            obs_normalizer.count = args.batch_size

    game_rewards=torch.zeros_like(rewards[0])
    for iteration in range(1, args.num_iterations + 1):
        if args.anneal_lr:
            frac = 1.0 - (iteration - 1.0) / args.num_iterations
            lrnow = frac * args.learning_rate
            # Linear decay, floored at 3e-5
            optimizer.param_groups[0]["lr"] = max(lrnow,3e-5)

        # --- ROLLOUT PHASE ---
        score = [{"blue": 0, "red": 0} for _ in range(args.num_envs)]
        games=0
        # Same device as `game_rewards`, which is accumulated into it below
        rw = torch.zeros(num_trainable_agents, device=device)
        for step in range(0, args.num_steps):
            global_step += args.num_envs
            obs[step] = next_obs
            dones[step] = next_done

            # Action selection and reshaping
            with torch.no_grad():
                agent_input = next_obs.reshape(-1, *envs.single_observation_space.shape)
                normalized_input = np.clip((agent_input.cpu().numpy() - obs_normalizer.mean) / (obs_normalizer.std + 1e-8), -10, 10)
                # The numpy round-trip lands on the CPU; move the input back to the model's device
                action, logprob, _, value = agent.get_action_and_value(
                    torch.tensor(normalized_input, dtype=torch.float, device=device))
                values[step] = value.reshape(args.num_envs, num_trainable_agents)
                actions[step] = action.reshape(args.num_envs, num_trainable_agents, *envs.single_action_space.shape)
                logprobs[step] = logprob.reshape(args.num_envs, num_trainable_agents)

            # Combine policy actions with random actions
            full_actions = np.zeros((args.num_envs, len(possible_agents), *envs.single_action_space.shape), dtype=np.float32)
            full_actions[:, trainable_agent_indices] = actions[step].cpu().numpy()
            random_actions = np.random.uniform(-1.0, 1.0, size=(args.num_envs, len(random_agent_indices), *envs.single_action_space.shape))
            full_actions[:, random_agent_indices] = random_actions

            # Environment step
            next_obs_all, reward_all, term_all, trunc_all, infos = envs.step(full_actions)

            # Data selection for trainable agents
            reward_trainable = reward_all[:, trainable_agent_indices]
            next_obs = torch.Tensor(next_obs_all[:, trainable_agent_indices]).to(device)
            next_done = torch.Tensor(np.logical_or(term_all[:, trainable_agent_indices], trunc_all[:, trainable_agent_indices])).to(device)
            rewards[step] = torch.tensor(reward_trainable).to(device)

            game_rewards+=rewards[step]

            # NOTE(review): a truncation in ANY env closes the running game for
            # ALL envs here, i.e. this presumes every env truncates on the same
            # step — confirm against the environment's episode length.
            if np.any(trunc_all):
                games+=1
                rw+=game_rewards.mean(dim=0)
                game_rewards=torch.zeros_like(rewards[0])
                any_agent = next(iter(infos[0]))
                for ind,info in enumerate(infos):
                    score[ind]['blue']+=info[any_agent]['score']['blue']
                    score[ind]['red']+=info[any_agent]['score']['red']


        blue_avg = sum([s.get('blue',0) for s in score])/args.num_envs
        red_avg = sum([s.get('red',0) for s in score])/args.num_envs
        #writer.add_scalar("charts/blue_score", blue_avg, global_step)
        #writer.add_scalar("charts/red_score", red_avg, global_step)
        #rw_sum = rewards.sum(dim=(0, 1))
        # Only log when at least one game finished; otherwise this is 0/0
        if games > 0:
            writer.add_scalar("charts/avg_agent0_return", (rw[0]/games).item(), global_step)
            writer.add_scalar("charts/avg_agent1_return", (rw[1]/games).item(), global_step)

        # --- GAE AND VALUE BOOTSTRAPPING ---
        # The normalizer is NOT updated yet: the stored log-probs and values
        # were produced with the current statistics, so the PPO update must
        # re-normalize the stored observations with those same statistics.
        with torch.no_grad():
            mean_t = torch.tensor(obs_normalizer.mean, dtype=torch.float32, device=device)
            std_t = torch.tensor(obs_normalizer.std, dtype=torch.float32, device=device)

            writer.add_scalar("stats/running_mean", mean_t.mean(), global_step)
            writer.add_scalar("stats/running_std", std_t.mean(), global_step)

            normalized_obs = (obs - mean_t) / \
                     (std_t + 1e-8)
            normalized_obs = torch.clamp(normalized_obs, -10, 10)

            norm_next_obs = (next_obs - mean_t) / (std_t + 1e-8)
            norm_next_obs = torch.clamp(norm_next_obs, -10, 10)

            agent_input = norm_next_obs.reshape(-1, *envs.single_observation_space.shape)

            next_value = agent.get_value(agent_input).reshape(args.num_envs, num_trainable_agents)
            advantages = torch.zeros_like(rewards).to(device)
            lastgaelam = 0
            for t in reversed(range(args.num_steps)):
                if t == args.num_steps - 1:
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    nextvalues = values[t + 1]
                # NOTE(review): the bootstrap term is not masked by
                # `nextnonterminal` (the masked form is kept below, commented
                # out), so values are bootstrapped across episode boundaries —
                # confirm this is the intended treatment of episode ends.
                delta = rewards[t] + args.gamma * nextvalues - values[t]
                #delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
            returns = advantages + values

        # --- BATCH FLATTENING ---
        b_obs = normalized_obs.reshape((-1,) + envs.single_observation_space.shape)
        b_logprobs = logprobs.reshape(-1)
        b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
        b_advantages = advantages.reshape(-1)
        b_returns = returns.reshape(-1)
        b_values = values.reshape(-1)

        # --- PPO UPDATE PHASE ---
        b_inds = np.arange(b_obs.shape[0])
        clipfracs = []
        for _ in range(args.update_epochs):
            np.random.shuffle(b_inds)
            for start in range(0, b_obs.shape[0], args.minibatch_size):
                end = start + args.minibatch_size
                mb_inds = b_inds[start:end]

                _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions[mb_inds])
                logratio = newlogprob - b_logprobs[mb_inds]
                ratio = logratio.exp()

                with torch.no_grad():
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]

                mb_advantages = b_advantages[mb_inds]
                if args.norm_adv:
                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

                # Clipped surrogate objective (pessimistic of clipped / unclipped)
                pg_loss1 = -mb_advantages * ratio
                pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef)
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # NOTE(review): value targets are standardized with whole-batch
                # return statistics, while `returns` / GAE above use the critic
                # output unscaled — confirm this mix of scales is intended.
                mb_returns = b_returns[mb_inds]
                norm_mb_returns = (mb_returns - b_returns.mean()) / (b_returns.std() + 1e-8)

                newvalue = newvalue.view(-1)
                if args.clip_vloss:
                    v_loss_unclipped = (newvalue - norm_mb_returns) ** 2
                    v_clipped = b_values[mb_inds] + torch.clamp(
                        newvalue - b_values[mb_inds], -args.clip_coef, args.clip_coef)
                    v_loss_clipped = (v_clipped - norm_mb_returns) ** 2
                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                    v_loss = 0.5 * v_loss_max.mean()
                else:
                    v_loss = 0.5 * ((newvalue - norm_mb_returns) ** 2).mean()

                entropy_loss = entropy.mean()
                loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef

                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                optimizer.step()

        # Fold this rollout's raw observations into the running statistics only
        # now, so the next rollout (and its update) share one set of statistics.
        obs_normalizer.update(obs.cpu().numpy().reshape(-1, *envs.single_observation_space.shape))

        y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
        var_y = np.var(y_true)
        explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

        # --- LOGGING TRAINING LOSSES ---
        writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
        writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
        writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
        writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
        writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
        writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
        writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
        writer.add_scalar("losses/explained_variance", explained_var, global_step)
        #writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)
        if args.save_model:
            # Separate name so the `model_path` argument is not shadowed
            save_path = f"runs/{run_name}/{args.exp_name}.ppo_model"
            torch.save(agent.state_dict(), save_path)
            # `count` is stored as well so a resumed run can weight the loaded
            # statistics correctly against new data
            np.savez(
                normalizer_path,
                mean=obs_normalizer.mean,
                var=obs_normalizer.var,
                count=obs_normalizer.count
            )


    envs.close()
    writer.close()

# import sys

# module_directory_path = os.path.abspath('cleanrl')
# if module_directory_path not in sys.path:
#     sys.path.append(module_directory_path)

# Resume from the run4 artifacts: its observation-normalizer statistics and
# its model weights. train() writes its own outputs under its default run name.
saved_normalizer = "runs/run4/latest_normalizer_stats.npz"
model_path = "runs/run4/ppo_pettingzoo_soccer.ppo_model"

train(
    config,
    envs,
    saved_normalizer=saved_normalizer,
    model_path=model_path,
)

In [None]:
# NOTE(review): train() keeps its network in a local variable and returns
# nothing, so the module-level `agent` used here is whatever an earlier cell
# last bound to that name (the rollout loops above bind it to agent-id strings),
# not the freshly trained model. `model_path` also still points at the run4
# checkpoint that training was resumed from, so a successful save would
# overwrite it. train() already writes a checkpoint every iteration when
# `save_model` is set.
torch.save(agent.state_dict(), model_path)

In [None]:
import torch
import numpy as np
from soccer_env import soccerenv
import time
# Make sure your Agent class is imported or defined in this file
#from your_training_script import Agent, layer_init 

# --- 1. SETUP AND LOAD THE MODEL ---

# The device to run on (e.g., "cpu" or "cuda")
device = torch.device("cpu")

# A throwaway environment is only needed to read the observation/action
# spaces that size the network, so it is created without a render window.
dummy_env = soccerenv(render_mode=None)
agent = Agent(dummy_env, config.rpo_alpha, device=device).to(device)
dummy_env.close()

# Load the saved weights from your training run
run_dir = "runs/run3"  # <--- CHANGE THIS
model_path = f"{run_dir}/{config.exp_name}.ppo_model"
agent.load_state_dict(torch.load(model_path, map_location=device))
agent.eval()  # Set the agent to evaluation mode

# The policy was trained on standardized, clipped observations, so evaluation
# must apply the statistics that training saved next to the model.
normalizer_path = f"{run_dir}/latest_normalizer_stats.npz"
try:
    normalizer_stats = np.load(normalizer_path)
    obs_mean = normalizer_stats["mean"]
    obs_std = np.sqrt(normalizer_stats["var"])
except FileNotFoundError:
    print(f"WARNING: {normalizer_path} not found; feeding raw observations to the policy")
    obs_mean, obs_std = None, None

# --- 2. RUN THE EVALUATION LOOP ---

env = soccerenv(render_mode="human") # Set "human" to visualize, "None" to run faster
num_episodes = 10

trainable_agent_ids = ["agent_0", "agent_1"]
possible_agents = env.possible_agents
trainable_agent_indices = [possible_agents.index(a) for a in trainable_agent_ids]
red_agent_ids = [a for a in possible_agents if a not in trainable_agent_ids]

act_shape = env.action_space(possible_agents[0]).shape

for ep in range(num_episodes):
    obs, infos = env.reset()
    final_score = {"blue": 0, "red": 0}
    ep_ret = {agent_id: 0.0 for agent_id in env.possible_agents}

    while env.agents:
        # One row per policy-controlled agent, in `trainable_agent_ids` order
        obs_batch = np.stack([obs[a] for a in trainable_agent_ids]).astype(np.float32)
        if obs_mean is not None:
            # Same transform as the training rollout: standardize, then clip to [-10, 10]
            obs_batch = np.clip((obs_batch - obs_mean) / (obs_std + 1e-8), -10, 10)
        obs_tensor = torch.tensor(obs_batch, dtype=torch.float32, device=device)

        with torch.no_grad():
            policy_actions = agent.get_deterministic_action(obs_tensor).cpu().numpy()

        actions = {}
        for agent_id in possible_agents:
            if agent_id in trainable_agent_ids:
                row = trainable_agent_ids.index(agent_id)
                actions[agent_id] = policy_actions[row].astype(np.float32)
            else:
                # Opponents act uniformly at random, as during training
                actions[agent_id] = np.random.uniform(-1.0, 1.0, size=act_shape).astype(np.float32)

        obs, rewards, terminations, truncations, infos = env.step(actions)
        for a, r in rewards.items():
            ep_ret[a] += float(r)
        env.render()  # once per step (it used to be called twice)
        time.sleep(1/60)

        any_agent = next(iter(infos))
        if "score" in infos[any_agent]:
            final_score = infos[any_agent]["score"]

    blue_sum = ep_ret.get("agent_0", 0.0) + ep_ret.get("agent_1", 0.0)
    #print(f"Episode {ep}: per-agent={ep_ret}, blue_sum={blue_sum:.4f}")
    #print(f"Episode {ep+1} final score: blue={final_score.get('blue',0)}, red={final_score.get('red',0)}")

env.close()