In [27]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from collections import deque
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
import random

In [28]:
class MultiAgentMazeEnv:
    """Grid maze in which several agents race to reach their own goal cell.

    Cells are addressed as ``(row, col)`` tuples. Each call to :meth:`step`
    moves every agent by one cell (or leaves it in place when the move is
    blocked) and ends the episode as soon as at least one agent stands on its
    goal.

    Args:
        size: ``(rows, cols)`` of the grid.
        starts: one start cell per agent; its length defines ``n_agents``.
        goals: one goal cell per agent.
        walls: cells that cannot be entered (``None`` means no walls).

    NOTE(review): the default ``goals`` of ``(5, 5)`` lie outside the default
    5x5 grid, so they can never be reached and :meth:`render` will raise an
    ``IndexError`` for them. The defaults are kept for interface
    compatibility; pass explicit goals (as ``random_maze_env`` does).
    """

    # (row, col) offset applied by each discrete action id.
    _MOVES = {
        0: (-1, 0),  # up
        1: (1, 0),   # down
        2: (0, -1),  # left
        3: (0, 1),   # right
    }

    def __init__(self, size=(5, 5), starts=[(0, 0), (0, 4)], goals=[(5, 5), (5, 5)], walls=None):
        self.size = size
        # Normalise every cell to a tuple (and copy the containers) so that the
        # shared default lists are never aliased and the tuple comparisons in
        # step() also work when callers pass cells as lists.
        self.starts = [tuple(cell) for cell in starts]
        self.goals = [tuple(cell) for cell in goals]
        self.walls = [tuple(cell) for cell in walls] if walls else []
        self.n_agents = len(self.starts)
        self.reset()

    def reset(self):
        """Put every agent back on its start cell and return the positions as tuples."""
        self.agent_positions = [list(start) for start in self.starts]
        return [tuple(pos) for pos in self.agent_positions]

    def step(self, actions, weights=None):
        """Apply one action per agent and resolve conflicts.

        Agents are processed one after another in a weighted random order
        (sampling without replacement); an agent with a larger weight is more
        likely to be processed early and therefore to win a contested cell.

        Args:
            actions: list of integers (0=up, 1=down, 2=left, 3=right).
            weights: optional non-negative priority per agent. ``None`` means
                equal weights. If the remaining weights sum to zero the order
                among those agents is drawn uniformly.

        Returns:
            ``(new_positions, rewards, dones)`` where ``new_positions`` is a
            list of ``(row, col)`` int tuples, ``rewards`` a float array and
            ``dones`` a bool array, each with one entry per agent.
        """
        n = len(actions)

        if weights is None:
            remaining_weights = np.ones(n, dtype=float)  # equal weights
        else:
            remaining_weights = np.array(weights, dtype=float)

        # --- Weighted random processing order (no replacement) ---
        order = []
        remaining_agents = list(range(n))
        while remaining_agents:
            total = remaining_weights.sum()
            # p=None makes np.random.choice uniform; avoids NaN probabilities
            # when every remaining weight is zero.
            probs = remaining_weights / total if total > 0 else None
            idx = np.random.choice(len(remaining_agents), p=probs)
            order.append(remaining_agents.pop(idx))
            remaining_weights = np.delete(remaining_weights, idx)

        rows, cols = self.size[0], self.size[1]
        # Positions before this step, always as int tuples so the returned
        # states have one consistent type on every code path.
        previous = [tuple(map(int, pos)) for pos in self.agent_positions]
        # (-1, -1) marks "not resolved yet"; it can never equal a valid cell.
        new_positions = [(-1, -1)] * n

        for i in order:
            d_row, d_col = self._MOVES[actions[i]]
            target = (previous[i][0] + d_row, previous[i][1] + d_col)
            in_bounds = 0 <= target[0] < rows and 0 <= target[1] < cols

            if in_bounds and target not in self.walls:
                if target not in new_positions:
                    new_positions[i] = target        # free cell: move
                else:
                    new_positions[i] = previous[i]   # cell already claimed: stay
            elif previous[i] in new_positions:
                # This agent is blocked and cannot vacate its cell, but an
                # earlier agent already moved into it: undo the whole step.
                new_positions = list(previous)
                break
            else:
                new_positions[i] = previous[i]       # blocked by wall/border: stay

        self.agent_positions = new_positions

        rewards = np.zeros(n, dtype=float)
        dones = np.zeros(n, dtype=bool)
        # who reached their goal this step?
        reached = [i for i in range(n) if tuple(new_positions[i]) == self.goals[i]]

        if len(reached) == 1:
            # single winner: +1 for the winner, -1 for everyone else
            rewards[:] = -1.0
            rewards[reached[0]] = 1.0
            dones[:] = True  # end episode after a win
        elif len(reached) >= 2:
            # simultaneous arrival: tie -> zero rewards, end episode
            dones[:] = True
        # otherwise nobody reached a goal: rewards stay 0 and the episode goes on

        return new_positions, rewards, dones

    def render(self):
        """Print the grid: ``#`` walls, coloured goal indices, plain agent indices."""
        grid = np.full(self.size, " ", dtype=object)

        # Draw walls
        for w in self.walls:
            grid[w] = "#"

        # Draw numbered goals (colored numbers)
        colors = ["\033[92m", "\033[94m", "\033[91m", "\033[93m"]  # green, blue, red, yellow
        for i, g in enumerate(self.goals):
            color = colors[i % len(colors)]
            grid[g] = f"{color}{i}\033[0m"

        # Draw numbered agents (drawn last, so an agent hides a goal beneath it)
        for i, pos in enumerate(self.agent_positions):
            grid[tuple(pos)] = f"{i}"

        # Print
        for row in grid:
            print(" ".join(row))
        print("----------")



def random_maze_env(size=(5,5), n_agents=1, n_walls=6, seed=None):
    """
    Create a MultiAgentMazeEnv with randomly placed starts, goals and walls.

    Seeds the global ``random`` module (with ``seed`` if given, otherwise
    with a freshly drawn seed that is printed so the maze can be rebuilt).

    Args:
        size (tuple): grid size (rows, cols)
        n_agents (int): number of agents
        n_walls (int): requested number of wall cells (capped at the number
            of cells not used by starts/goals)
        seed (int or None): random seed for reproducibility

    Returns:
        env: a MultiAgentMazeEnv instance
    """
    if seed is None:
        # No seed supplied: draw one and report it.
        seed = random.randint(0, int(1e9))
        random.seed(seed)
        print("Generated seed:", seed)
    else:
        random.seed(seed)

    n_rows, n_cols = size

    # Enumerate every cell in row-major order.
    grid_cells = []
    for r in range(n_rows):
        for c in range(n_cols):
            grid_cells.append((r, c))

    # Starts first, then goals from whatever cells are left.
    starts = random.sample(grid_cells, n_agents)
    free_after_starts = [cell for cell in grid_cells if cell not in starts]
    goals = random.sample(free_after_starts, n_agents)

    # Walls may not sit on a start or a goal.
    reserved = set(starts) | set(goals)
    wall_pool = [cell for cell in grid_cells if cell not in reserved]
    wall_count = min(n_walls, len(wall_pool))
    walls = random.sample(wall_pool, wall_count)

    return MultiAgentMazeEnv(size=size, starts=starts, goals=goals, walls=walls)

In [50]:
def solution(env, scores, policies, max_steps=20):
    """
    Render one greedy rollout of the given policies in ``env``.

    Works with both policy kinds used in this notebook: objects exposing
    ``_phi`` and ``W`` (the LinearPolicy case) and callables that map a
    ``(1, state_size)`` float tensor to action probabilities (the neural
    network case). Every agent takes its highest-probability action at each
    step. The rollout stops after ``max_steps`` steps or as soon as the
    environment reports any agent as done.

    Args:
        env: environment providing ``n_agents``, ``reset()``, ``step(actions)``
            and ``render()``.
        scores: training scores; currently unused (kept for the optional
            training-progress plot and for caller compatibility).
        policies: one policy per agent.
        max_steps: maximum number of environment steps to roll out.

    Returns:
        None. The grid is printed via ``env.render()`` before the first step
        and after every step.
    """

    # Optional: Plot training progress
    # plot_training_progress(scores)

    # --- Run one greedy rollout ---
    states = env.reset()
    env.render()

    # Pure evaluation: no autograd graph is needed for greedy action selection.
    with torch.no_grad():
        for _ in range(max_steps):
            actions = []
            for i in range(env.n_agents):
                policy = policies[i]
                # Detect policy type and get greedy action accordingly
                if hasattr(policy, '_phi'):
                    # LinearPolicy case - features times weight matrix
                    logits = policy._phi(states[i]) @ policy.W
                    probs = torch.softmax(logits, dim=0)
                    action = torch.argmax(probs, dim=0).item()
                else:
                    # Neural network policy case - call directly on a batch of one
                    state_tensor = torch.tensor(states[i], dtype=torch.float32).unsqueeze(0)
                    probs = policy(state_tensor)
                    action = torch.argmax(probs, dim=1).item()

                actions.append(action)

            states, _step_rewards, dones = env.step(actions)
            env.render()
            if any(dones):
                break

In [51]:
class PolicyNet(nn.Module):
    """Policy network: one hidden ReLU layer followed by a softmax over actions.

    The output layer starts with all-zero weights and bias, so a freshly
    built network assigns the same probability to every action.
    """

    def __init__(self, state_size=2, action_size=4, hidden_size=32):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_size)

        # Zero the output layer -> uniform initial action probabilities.
        for tensor in (self.fc2.weight, self.fc2.bias):
            nn.init.zeros_(tensor)

    def forward(self, state):
        """Return action probabilities for a batch of states (softmax over dim 1)."""
        hidden = F.relu(self.fc1(state))
        logits = self.fc2(hidden)
        return F.softmax(logits, dim=1)

    def act(self, state):
        """Sample an action for a single state.

        Returns:
            ``(action, log_prob, entropy)``: the sampled action as a Python
            int, plus the log-probability of that action and the entropy of
            the action distribution as tensors.
        """
        features = np.array(state, dtype=np.float32)
        # Add a leading batch dimension of size one before the forward pass.
        batch = torch.from_numpy(features).float().unsqueeze(0).to("cpu")
        dist = Categorical(self.forward(batch).cpu())
        action = dist.sample()
        return action.item(), dist.log_prob(action), dist.entropy()

class LinearPolicy:
    """Softmax policy that is linear in the (scaled) state plus a bias feature.

    This cell used to define ``LinearPolicy`` twice; the first definition was
    silently shadowed by the second. Only the effective (epsilon-greedy)
    version is kept, so behaviour after running the cell is unchanged.

    Args:
        state_size: number of state components (the weight matrix has one
            extra row for the bias feature).
        action_size: number of discrete actions.
        maze_size: per-component divisor used to scale the state; ``None``
            means no scaling.
        epsilon: probability of drawing a uniformly random action in training.
        epsilon_decay: multiplicative factor applied by :meth:`decay_epsilon`.
        min_epsilon: lower bound for epsilon during decay.
    """

    def __init__(self, state_size=2, action_size=4, maze_size=None, epsilon=0.1, epsilon_decay=0.995, min_epsilon=0.1):
        self.action_size = action_size
        self.maze_size = np.array(maze_size if maze_size is not None else [1,1], dtype=np.float32)
        # Zero weights -> uniform action probabilities before any training.
        self.W = torch.zeros(state_size + 1, action_size, requires_grad=True)

        # Epsilon-greedy parameters
        self.epsilon = epsilon
        self.initial_epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon

    def _phi(self, state):
        """Feature vector: state divided by ``maze_size`` with a constant 1.0 appended."""
        s = np.array(state, dtype=np.float32) / self.maze_size
        phi = np.append(s, 1.0)
        return torch.from_numpy(phi).float()

    def act(self, state, training=True):
        """Choose an action for ``state``.

        With probability ``epsilon`` (only when ``training`` is true) the
        action is drawn uniformly at random; otherwise it is sampled from the
        policy's softmax distribution.

        Returns:
            ``(action, log_prob, entropy)`` where ``log_prob`` and ``entropy``
            are always taken from the policy distribution, also for
            exploratory actions, so gradients flow through ``W``.
        """
        phi = self._phi(state)
        logits = phi @ self.W
        probs = torch.softmax(logits, dim=0)
        dist = Categorical(probs)

        if training and np.random.random() < self.epsilon:
            # Exploration: sample from a uniform distribution over actions
            explore_probs = torch.ones(self.action_size) / self.action_size
            a = Categorical(explore_probs).sample()
        else:
            # Exploitation: sample from policy
            a = dist.sample()

        # Always get log_prob from the original policy distribution for proper gradients
        log_prob = dist.log_prob(a)
        entropy = dist.entropy()

        return a.item(), log_prob, entropy

    def decay_epsilon(self):
        """Decay epsilon after each episode"""
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

    def reset_epsilon(self):
        """Reset epsilon to initial value"""
        self.epsilon = self.initial_epsilon

    def parameters(self):
        """Return the trainable tensors (mirrors ``nn.Module.parameters`` for optimizers)."""
        return [self.W]
        

In [53]:
def check_convergence(agent_id, policies, old_params, policy_change_history, 
                     reward_history, scores, policy_thresh, reward_thresh, window):
    """Record convergence statistics for one agent and report policy stability.

    Side effects:
        * appends the parameter change between ``old_params[agent_id]`` and
          the agent's current parameters to ``policy_change_history[agent_id]``;
        * once ``scores`` holds at least ``window`` entries, appends the
          standard deviation of the agent's last ``window`` scores to
          ``reward_history[agent_id]``.

    Returns:
        True when the agent's three most recent recorded parameter changes
        are all below ``policy_thresh``; False otherwise (including when
        fewer than three changes have been recorded).

    The reward-stability statistic is only logged; it is not part of the
    decision, so ``reward_thresh`` is currently unused.
    """
    # 1. Policy parameter change since the stored snapshot
    latest = [param.data for param in policies[agent_id].parameters()]
    changes = policy_change_history[agent_id]
    changes.append(compute_parameter_change(old_params[agent_id], latest))

    # 2. Reward spread over the most recent window (bookkeeping only)
    if len(scores) >= window:
        recent = [episode[agent_id] for episode in scores[-window:]]
        reward_history[agent_id].append(np.std(recent))

    # 3. Converged only if the last three changes are all under the threshold
    if len(changes) < 3:
        return False
    return all(delta < policy_thresh for delta in changes[-3:])


def check_round_convergence(agent_id, round_policy_changes, 
                           policy_thresh):
    """
    BETWEEN-ROUND convergence test for a single agent.

    Returns True when the agent has at least two recorded round-to-round
    policy changes and the two most recent ones are both strictly below
    ``policy_thresh``; otherwise returns False.
    """
    history = round_policy_changes[agent_id]

    # Fewer than two round comparisons: not enough evidence yet.
    if len(history) < 2:
        return False

    # Any of the last two changes at or above the threshold means "still moving".
    for change in history[-2:]:
        if not change < policy_thresh:
            return False
    return True

In [60]:
def reinforce_multi_rwd2go_alt(env, policies, optimizers, n_episodes=100, max_t=20, gamma=0.9, batch_size=500, print_every=10, entropy_coef=0.0):
    """Alternating multi-agent REINFORCE with rewards-to-go and early stopping.

    Training proceeds in rounds. In each round the agents are trained one
    after another ("phases"): only the phase agent's policy is updated while
    all agents act in the environment. One "episode" here is one gradient
    update computed from ``batch_size`` sampled rollouts of at most ``max_t``
    steps. A phase stops after ``n_episodes`` updates or when
    ``check_convergence`` reports that the agent's parameters stopped
    changing. After every round a greedy rollout is rendered via
    ``solution`` and, from round 2 on, each agent's parameters are compared
    with their values at the start of the round; training ends when every
    agent is stable according to ``check_round_convergence`` or after 20
    rounds.

    Args:
        env: environment with ``n_agents``, ``reset()`` and ``step(actions)``.
        policies: one policy per agent exposing ``act(state)`` (returning
            ``(action, log_prob, entropy)``) and ``parameters()``.
        optimizers: one optimizer per agent.
        n_episodes: maximum number of batch updates per agent per round.
        max_t: maximum number of steps per rollout.
        gamma: discount factor for the rewards-to-go.
        batch_size: rollouts per gradient update.
        print_every: progress is printed every ``print_every`` updates.
        entropy_coef: weight of an entropy bonus subtracted from the loss.
            The default ``0.0`` reproduces the previous behaviour, in which
            the entropy term was computed but never applied.

    Returns:
        list with one entry per batch update: the per-agent mean episode
        reward of that batch.

    NOTE(review): the within-round histories (``old_policy_params``,
    ``policy_change_history``) are kept across rounds, so the first check of
    a later round also looks at changes recorded in the previous round -
    confirm this is intended.
    """
    # Convergence parameters
    conv_window = 10            # number of recent updates used for statistics
    policy_change_threshold = 0.001
    reward_change_threshold = 0.001
    min_episodes = 10           # minimum updates before checking convergence
    max_rounds = 20

    scores_deque = deque(maxlen=conv_window)
    scores = []

    # For WITHIN-ROUND convergence tracking
    old_policy_params = [None] * env.n_agents
    policy_change_history = [[] for _ in range(env.n_agents)]
    reward_history = [[] for _ in range(env.n_agents)]

    # For BETWEEN-ROUND convergence tracking
    round_policy_changes = [[] for _ in range(env.n_agents)]

    round_count = 0
    all_agents_stable = False

    while round_count < max_rounds and not all_agents_stable:
        round_count += 1
        print(f"\n{'='*50}")
        print(f"=== Round {round_count} ===")
        print(f"{'='*50}")

        # Snapshot of every policy at the START of this round; it is what the
        # between-round comparison at the end of the round measures against.
        previous_round_policies = [
            [p.data.clone() for p in policies[i].parameters()]
            for i in range(env.n_agents)
        ]

        # Alternate through each agent
        for phase in range(env.n_agents):
            print(f"\n--- Training Agent {phase} ---")

            episode_count = 0
            agent_converged = False  # WITHIN-ROUND convergence flag

            while episode_count < n_episodes and not agent_converged:
                episode_count += 1

                # Store old policy parameters BEFORE the first update
                if old_policy_params[phase] is None:
                    old_policy_params[phase] = [
                        p.data.clone() for p in policies[phase].parameters()
                    ]

                # Accumulators for the current batch
                batch_loss = 0.0
                batch_count = 0
                batch_rewards = []
                batch_entropies = []   # only filled when entropy_coef != 0

                # ---- collect batch of rollouts ----
                for _ in range(batch_size):
                    states = env.reset()

                    phase_log_probs = []
                    rewards = [[] for _ in range(env.n_agents)]

                    # rollout
                    for _step in range(max_t):
                        actions = []
                        for i in range(env.n_agents):
                            a, lp, ent = policies[i].act(states[i])
                            actions.append(a)
                            if i == phase:
                                # Only the phase agent's terms enter the loss.
                                phase_log_probs.append(lp)
                                if entropy_coef:
                                    # reshape(-1) accepts both 0-d and 1-d entropies
                                    batch_entropies.append(ent.reshape(-1))

                        next_states, step_rewards, dones = env.step(actions)
                        for i in range(env.n_agents):
                            rewards[i].append(step_rewards[i])

                        states = next_states
                        if any(dones):
                            break

                    # --- Process current agent's trajectory ---
                    agent_rewards = rewards[phase]
                    if len(agent_rewards) > 0:
                        # rewards-to-go (Monte Carlo return)
                        horizon = len(agent_rewards)
                        discounts = [gamma**k for k in range(horizon + 1)]
                        rewards_to_go = [
                            sum(discounts[j] * agent_rewards[j + start] for j in range(horizon - start))
                            for start in range(horizon)
                        ]
                        rewards_to_go = torch.tensor(rewards_to_go, dtype=torch.float32)

                        # --- policy loss for current agent ---
                        pol_terms = [
                            -lp * G
                            for lp, G in zip(phase_log_probs, rewards_to_go)
                            if isinstance(lp, torch.Tensor)
                        ]

                        if pol_terms:
                            batch_loss += torch.stack(pol_terms).sum()
                            batch_count += 1

                    # logging rewards per rollout
                    batch_rewards.append([sum(r) for r in rewards])

                # Update current agent's policy
                if batch_count > 0:
                    loss = batch_loss / batch_count
                    if entropy_coef and batch_entropies:
                        loss = loss - entropy_coef * torch.cat(batch_entropies).mean()
                    optimizers[phase].zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(policies[phase].parameters(), max_norm=1.0)
                    optimizers[phase].step()

                # Update scores
                avg_batch_rewards = np.mean(batch_rewards, axis=0)
                scores_deque.append(avg_batch_rewards)
                scores.append(avg_batch_rewards)

                # After policy update, check convergence
                if episode_count >= min_episodes:
                    agent_converged = check_convergence(
                        phase, policies, old_policy_params,
                        policy_change_history, reward_history,
                        scores, policy_change_threshold, reward_change_threshold,
                        conv_window
                    )

                    # Update old parameters for next comparison
                    old_policy_params[phase] = [
                        p.data.clone() for p in policies[phase].parameters()
                    ]

                if agent_converged:
                    print(f"✓ Agent {phase} converged after {episode_count} episodes")
                    break

                # ---- print progress ----
                if episode_count % print_every == 0:
                    avg_rewards = np.mean(scores_deque, axis=0) if len(scores_deque) > 0 else [0]*env.n_agents
                    msg = f" Agent{phase} Episode {episode_count}"
                    msg += f" avgR={avg_rewards[phase]:.4f}"
                    print(msg)
                    print(" ")

        # BETWEEN-ROUND convergence check (after all agents complete this round)
        solution(env, scores, policies)

        print(f"\n--- Round {round_count} Policy Comparison ---")

        if round_count > 1:  # comparisons are only recorded from round 2 onwards
            round_stable_agents = []

            for agent_id in range(env.n_agents):
                print(f"\n  Comparing Agent {agent_id} policies (Round {round_count-1} vs Round {round_count}):")

                # Compute policy change between rounds
                current_params = [p.data for p in policies[agent_id].parameters()]
                previous_params = previous_round_policies[agent_id]
                policy_change = compute_parameter_change(previous_params, current_params)
                round_policy_changes[agent_id].append(policy_change)
                print(f"    Policy parameter change: {policy_change:.6f} (threshold: {policy_change_threshold})")

                # Check if this agent has stabilized between rounds
                agent_stable = check_round_convergence(agent_id, round_policy_changes, policy_change_threshold)

                if agent_stable:
                    round_stable_agents.append(agent_id)
                    print(f"  Agent {agent_id} policies stabilized between rounds")
                else:
                    print(f"  Agent {agent_id} policies still changing between rounds")

            # Check if all agents have stabilized between rounds
            if len(round_stable_agents) == env.n_agents:
                print(f"\n ALL AGENTS STABILIZED BETWEEN ROUNDS AFTER ROUND {round_count}!")
                all_agents_stable = True
            else:
                stable_count = len(round_stable_agents)
                print(f"\n {stable_count}/{env.n_agents} agents stabilized between rounds")

        print(f"{'='*50}")

    return scores


def compute_parameter_change(old_params, new_params):
    """Mean relative change ``||new - old|| / (||old|| + 1e-8)`` over tensor pairs.

    Pairs whose shapes differ are skipped. Returns 0.0 when no pair could be
    compared.
    """
    relative_changes = [
        (torch.norm(after - before) / (torch.norm(before) + 1e-8)).item()
        for before, after in zip(old_params, new_params)
        if before.shape == after.shape
    ]

    if not relative_changes:
        return 0.0
    return sum(relative_changes) / len(relative_changes)

In [29]:
def reinforce_multi_rwd2go_alt_2(
    env, policies, optimizers,
    n_episodes=1000,
    max_t=20,                # max steps per rollout
    gamma=0.9,
    batch_size=1000,         # rollouts per batch update
    print_every=1,           # print every X batch updates
    max_rounds=20,           # number of alternating rounds
    window=10,               # batch updates averaged in the progress print
):
    """Alternating multi-agent REINFORCE with rewards-to-go, without early stopping.

    For ``max_rounds`` rounds the agents are trained one after another; while
    an agent is in its phase only its policy is updated. Each of the
    ``n_episodes`` updates per phase uses ``batch_size`` sampled rollouts of
    at most ``max_t`` steps.

    This function previously read ``window``, ``max_rounds``, ``value_logs``
    and ``agent_converged`` from names that are not defined in it, so it
    failed with ``NameError`` on a fresh kernel. ``window`` and
    ``max_rounds`` are now keyword arguments, and the value-function /
    convergence print was removed because this function neither trains a
    value function nor tracks convergence.

    Args:
        env: environment with ``n_agents``, ``reset()`` and ``step(actions)``.
        policies: one policy per agent exposing ``act(state)`` (returning
            ``(action, log_prob, entropy)``) and ``parameters()``.
        optimizers: one optimizer per agent.
        n_episodes: batch updates per agent per round.
        max_t: maximum number of steps per rollout.
        gamma: discount factor for the rewards-to-go.
        batch_size: rollouts per gradient update.
        print_every: progress is printed every ``print_every`` updates.
        max_rounds: number of rounds over all agents.
        window: number of recent batch updates averaged for the progress print.

    Returns:
        list with one entry per batch update: the per-agent mean episode
        reward of that batch.
    """
    scores_deque = deque(maxlen=window)
    scores = []

    for round_count in range(1, max_rounds + 1):
        print(f"\n{'='*50}")
        print(f"=== Round {round_count} ===")
        print(f"{'='*50}")

        # Alternate through each agent
        for phase in range(env.n_agents):
            print(f"\n--- Training Agent {phase} ---")

            for episode_count in range(1, n_episodes + 1):
                # Accumulators for the current batch
                batch_loss = 0.0
                batch_count = 0
                batch_rewards = []

                # ---- collect batch of rollouts ----
                for _ in range(batch_size):
                    states = env.reset()

                    phase_log_probs = []
                    rewards = [[] for _ in range(env.n_agents)]

                    # rollout
                    for _step in range(max_t):
                        actions = []
                        for i in range(env.n_agents):
                            a, lp, _ = policies[i].act(states[i])
                            actions.append(a)
                            if i == phase:
                                # Only the phase agent's terms enter the loss.
                                phase_log_probs.append(lp)

                        next_states, step_rewards, dones = env.step(actions)
                        for i in range(env.n_agents):
                            rewards[i].append(step_rewards[i])

                        states = next_states
                        if any(dones):
                            break

                    # --- Process current agent's trajectory ---
                    agent_rewards = rewards[phase]
                    if len(agent_rewards) > 0:
                        # rewards-to-go (Monte Carlo return)
                        horizon = len(agent_rewards)
                        discounts = [gamma**k for k in range(horizon + 1)]
                        rewards_to_go = [
                            sum(discounts[j] * agent_rewards[j + start] for j in range(horizon - start))
                            for start in range(horizon)
                        ]
                        rewards_to_go = torch.tensor(rewards_to_go, dtype=torch.float32)

                        # --- policy loss for current agent ---
                        pol_terms = [
                            -lp * G
                            for lp, G in zip(phase_log_probs, rewards_to_go)
                            if isinstance(lp, torch.Tensor)
                        ]

                        if pol_terms:
                            batch_loss += torch.stack(pol_terms).sum()
                            batch_count += 1

                    # logging rewards per rollout
                    batch_rewards.append([sum(r) for r in rewards])

                # Update current agent's policy
                if batch_count > 0:
                    loss = batch_loss / batch_count
                    optimizers[phase].zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(policies[phase].parameters(), max_norm=1.0)
                    optimizers[phase].step()

                # Update scores
                avg_batch_rewards = np.mean(batch_rewards, axis=0)
                scores_deque.append(avg_batch_rewards)
                scores.append(avg_batch_rewards)

                # ---- print progress ----
                if episode_count % print_every == 0:
                    avg_rewards = np.mean(scores_deque, axis=0) if len(scores_deque) > 0 else [0]*env.n_agents
                    msg = f" Agent{phase} Episode {episode_count}"
                    msg += f" avgR={avg_rewards[phase]:.4f}"
                    print(msg)
                    print(" ")

    return scores

In [None]:
# Initialize env with 2 agents
#
# Experiment log: each commented line below is a maze seed that was tried,
# followed by hand-written result notes.
# NOTE(review): the note columns look like "(variant) learning-rate  #episodes  converged?"
# - confirm with the author before relying on them.

#env = random_maze_env(size=(5,5), n_agents=2, n_walls=6, seed=554669014)   (1) 0.001 15k yes  | (2) 0.01 20k yes  30k yes  | (2) 0.005 20k yes  30k yes | (3) 0.001 15k yes
#env = random_maze_env(size=(5,5), n_agents=2, n_walls=6, seed=403475465)   (1) 0.001 15k no   | (2) 0.01 20k yes  30k yes  | (2) 0.005 20k yes  30k yes | (3) 0.0001 15k no 
#env = random_maze_env(size=(5,5), n_agents=2, n_walls=6, seed=972745715)   (1) 0.001 15k yes  | (2) 0.01 20k no  30k yes   | (2) 0.005 20k no  30k yes  | (3) 0.0001 15k no
#env = random_maze_env(size=(5,5), n_agents=2, n_walls=6, seed=302632242)   (1) -     15k no   | (2) 0.01 20k yes  30k no   | (2) 0.005 20k yes  30k no  | (3) 0.0001 15k no
#env = random_maze_env(size=(5,5), n_agents=2, n_walls=6, seed=219122341)   (1) 0.001 15k yes  | - | (2) 0.005 20k yes  30k yes
#env = random_maze_env(size=(5,5), n_agents=2, n_walls=6, seed=871220494)   (1) 0.001 15k no   | - | -
#env = random_maze_env(size=(5,5), n_agents=2, n_walls=6,seed=47399376)     (1) 0.001 15k yes  | -  | -
#env = random_maze_env(size=(5,5), n_agents=2, n_walls=6, seed=83129399)    (1) 0.001 15k yes  | -  | (2) 0.005 20k yes  30k yes
#env = random_maze_env(size=(5,5), n_agents=2, n_walls=6, seed=938621412)   (1) 0.001 15k yes  | -  | -
#env = random_maze_env(size=(5,5), n_agents=2, n_walls=6, seed=86246299)    (1) 0.001 15k yes  | -  | -
#env = random_maze_env(size=(5,5), n_agents=2, n_walls=6, seed=163628333)
ENV_SEED = 584698030

# random_maze_env only seeds Python's `random` (maze layout). The move order in
# env.step uses numpy's global RNG and network init / action sampling use
# torch's, so seed those too to make Restart & Run All reproducible.
np.random.seed(ENV_SEED)
torch.manual_seed(ENV_SEED)

env = random_maze_env(size=(5,5), n_agents=2, n_walls=6, seed=ENV_SEED)
env.render()

# Two policies + optimizers
policies = [PolicyNet(state_size=2) for _ in range(env.n_agents)]
optimizers = [optim.Adam(p.parameters(), lr=0.01) for p in policies]

scores = reinforce_multi_rwd2go_alt(env, policies, optimizers)
scores = np.array(scores)

# Two policies + optimizers
#policies = [LinearPolicy(state_size=2) for _ in range(env.n_agents)]
#optimizers = [optim.Adam(p.parameters(), lr=0.0001) for p in policies]

#scores = reinforce_multi_rwd2go_seq_stop(env, policies, optimizers)
#scores = np.array(scores)
#solution1(env, scores, policies) 

# Two policies + optimizers
#policies = [LinearPolicy(state_size=2) for _ in range(env.n_agents)]
#optimizers = [optim.Adam(p.parameters(), lr=0.0001) for p in policies]

#scores = reinforce_multi_rwd2go_seq(env, policies, optimizers,  n_episodes=15000)
#scores = np.array(scores)
#solution1(env, scores, policies) 



1   [92m0[0m    
  #     #
    #    
[94m1[0m     # #
  # 0    
----------

=== Round 1 ===

--- Training Agent 0 ---
 Agent0 Episode 10 avgR=-0.2530
 
 Agent0 Episode 20 avgR=0.1278
 
 Agent0 Episode 30 avgR=0.5158
 
 Agent0 Episode 40 avgR=0.5970
 
 Agent0 Episode 50 avgR=0.6342
 
 Agent0 Episode 60 avgR=0.6558
 
 Agent0 Episode 70 avgR=0.6620
 
 Agent0 Episode 80 avgR=0.6758
 
 Agent0 Episode 90 avgR=0.6932
 
 Agent0 Episode 100 avgR=0.6968
 

--- Training Agent 1 ---
 Agent1 Episode 10 avgR=-0.1244
 
 Agent1 Episode 20 avgR=0.8866
 
 Agent1 Episode 30 avgR=0.9890
 
 Agent1 Episode 40 avgR=0.9990
 
 Agent1 Episode 50 avgR=0.9988
 
 Agent1 Episode 60 avgR=0.9998
 
  âœ“ Policy parameters stabilized (change < 0.001)
  ðŸŽ¯ AGENT 1 CONVERGED! ðŸŽ¯
  Policy criteria satisfied
âœ“ Agent 1 converged after 68 episodes
1   [92m0[0m    
  #     #
    #    
[94m1[0m     # #
  # 0    
----------
    [92m0[0m    
1 #     #
    #    
[94m1[0m   0 # #
  #      
----------
    [92m0[