In [1]:
# Standard libraries
import time
import multiprocessing
import numpy as np
import argparse

# PyTorch libraries
import torch
import torch.nn as nn

# Third-party libraries
from tqdm import tqdm, trange
import wandb
import gymnasium as gym
from gymnasium import spaces

# Stable baseline imports
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.callbacks import BaseCallback

# Local imports
from config import Config

# Overcooked AI imports
from overcooked_ai_py.mdp.actions import Action
from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv
from overcooked_ai_py.agents.agent import Agent, AgentPair

# Helper function imports
from ppo_helper import get_observation

# Constants
MAX_WIDTH = 5  # 9
MAX_HEIGHT = 4  # 5
NUM_AGENTS = 2
INPUT_CHANNELS = 26
ACTION_SPACE_SIZE = 6

2025-03-11 22:25:00.289640: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class OvercookedSBGym(gym.Env):
    """
    Wrapper for the Overcooked environment to be compatible with Stable Baselines3.
    This version supports multi-agent (AgentPair) interactions.
    """

    def __init__(self):
        super(OvercookedSBGym, self).__init__()

        self.config = Config()

        mdp = OvercookedGridworld.from_layout_name(layout_name=self.config.layout)
        env = OvercookedEnv.from_mdp(mdp, horizon=self.config.horizon)

        self.base_env = env
        self.featurize_fn = get_observation

        self.observation_space = self._get_observation_space()
        self.action_space = self._get_action_space()

        self.reward_shaping_scaler = 1.0

        self.reset()

    def set_reward_shaping_scaler(self, reward_shaping_scaler):
        self.reward_shaping_scaler = reward_shaping_scaler

    def _get_observation_space(self):
        """
        Define the observation space for joint observations (both agents).
        """

        dummy_mdp = self.base_env.mdp 
        dummy_state = dummy_mdp.get_standard_start_state()
        obs_shape = self.featurize_fn(self.base_env, dummy_state)[0].shape
        high = np.ones(obs_shape, dtype=np.float32) * float("inf")
        low = np.zeros(obs_shape, dtype=np.float32)
        return spaces.Box(low, high, dtype=np.float32)

        # return spaces.Box(
        #     low=0,
        #     high=1,
        #     shape=(NUM_AGENTS * INPUT_CHANNELS, MAX_WIDTH, MAX_HEIGHT),
        #     dtype=np.float32,
        # )

    def _get_action_space(self):
        """
        Define the action space for joint actions (both agents).
        """
        return spaces.Discrete(ACTION_SPACE_SIZE)
    
    def step(self, action):
        """
        Execute one time step within the environment.
        Returns joint observations, rewards, and done flags.

        Assuming action is a MultiDiscrete tuple of (player0_action, player1_action).
        """
        # Convert joint action to individual actions
        agent0_action = Action.INDEX_TO_ACTION[action]
        agent1_action = Action.INDEX_TO_ACTION[4]

        
        joint_action = (agent0_action, agent1_action)

        # Step the environment
        next_state, reward, done, env_info = self.base_env.step(joint_action, display_phi=True)
        if reward > 0:
            print(reward, joint_action, next_state)

        shaped_reward = reward + self.reward_shaping_scaler * env_info["shaped_r_by_agent"][0]

        # shaped_reward = reward + env_info['phi_s_prime'] - env_info['phi_s']
        obs_agent0, obs_agent1 = self.featurize_fn(self.base_env, next_state)

        # both_agents_ob = (obs_agent0, obs_agent1)
        # obs = {
        #     "both_agent_obs": both_agents_ob,
        #     "overcooked_state": next_state,
        #     "other_agent_env_idx": 1 - self.agent_idx,
        # }

        # for now, lets just use agent0 observation
        obs = obs_agent0
        # print("OBS SHAPE", obs.shape)

        truncated = False

    
        return obs, shaped_reward, done, truncated, env_info
    
    def reset(self, seed=None, options=None):
        """
        When training on individual maps, we want to randomize which agent is assigned to which
        starting location, in order to make sure that the agents are trained to be able to
        complete the task starting at either of the hardcoded positions.

        NOTE: a nicer way to do this would be to just randomize starting positions, and not
        have to deal with randomizing indices.
        """
        self.base_env.reset(regen_mdp=False)
        self.mdp = self.base_env.mdp
        # self.agent_idx = np.random.choice([0, 1])
        ob_p0, ob_p1 = self.featurize_fn(self.base_env, self.base_env.state)

        # if self.agent_idx == 0:
        #     both_agents_ob = np.concatenate([ob_p0, ob_p1], axis=0).astype(np.float32)
        #     # both_agents_ob = (ob_p0, ob_p1)
        # else:
        #     both_agents_ob = np.concatenate([ob_p1, ob_p0], axis=0).astype(np.float32)
        #     # both_agents_ob = (ob_p1, ob_p0)
            
        # return {
        #     "both_agent_obs": both_agents_ob,
        #     "overcooked_state": self.base_env.state,
        #     "other_agent_env_idx": 1 - self.agent_idx,
        # }

        # for now, just use agent0_obs for everyting
        
        infos = {}
        return ob_p0, infos #, {"overcooked_state": self.base_env.state, "other_agent_env_idx": 1 - self.agent_idx}


class CustomNetwork(BaseFeaturesExtractor):
    """
    Custom neural network for processing joint observations and outputting actions for both agents.
    """

    def __init__(self, observation_space, features_dim):
        super(CustomNetwork, self).__init__(observation_space, features_dim)
        self.network = nn.Sequential(
            nn.Conv2d(in_channels=INPUT_CHANNELS, out_channels=25, kernel_size=5, padding="same"),
            nn.LeakyReLU(),
            nn.Conv2d(in_channels=25, out_channels=25, kernel_size=3, padding="valid"),
            nn.LeakyReLU(),
            nn.Conv2d(in_channels=25, out_channels=25, kernel_size=3, padding="same"),
            nn.LeakyReLU(),
            nn.Flatten(),
            nn.Linear(150, 32),
            nn.LeakyReLU(),
            nn.Linear(32, 32),
            nn.LeakyReLU(),
            nn.Linear(32, features_dim),
            nn.LeakyReLU(),
        )

    def forward(self, observations):
        return self.network(observations)


class CustomCallback(BaseCallback):
    def __init__(self, initial_entropy_coeff, final_entropy_coeff, entropy_horizon, reward_shaping_horizon, verbose: int = 0):
        super(CustomCallback, self).__init__(verbose)
        self.initial_entropy_coeff = initial_entropy_coeff
        self.final_entropy_coeff = final_entropy_coeff
        self.entropy_horizon = entropy_horizon
        self.reward_shaping_horizon = reward_shaping_horizon
    
    def _on_step(self) -> bool:
        timestep = self.num_timesteps
        entropy_fraction = min(1, timestep / self.entropy_horizon)
        entropy_coeff = self.initial_entropy_coeff + entropy_fraction * (self.final_entropy_coeff - self.initial_entropy_coeff)
        self.model.ent_coef = entropy_coeff

        reward_shaping_fraction = min(1, timestep / self.reward_shaping_horizon)
        reward_shaping_coeff = 1 - reward_shaping_fraction 

        self.training_env.env_method("set_reward_shaping_scaler", reward_shaping_coeff)

        if timestep % 12000 == 0:
            self.logger.log(f"Timestep: {timestep}, Entropy Coefficient: {entropy_coeff}, Reward Shaping Coefficient: {reward_shaping_coeff}")

        return True

In [3]:
config=Config()      

# Initialize environment
env = OvercookedSBGym()

# Create a vectorized environment
vec_env = make_vec_env(OvercookedSBGym, n_envs=config.num_episodes, vec_env_cls=SubprocVecEnv) #, vec_env_kwargs=dict(start_method="fork"))

custom_callback = CustomCallback(config.entropy_coeff_start, config.entropy_coeff_end, config.entropy_coeff_horizon, config.reward_shaping_horizon)

# Initialize PPO agent with a custom network
policy_kwargs = dict(
    features_extractor_class=CustomNetwork,
    features_extractor_kwargs=dict(features_dim=32),
    net_arch=dict(pi=[], vf=[]),
    share_features_extractor=True,
)
agent = PPO(
    "CnnPolicy",
    # env,
    vec_env,
    verbose=1,
    device="cpu",
    policy_kwargs=policy_kwargs,
    learning_rate=config.learning_rate,
    n_steps=config.horizon,
    batch_size=(config.horizon * config.num_episodes) // config.num_mini_batches,
    n_epochs=config.num_epochs,
    gamma=config.gae_gamma,
    gae_lambda=config.gae_lambda,
    clip_range=config.clip_param,
    ent_coef=config.entropy_coeff_start,
    max_grad_norm=config.max_grad_norm,
    vf_coef=config.vf_loss_coeff,
)

2025-03-11 22:25:11.219009: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-03-11 22:25:16.005352: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-03-11 22:25:16.134172: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-03-11 22:25:16.176410: I tensorflow/core/pla

Using cpu device


  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/

In [6]:
agent._setup_learn(30*400, custom_callback, True, "PPO", False)

(12000, <__main__.CustomCallback at 0x7f866f1c2790>)

In [7]:
agent.collect_rollouts(n_episodes=30*400)

TypeError: collect_rollouts() got an unexpected keyword argument 'n_episodes'

In [8]:
agent.rollout_buffer.observations.shape

(400, 30, 26, 5, 4)

In [9]:
rewards = agent.rollout_buffer.rewards.reshape((config.num_episodes, config.horizon))
values = agent.rollout_buffer.values.reshape((config.num_episodes, config.horizon))



dones = np.zeros_like(rewards)
dones[:, -1] = 1

In [10]:
rewards = torch.tensor(rewards)
values = torch.tensor(values)   
dones = torch.tensor(dones)


In [11]:
agent.rollout_buffer.buffer_size

400

In [12]:
last_values = last_values.clone().cpu().numpy().flatten()  # type: ignore[assignment]

last_gae_lam = 0
for step in reversed(range(self.buffer_size)):
    if step == self.buffer_size - 1:
        next_non_terminal = 1.0 - dones.astype(np.float32)
        next_values = last_values
    else:
        next_non_terminal = 1.0 - self.episode_starts[step + 1]
        next_values = self.values[step + 1]
    delta = self.rewards[step] + self.gamma * next_values * next_non_terminal - self.values[step]
    last_gae_lam = delta + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam
    self.advantages[step] = last_gae_lam
# TD(lambda) estimator, see Github PR #375 or "Telescoping in TD(lambda)"
# in David Silver Lecture 4: https://www.youtube.com/watch?v=PnHCvfgC_ZA
self.returns = self.advantages + self.values

NameError: name 'last_values' is not defined

In [34]:
def compute_GAE(rewards, values, dones):
    """
    Computes the generalized advantage estimator (GAE), a weighted sum of future TD errors. 
    GAE reduces the variance of policy gradient estimates to improve efficiency and stability of training. 
    
    Advantage Function: A(s_t, a_t) = Q(s_t, a_t) - V(s_t), measures how much better or worse action a is compared to the expected value of the state.

    Returns advantages, which contains the GAE for each timestep in the trajectory.
    """
    # Number of timesteps
    total_time_steps = len(dones)

    # Advantages, initialized to 0
    advantages = torch.zeros_like(rewards)

    # The next GAE value to propagate back 
    next_gae = torch.zeros_like(rewards[-1])
    
    for ts in reversed(range(total_time_steps)):
        if ts == total_time_steps - 1:
            # For the last timestep, the bootstrap value should be zero only if the episode terminates
            bootstrap_value = values[ts] * (1 - dones[ts])
            delta = rewards[ts] + config.gae_gamma * bootstrap_value - values[ts]
        else:
            # Calculate TD error for each timestep 
            delta = rewards[ts] + config.gae_gamma * values[ts + 1] * (1 - dones[ts]) - values[ts]
        
        # Update the advantage for the current timestep using the recursive formula
        advantages[ts] = delta + config.gae_gamma * config.gae_lambda * next_gae * (1 - dones[ts])

        # Update next_gae for the next iteration
        next_gae = advantages[ts]
    
    return advantages