# PPO Tests Across NS-Gym Environments

This notebook is a test bed for running PPO on multiple non-stationary (NS-Gym) environments.

Tested Environments:
- FrozenLake-v1
- CliffWalking-v1
- CartPole-v1
- Acrobot-v1
- MountainCar-v0
- MountainCarContinuous-v0
- Pendulum-v1


In [1]:
import os
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt

from ns_gym.schedulers import ContinuousScheduler
from ns_gym.update_functions import IncrementUpdate, DistributionDecrementUpdate
from ns_gym.wrappers import NSClassicControlWrapper, NSFrozenLakeWrapper, NSCliffWalkingWrapper
from ns_gym.benchmark_algorithms import PPO, PPOActor, PPOCritic


In [2]:
# Runtime knobs: per-environment episode budget and the directory that
# the weights are saved to (created up front if it does not exist yet).
MAX_EPISODES_PER_ENV = 30
WEIGHTS_DIR = "./ppo_test_weights"
os.makedirs(WEIGHTS_DIR, exist_ok=True)

# Settings shared by every environment trained in this notebook.
COMMON_CONFIG = dict(
    batch_size=2048,
    minibatch_size=64,
    n_epochs=10,
    hidden_size=64,
    gamma=0.99,
    lamb=0.95,
    device="cpu",
    lr_policy=3e-4,
    lr_critic=4e-4,
    max_grad_norm=0.5,
    clip_val=0.2,
    ent_weight=0.0,
)

print(f"Saving weights to: {WEIGHTS_DIR}")
print(f"Episodes per environment: {MAX_EPISODES_PER_ENV}")


Saving weights to: ./ppo_test_weights
Episodes per environment: 30


In [3]:
def make_frozenlake_env(render_mode=None):
    """Create a non-slippery FrozenLake-v1 env wrapped in ``NSFrozenLakeWrapper``.

    Args:
        render_mode: Forwarded to ``gym.make`` only when it is not ``None``.

    Returns:
        The wrapped environment, with a ``DistributionDecrementUpdate``
        (``k=0.01``, driven by a ``ContinuousScheduler``) registered under
        the ``"P"`` key and both change notifications switched on.
    """
    gym_kwargs = dict(is_slippery=False, max_episode_steps=100)
    if render_mode is not None:
        gym_kwargs["render_mode"] = render_mode
    base_env = gym.make("FrozenLake-v1", **gym_kwargs)

    # NOTE(review): "P" presumably refers to the transition distribution,
    # and the 3-element initial_prob_dist to its starting value -- confirm
    # against the ns_gym wrapper documentation.
    p_update = DistributionDecrementUpdate(
        scheduler=ContinuousScheduler(),
        k=0.01,
    )
    return NSFrozenLakeWrapper(
        base_env,
        {"P": p_update},
        change_notification=True,
        delta_change_notification=True,
        initial_prob_dist=[1, 0, 0],
    )


def make_cliffwalking_env():
    """Create a CliffWalking-v1 env wrapped in ``NSCliffWalkingWrapper``.

    Returns:
        The wrapped environment (episodes capped at 200 steps), with a
        ``DistributionDecrementUpdate`` (``k=0.01``, driven by a
        ``ContinuousScheduler``) registered under the ``"P"`` key and both
        change notifications switched on.
    """
    base_env = gym.make("CliffWalking-v1", max_episode_steps=200)

    # NOTE(review): the 4-element initial_prob_dist presumably has one entry
    # per possible movement outcome -- confirm against the ns_gym wrapper.
    p_update = DistributionDecrementUpdate(
        scheduler=ContinuousScheduler(),
        k=0.01,
    )
    return NSCliffWalkingWrapper(
        base_env,
        {"P": p_update},
        change_notification=True,
        delta_change_notification=True,
        initial_prob_dist=[1, 0, 0, 0],
    )


def make_classic_control_env(env_id, tunable_param, k):
    """Create a Gymnasium env wrapped in ``NSClassicControlWrapper``.

    Args:
        env_id: Environment id handed to ``gym.make``.
        tunable_param: Name of the parameter the update function is
            registered under in the wrapper's update mapping.
        k: Passed as ``k`` to ``IncrementUpdate``.

    Returns:
        The wrapped environment with both change notifications switched on.
    """
    base_env = gym.make(env_id)
    param_update = IncrementUpdate(scheduler=ContinuousScheduler(), k=k)
    tunable_params = {tunable_param: param_update}
    return NSClassicControlWrapper(
        base_env,
        tunable_params,
        change_notification=True,
        delta_change_notification=True,
    )


In [4]:
# One entry per environment under test:
#   name      -- label used in logs and results
#   make_env  -- zero-argument factory returning the wrapped environment
#   max_steps -- step limit placed into the training config
ENV_SPECS = [
    dict(
        name="FrozenLake-v1",
        make_env=make_frozenlake_env,
        max_steps=100,
    ),
    dict(
        name="CliffWalking-v1",
        make_env=make_cliffwalking_env,
        max_steps=200,
    ),
    dict(
        name="CartPole-v1",
        make_env=lambda: make_classic_control_env("CartPole-v1", "masspole", 1e-3),
        max_steps=500,
    ),
    dict(
        name="Acrobot-v1",
        make_env=lambda: make_classic_control_env("Acrobot-v1", "LINK_MASS_2", 1e-3),
        max_steps=500,
    ),
    dict(
        name="MountainCar-v0",
        make_env=lambda: make_classic_control_env("MountainCar-v0", "force", 5e-5),
        max_steps=200,
    ),
    dict(
        name="MountainCarContinuous-v0",
        make_env=lambda: make_classic_control_env("MountainCarContinuous-v0", "power", 5e-5),
        max_steps=999,
    ),
    dict(
        name="Pendulum-v1",
        make_env=lambda: make_classic_control_env("Pendulum-v1", "m", 1e-3),
        max_steps=200,
    ),
]

# Notebook display: list the configured environment names.
[entry["name"] for entry in ENV_SPECS]


['FrozenLake-v1',
 'CliffWalking-v1',
 'CartPole-v1',
 'Acrobot-v1',
 'MountainCar-v0',
 'MountainCarContinuous-v0',
 'Pendulum-v1']

In [5]:
def infer_state_dim_from_obs(obs):
    """Return the flattened length of an observation.

    Args:
        obs: A raw observation, or a dict carrying the observation under
            the ``"state"`` key (in which case only that entry is used).

    Returns:
        Number of scalar entries once the observation is converted to a
        float32 array; a scalar observation counts as 1.
    """
    # Dict observations: only the "state" entry is measured.
    state = obs["state"] if isinstance(obs, dict) and "state" in obs else obs

    # ``size`` is 1 for a 0-d array and the flattened length otherwise.
    return int(np.asarray(state, dtype=np.float32).size)


def infer_state_dim_from_space(observation_space):
    """Return the flattened state dimension described by an observation space.

    Used as a fallback when the dimension cannot be read off a sample
    observation.

    Args:
        observation_space: A Gymnasium space. For a ``Dict`` space that has
            a ``"state"`` entry, the result is computed from that sub-space.

    Returns:
        Product of the space's ``shape`` entries, or 1 when the space has
        no ``shape``, a ``None`` shape, or an empty shape.
    """
    is_dict_space = isinstance(observation_space, gym.spaces.Dict)
    if is_dict_space and "state" in observation_space.spaces:
        return infer_state_dim_from_space(observation_space.spaces["state"])

    shape = getattr(observation_space, "shape", None)
    if shape is None or len(shape) == 0:
        # No usable shape information: count the space as one-dimensional.
        return 1
    return int(np.prod(shape))


def infer_action_dim_and_type(action_space):
    """Return ``(action_dim, is_discrete)`` for an action space.

    A space exposing an ``n`` attribute is reported as discrete with
    ``n`` actions; any other space is reported as non-discrete with a
    dimension equal to the product of its ``shape``.
    """
    try:
        n_actions = action_space.n
    except AttributeError:
        # No ``n`` attribute: fall back to the flattened shape.
        return int(np.prod(action_space.shape)), False
    return int(n_actions), True


def train_single_env(spec, max_episodes=MAX_EPISODES_PER_ENV):
    """Train a fresh PPO agent on one environment and summarise the run.

    Args:
        spec: Mapping with the keys ``"name"`` (environment label),
            ``"make_env"`` (zero-argument callable returning the
            environment) and ``"max_steps"`` (placed into the training
            config).
        max_episodes: Value placed into the training config under
            ``"max_episodes"``.

    Returns:
        Dict with the keys ``"env"``, ``"is_discrete_action"``, ``"s_dim"``,
        ``"a_dim"`` and ``"best_reward"`` (the value returned by
        ``agent.train_ppo``, converted to ``float``).

    Raises:
        Any exception raised while resetting the environment, building the
        networks, or training is propagated after the environment has been
        closed.
    """
    env = spec["make_env"]()

    # The environment is closed in ``finally`` so that a failure during
    # setup or training does not leave it open when the caller catches the
    # exception and carries on with the next environment.
    try:
        # Prefer runtime observation-based inference to avoid Dict-space
        # mismatches.
        sample_obs, _ = env.reset(seed=0)
        s_dim = infer_state_dim_from_obs(sample_obs)
        if s_dim <= 0:
            s_dim = infer_state_dim_from_space(env.observation_space)

        a_dim, is_discrete = infer_action_dim_and_type(env.action_space)

        actor = PPOActor(
            s_dim=s_dim,
            a_dim=a_dim,
            hidden_size=COMMON_CONFIG["hidden_size"],
            is_discrete=is_discrete,
        )
        critic = PPOCritic(s_dim=s_dim, hidden_size=COMMON_CONFIG["hidden_size"])
        agent = PPO(
            actor,
            critic,
            lr_policy=COMMON_CONFIG["lr_policy"],
            lr_critic=COMMON_CONFIG["lr_critic"],
            clip_val=COMMON_CONFIG["clip_val"],
        )

        # Per-run config: the shared settings plus environment-specific
        # entries. "/" is replaced in the name, presumably so it is safe to
        # use in a file name under save_path -- confirm against train_ppo.
        config = dict(COMMON_CONFIG)
        config.update(
            {
                "env_name": spec["name"].replace("/", "_"),
                "max_episodes": max_episodes,
                "max_steps": spec["max_steps"],
                "save_path": WEIGHTS_DIR + "/",
                "s_dim": s_dim,
                "a_dim": a_dim,
            }
        )

        best_reward = agent.train_ppo(env, config)
    finally:
        env.close()

    return {
        "env": spec["name"],
        "is_discrete_action": is_discrete,
        "s_dim": s_dim,
        "a_dim": a_dim,
        "best_reward": float(best_reward),
    }



In [6]:
# Train every configured environment in turn. A failure in one environment
# is recorded as a "failed" row (same keys as a successful row) so that the
# remaining environments still run.
results = []

for spec in ENV_SPECS:
    env_name = spec["name"]
    print(f"\n===== Training PPO on {env_name} =====")

    try:
        out = train_single_env(spec, max_episodes=MAX_EPISODES_PER_ENV)
        out.update(status="ok", error="")
        results.append(out)
    except Exception as exc:
        results.append(
            dict(
                env=env_name,
                is_discrete_action=None,
                s_dim=None,
                a_dim=None,
                best_reward=None,
                status="failed",
                error=f"{type(exc).__name__}: {exc}",
            )
        )
        # Echo the error text that was just recorded.
        print(f"FAILED: {results[-1]['error']}")

results



===== Training PPO on FrozenLake-v1 =====
[Episode    0] reward = 0.0, mean_100 = 0.0, pg_loss = -0.098, v_loss = 0.039
[Episode    1] reward = 0.0, mean_100 = 0.0, pg_loss = -0.024, v_loss = 0.001
[Episode    2] reward = 0.0, mean_100 = 0.0, pg_loss = -0.052, v_loss = 0.001
[Episode    3] reward = 0.0, mean_100 = 0.0, pg_loss = -0.017, v_loss = 0.001
[Episode    4] reward = 0.0, mean_100 = 0.0, pg_loss = -0.066, v_loss = 0.001
[Episode    5] reward = 0.0, mean_100 = 0.0, pg_loss = -0.059, v_loss = 0.000
[Episode    6] reward = 0.0, mean_100 = 0.0, pg_loss = -0.007, v_loss = 0.000
[Episode    7] reward = 0.0, mean_100 = 0.0, pg_loss = -0.023, v_loss = 0.000
[Episode    8] reward = 0.0, mean_100 = 0.0, pg_loss = -0.019, v_loss = 0.000
[Episode    9] reward = 0.0, mean_100 = 0.0, pg_loss = -0.036, v_loss = 0.000
[Episode   10] reward = 0.0, mean_100 = 0.0, pg_loss = -0.042, v_loss = 0.000
[Episode   11] reward = 0.0, mean_100 = 0.0, pg_loss = -0.076, v_loss = 0.000
[Episode   12] reward

[{'env': 'FrozenLake-v1',
  'is_discrete_action': True,
  's_dim': 1,
  'a_dim': 4,
  'best_reward': 0.0,
  'status': 'ok',
  'error': ''},
 {'env': 'CliffWalking-v1',
  'is_discrete_action': True,
  's_dim': 1,
  'a_dim': 4,
  'best_reward': -1190.0,
  'status': 'ok',
  'error': ''},
 {'env': 'CartPole-v1',
  'is_discrete_action': True,
  's_dim': 4,
  'a_dim': 2,
  'best_reward': 30.0,
  'status': 'ok',
  'error': ''},
 {'env': 'Acrobot-v1',
  'is_discrete_action': True,
  's_dim': 6,
  'a_dim': 3,
  'best_reward': -500.0,
  'status': 'ok',
  'error': ''},
 {'env': 'MountainCar-v0',
  'is_discrete_action': True,
  's_dim': 2,
  'a_dim': 3,
  'best_reward': -157.39999389648438,
  'status': 'ok',
  'error': ''},
 {'env': 'MountainCarContinuous-v0',
  'is_discrete_action': False,
  's_dim': 2,
  'a_dim': 1,
  'best_reward': 82.12165832519531,
  'status': 'ok',
  'error': ''},
 {'env': 'Pendulum-v1',
  'is_discrete_action': False,
  's_dim': 3,
  'a_dim': 1,
  'best_reward': -1243.018920

In [7]:
def print_results_table(rows):
    """Print the result rows as a fixed-width text table.

    Args:
        rows: Iterable of dicts with the keys ``"env"``,
            ``"is_discrete_action"``, ``"best_reward"`` and ``"status"``.
            ``None`` in ``"is_discrete_action"`` or ``"best_reward"`` is
            rendered as ``"-"``.
    """
    header = f"{'Environment':<28} {'ActionType':<12} {'BestReward':<14} {'Status':<8}"
    print(header)
    print('-' * len(header))

    for row in rows:
        discrete = row["is_discrete_action"]
        if discrete is None:
            action_type = "-"
        elif discrete:
            action_type = "discrete"
        else:
            action_type = "continuous"

        best = row["best_reward"]
        reward = "-" if best is None else f"{best:.3f}"

        print(f"{row['env']:<28} {action_type:<12} {reward:<14} {row['status']:<8}")


# Summary table first, then the error text of every run that did not
# finish with status "ok".
print_results_table(results)

failed = list(filter(lambda entry: entry['status'] != 'ok', results))
if failed:
    print('\nFailures:')
for row in failed:
    print(f"- {row['env']}: {row['error']}")


Environment                  ActionType   BestReward     Status  
-----------------------------------------------------------------
FrozenLake-v1                discrete     0.000          ok      
CliffWalking-v1              discrete     -1190.000      ok      
CartPole-v1                  discrete     30.000         ok      
Acrobot-v1                   discrete     -500.000       ok      
MountainCar-v0               discrete     -157.400       ok      
MountainCarContinuous-v0     continuous   82.122         ok      
Pendulum-v1                  continuous   -1243.019      ok      


Some experiments failed miserably, while others did somewhat well. This is because PPO does not account for non-stationarity: changes in the environment invalidate what the algorithm has previously learned.

FrozenLake in particular did exceptionally badly. Here is GPT's explanation for this:
1. The FrozenLake reward is extremely sparse (a reward of 1 is given only when the agent reaches the goal).
2. Non-stationarity is too strong.
3. PPO does not use non-stationarity signals