In [None]:

# This is our Wrapper
import ns_gym as nsg

# Of course we need to import gymnasium
import gymnasium as gym

# Other useful packages for analysis and visualization
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time
from ns_gym.utils import type_mismatch_checker
import copy

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

%matplotlib inline

In [None]:
from ns_gym.schedulers import ContinuousScheduler
from ns_gym.update_functions import DistributionDecrementUpdate
from ns_gym.wrappers import NSFrozenLakeWrapper

# Name of the tunable environment parameter that the wrapper will update.
param = 'P'

# Scheduler: decides at which timesteps the update function is applied.
# Per the original author's note it fires at every timestep — confirm against ns_gym.schedulers.
scheduler = ContinuousScheduler()

# Update rule driven by the scheduler, with step size k = 0.1.
# NOTE(review): the original note says this decrements the slipperiness by 0.1 each
# time the scheduler fires — confirm the direction of the change in ns_gym.update_functions.
update_function = DistributionDecrementUpdate(scheduler=scheduler, k=0.1)

# Mapping: tunable parameter name -> update function.
params = {param: update_function}

# Base Gymnasium environment: rgb_array rendering, episodes capped at 50 steps.
env = gym.make(
    'FrozenLake-v1',
    max_episode_steps=50,
    render_mode="rgb_array",
)

# Non-stationary wrapper around the base environment with both notification flags on.
# presumably initial_prob_dist=[1, 0, 0] starts the lake fully deterministic — TODO confirm.
custom_ns_env = NSFrozenLakeWrapper(
    env,
    params,
    change_notification=True,
    delta_change_notification=True,
    initial_prob_dist=[1, 0, 0],
)

## PPO with MCTS

In [None]:
from ns_gym.benchmark_algorithms import MCTS

def run_episode(env, max_steps=50):
    """
    Run one episode with uniformly sampled random actions, logging and collecting every transition.

    Args:
        env: Environment whose ``reset()`` returns ``(observation, info)`` with the
            initial state available as ``observation['state']``, and whose
            ``step(action)`` returns ``(observation, reward, done, truncated, info)``.
        max_steps: Maximum number of steps to take before stopping.

    Returns:
        list of dict: One entry per executed step with keys ``step``, ``state``,
        ``action``, ``reward``, ``next_state``, ``done``, ``truncated`` and ``info``.
    """
    def _extract_state(observation):
        # reset() exposes the state under the 'state' key; do the same for step()
        # observations so `state` keeps one type for the whole episode. Observations
        # that cannot be indexed by 'state' are passed through unchanged.
        try:
            return observation['state']
        except (TypeError, KeyError, IndexError):
            return observation

    state = env.reset()[0]['state']
    transitions = []

    for step in range(max_steps):
        action = env.action_space.sample()  # Sample a random action
        observation, reward, done, truncated, info = env.step(action)
        next_state = _extract_state(observation)

        print(f"Step: {step}, State: {state}, Action: {action}, Reward: {reward}, Next State: {next_state}, Done: {done}, Truncated: {truncated}")

        transitions.append({
            "step": step,
            "state": state,
            "action": action,
            "reward": reward,
            "next_state": next_state,
            "done": done,
            "truncated": truncated,
            "info": info,
        })

        state = next_state
        if done or truncated:
            break

    return transitions


In [None]:
# agent = MCTS(custom_ns_env, num_simulations=100)

# run_episode(custom_ns_env)

## PPO on Frozen Lake with Stable-Baselines3

In [None]:
# def make_planning_env():
#     env = custom_ns_env.get_planning_env()   # returns NSFrozenLakeWrapper
#     # env = KeepStateOnly(env)                 # <- flatten/strip to avoid nested obs
#     # print("Planning Env", env)
#     return env


def make_planning_env_v2():
    """Return ``custom_ns_env.unwrapped`` so it can serve as a ``make_vec_env`` factory.

    The function only reads the attribute of the module-level ``custom_ns_env``;
    it does not construct a new environment itself.
    """
    # Alternative left disabled by the original author: custom_ns_env.get_planning_env()
    return custom_ns_env.unwrapped

# Bind Stable-Baselines3's PPO under its own name. A later cell imports a different
# `PPO` from ns_gym.benchmark_algorithms, so re-running this cell after that one
# would otherwise build the wrong class with these arguments.
from stable_baselines3 import PPO as SB3PPO

# Reset the wrapped environment, then build a single-env vectorised wrapper
# from the factory defined above.
custom_ns_env.reset()
vec_env = make_vec_env(env_id=make_planning_env_v2, n_envs=1)

# Alternative (disabled): vectorise the planning env instead.
# custom_ns_env.reset()
# vec_env = make_vec_env(env_id=make_planning_env, n_envs=1)

# Train an MLP-policy PPO agent for 25,000 timesteps.
agent = SB3PPO("MlpPolicy", vec_env)
agent.learn(total_timesteps=25000)
# agent.save("ppo_frozenlake")


## PPO with Frozen Lake, with `PPO.py`

In [None]:
# PPO (ns_gym.benchmark_algorithms) on the FrozenLake custom_ns_env
from ns_gym.benchmark_algorithms import PPO, PPOActor, PPOCritic

# Reset the wrapped environment (the returned observation/info are not used here).
custom_ns_env.reset()

s_dim = 1  # FrozenLake state is discrete scalar
a_dim = custom_ns_env.action_space.n

# All hyperparameters are declared once; the networks and the agent below read
# their settings from this mapping, which is also passed to train_ppo.
config = dict(
    env_name="FrozenLake_custom_ns_env",
    max_episodes=50,
    batch_size=1024,
    minibatch_size=64,
    n_epochs=10,
    hidden_size=64,
    max_steps=50,
    gamma=0.99,
    lamb=0.95,
    device="cpu",
    lr_policy=3e-4,
    lr_critic=4e-4,
    max_grad_norm=0.5,
    clip_val=0.2,
    ent_weight=0.0,
    save_path="./",
    s_dim=s_dim,
    a_dim=a_dim,
)

# Discrete-action actor and state-value critic sharing the same hidden width.
actor = PPOActor(
    s_dim=config["s_dim"],
    a_dim=config["a_dim"],
    hidden_size=config["hidden_size"],
    is_discrete=True,
)
critic = PPOCritic(s_dim=config["s_dim"], hidden_size=config["hidden_size"])
agent = PPO(
    actor,
    critic,
    lr_policy=config["lr_policy"],
    lr_critic=config["lr_critic"],
    clip_val=config["clip_val"],
)

# Train on the non-stationary FrozenLake wrapper and report the value train_ppo returns.
best_reward = agent.train_ppo(custom_ns_env, config)
print("Best running reward:", best_reward)


In [None]:
# PPO (ns_gym.benchmark_algorithms) example on Pendulum-v1.
# NOTE(review): the original note states that the PPO implementation here assumes
# continuous actions; the actor below is built without `is_discrete`, so this
# presumably relies on the constructor's default — confirm in PPOActor.
ppo_env = gym.make("Pendulum-v1")

# State/action dimensionality is read from the first axis of each space's shape.
s_dim = ppo_env.observation_space.shape[0]
a_dim = ppo_env.action_space.shape[0]

# Hyperparameters are declared once; the networks and agent below read from this
# mapping, which is also handed to train_ppo.
config = dict(
    env_name="Pendulum-v1",
    max_episodes=50,
    batch_size=2048,
    minibatch_size=64,
    n_epochs=10,
    hidden_size=64,
    max_steps=200,
    gamma=0.99,
    lamb=0.95,
    device="cpu",
    lr_policy=3e-4,
    lr_critic=4e-4,
    max_grad_norm=0.5,
    clip_val=0.2,
    ent_weight=0.0,
    save_path="./",
)

actor = PPOActor(s_dim=s_dim, a_dim=a_dim, hidden_size=config["hidden_size"])
critic = PPOCritic(s_dim=s_dim, hidden_size=config["hidden_size"])
agent = PPO(
    actor,
    critic,
    lr_policy=config["lr_policy"],
    lr_critic=config["lr_critic"],
    clip_val=config["clip_val"],
)

# Train and report the value returned by train_ppo.
best_reward = agent.train_ppo(ppo_env, config)
print("Best running reward:", best_reward)
