In [1]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy

env_name = 'CartPole-v1'
# Define environment
env = DummyVecEnv([lambda: gym.make(env_name)])

# Define an evaluation callback to assess the agent's performance during training
eval_callback = EvalCallback(
    env,
    eval_freq=500,  # Evaluate every 500 steps
    n_eval_episodes=10,  # Number of episodes to evaluate
    deterministic=True,  # Use deterministic policy for evaluation
    callback_on_new_best=None,  # Callback when a new best model is found,
    verbose=0
)

# Create and train A2C model
model_vanilla = A2C("MlpPolicy", env,gae_lambda=0.92, verbose=0)
model_vanilla.learn(total_timesteps=10000,callback=eval_callback)


# Collect samples using the trained model
obs = env.reset()
values, _ = model_vanilla.predict(obs)
advantage_vanilla = values.squeeze()
print("Vanilla Advantage:", advantage_vanilla)


eval_env = gym.make(env_name, render_mode="rgb_array")
vanilla_mean_reward, vanilla_std_reward = evaluate_policy(model_vanilla, eval_env, n_eval_episodes=100)
 
print(f"mean_reward:{vanilla_mean_reward:.2f} +/- {vanilla_std_reward:.2f}")



Vanilla Advantage: 1
mean_reward:204.99 +/- 76.37


In [2]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy

env_name = 'CartPole-v1'
# Define environment
env = DummyVecEnv([lambda: gym.make(env_name)])

# Define an evaluation callback to assess the agent's performance during training
eval_callback = EvalCallback(
    env,
    eval_freq=500,  # Evaluate every 500 steps
    n_eval_episodes=10,  # Number of episodes to evaluate
    deterministic=True,  # Use deterministic policy for evaluation
    callback_on_new_best=None,  # Callback when a new best model is found,
    verbose=0
)


# Create and train A2C model with n-step advantages and returns
n_steps = 5
model_nstep = A2C("MlpPolicy", env, n_steps=n_steps,gae_lambda=0.98, verbose=0)
model_vanilla.learn(total_timesteps=10000,callback=eval_callback)

# Collect samples using the trained model
obs = env.reset()
values, _ = model_nstep.predict(obs)
advantage_nstep = values.squeeze()
print("n-step Advantage:", advantage_nstep)

eval_env = gym.make(env_name, render_mode="rgb_array")
n_steps_mean_reward, n_steps_std_reward = evaluate_policy(model_vanilla, eval_env, n_eval_episodes=100)

print(f"mean_reward:{n_steps_mean_reward:.2f} +/- {n_steps_std_reward:.2f}")


n-step Advantage: 1
mean_reward:280.75 +/- 57.85


In [3]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy

env_name = 'CartPole-v1'
# Define environment
env = DummyVecEnv([lambda: gym.make(env_name)])

# Define an evaluation callback to assess the agent's performance during training
eval_callback = EvalCallback(
    env,
    eval_freq=500,  # Evaluate every 500 steps
    n_eval_episodes=10,  # Number of episodes to evaluate
    deterministic=True,  # Use deterministic policy for evaluation
    callback_on_new_best=None,  # Callback when a new best model is found,
    verbose=0
)



# Define environment
env = DummyVecEnv([lambda: gym.make('CartPole-v1')])

# Create and train A2C model with Monte Carlo advantages
model_mc = A2C("MlpPolicy", env, n_steps=1,gae_lambda=0.96, verbose=0)
model_mc.learn(total_timesteps=10000, callback=eval_callback)

# Collect samples using the trained model
obs = env.reset()
values, _ = model_mc.predict(obs)
advantage_mc = values.squeeze()
print("Monte Carlo Advantage:", advantage_mc)



eval_env = gym.make(env_name, render_mode="rgb_array")
mc_mean_reward, mc_std_reward = evaluate_policy(model_vanilla, eval_env, n_eval_episodes=100)

print(f"mean_reward:{mc_mean_reward:.2f} +/- {mc_std_reward:.2f}")


Monte Carlo Advantage: 1
mean_reward:275.77 +/- 59.40


In [4]:
print(f"vanilla_mean_reward:{vanilla_mean_reward:.2f} +/- {vanilla_std_reward:.2f}")
print(f"n_steps_mean_reward:{n_steps_mean_reward:.2f} +/- {n_steps_std_reward:.2f}")
print(f"mc_mean_reward:{mc_mean_reward:.2f} +/- {mc_std_reward:.2f}")

vanilla_mean_reward:204.99 +/- 76.37
n_steps_mean_reward:280.75 +/- 57.85
mc_mean_reward:275.77 +/- 59.40
