In [None]:
# https://stable-baselines3.readthedocs.io/en/master/guide/rl.html
# https://spinningup.openai.com/en/latest/spinningup/rl_intro2.html#a-taxonomy-of-rl-algorithms

# 1. Import dependencies

In [4]:
%pip install "stable-baselines3[extra]"

Collecting stable-baselines3[extra]
  Using cached stable_baselines3-2.7.0-py3-none-any.whl.metadata (4.8 kB)
Collecting gymnasium<1.3.0,>=0.29.1 (from stable-baselines3[extra])
  Using cached gymnasium-1.2.1-py3-none-any.whl.metadata (10.0 kB)
Collecting numpy<3.0,>=1.20 (from stable-baselines3[extra])
  Using cached numpy-2.3.3-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting torch<3.0,>=2.3 (from stable-baselines3[extra])
  Using cached torch-2.8.0-cp311-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting cloudpickle (from stable-baselines3[extra])
  Using cached cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pandas (from stable-baselines3[extra])
  Using cached pandas-2.3.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting matplotlib (from stable-baselines3[extra])
  Using cached matplotlib-3.10.6-cp311-cp311-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting opencv-python (from stable-baselines3[extra])
  Using cached opencv_python-4.12.

In [20]:
import os
import gymnasium as gym 
from stable_baselines3 import PPO # Proximal Policy Optimization
from stable_baselines3.common.vec_env import DummyVecEnv # Wrapper to work with vectorized environments
from stable_baselines3.common.evaluation import evaluate_policy # Helper to evaluate the agent

# 2. Load Environment

In [4]:
# Create environment
environment_name = "CartPole-v1"
env = gym.make(environment_name)

In [28]:
# Run the environment for 5 episodes
# Note: In Gymnasium, 'done' is split into 'terminated' and 'truncated'
# 'terminated' indicates if the episode ended due to a terminal state
# 'truncated' indicates if the episode was cut off due to a time limit or other constraints
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset() # reset() returns a tuple (observation, info)
    done = False
    score = 0 # initialize the score for the episode
    
    while not done:
        env.render() # render the environment
        action = env.action_space.sample() # take a random action from action space (0/1)
        n_state, reward, terminated, truncated, info = env.step(action) # take the action and get the next state and reward
        done = terminated or truncated
        score += reward # update the score
    print('Episode:{} Score:{}'.format(episode, score))
    
env.close() # close the environment



IndexError: invalid index to scalar variable.

# Understanding The Environment
https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py

In [6]:
# 0-push cart to left, 1-push cart to the right
print(env.action_space)
env.action_space.sample()

Discrete(2)


np.int64(1)

In [7]:
# [cart position, cart velocity, pole angle, pole angular velocity]
print(env.observation_space)
env.observation_space.sample()

Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)


array([ 4.0060825 ,  0.42791492, -0.18488151,  1.586126  ], dtype=float32)

# 3. Train an RL Model

In [8]:
# Make the directories for saving the log
log_path = os.path.join('Training', 'Logs')

env = gym.make(environment_name) # Create the environment
env = DummyVecEnv([lambda: env]) # Wrap the environment in a vectorized environment
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log = log_path) # Create the agent using PPO algorithm and MLP policy

Using cpu device


In [9]:
model.learn(total_timesteps=20000) # Train the agent for 20,000 time steps

Logging to Training/Logs/PPO_2
-----------------------------
| time/              |      |
|    fps             | 6422 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 4392        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008151939 |
|    clip_fraction        | 0.0658      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | 0.00085     |
|    learning_rate        | 0.0003      |
|    loss                 | 4.81        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.01       |
|    value_loss           | 51.9        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x1209e8450>

# 4. Save and Reload Model

In [10]:
# Path to save the model
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_model_cartpole')

In [11]:
# Save the model
model.save(PPO_path)

In [None]:
del model

In [None]:
model = PPO.load('PPO_model', env=env)

# 4. Evaluation

In [34]:
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
# Evaluate the agent and render the environment (Use the next block for no-rendering evaluation)
from stable_baselines3.common.monitor import Monitor # To record the episode statistics

eval_env = Monitor(gym.make(environment_name, render_mode="human")) # Create a separate environment for evaluation
# Evaluate the agent for 10 episodes and render the environment
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, render=True)
print(f'Mean reward: {mean_reward} +/- {std_reward}')

Mean reward: 500.0 +/- 0.0


In [12]:
eval_env = gym.make(environment_name)
# Evaluate the agent for 10 episodes and don't render the environment
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, render=False)
print(f'Mean reward: {mean_reward} +/- {std_reward}')



Mean reward: 344.0 +/- 38.10774199555781


In [13]:
env.close()

# 5. Test Model

In [None]:
# Test the trained agent using the learned policy
for episode in range(5):
    obs = env.reset()
    score = 0
    done = False
    while not done:
        action, _states = model.predict(obs) # Predict the action using the learned policy
        obs, reward, done, info = env.step(action) # env is already vectorized and env.step returns 4 values
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:[418.]
Episode:1 Score:[262.]
Episode:1 Score:[200.]
Episode:1 Score:[242.]
Episode:1 Score:[293.]


In [25]:
env.close()

# 6. Viewing Logs in Tensorboard

In [30]:
training_log_path = os.path.join(log_path, 'PPO_2')

In [31]:
!tensorboard --logdir={training_log_path}

  import pkg_resources
TensorFlow installation not found - running with reduced feature set.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.20.0 at http://localhost:6006/ (Press CTRL+C to quit)
W0925 10:19:44.546509 6193557504 application.py:559] path /apple-touch-icon-precomposed.png not found, sending 404
W0925 10:19:44.694894 6126252032 application.py:559] path /apple-touch-icon.png not found, sending 404
^C


# 7. Adding a callback to the training Stage

In [32]:
# Early stopping callback to stop training when a certain reward threshold is reached
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [33]:
save_path = os.path.join('Training', 'Saved Models')
log_path = os.path.join('Training', 'Logs')

In [34]:
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])

In [35]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=190, verbose=1) # Stop training when the agent reaches the reward threshold of 190
eval_callback = EvalCallback(env, 
                             callback_on_new_best=stop_callback, 
                             eval_freq=10000, 
                             best_model_save_path=save_path, 
                             verbose=1) # Evaluate the agent every 10,000 steps and save the best model

In [36]:
# Create the model
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

Using cpu device


In [37]:
# Train the model with the evaluation callback
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/PPO_3
-----------------------------
| time/              |      |
|    fps             | 6346 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 4241        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008642239 |
|    clip_fraction        | 0.0994      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00159     |
|    learning_rate        | 0.0003      |
|    loss                 | 5.46        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.017      |
|    value_loss           | 52.6        |
-----------------------------------------
---



Eval num_timesteps=10000, episode_reward=460.00 +/- 80.00
Episode length: 460.00 +/- 80.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 460         |
|    mean_reward          | 460         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.008859875 |
|    clip_fraction        | 0.0829      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.611      |
|    explained_variance   | 0.299       |
|    learning_rate        | 0.0003      |
|    loss                 | 29.8        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0179     |
|    value_loss           | 59.9        |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 460.00  is above the threshold 190


<stable_baselines3.ppo.ppo.PPO at 0x142d6cb50>

In [None]:
model_path = os.path.join('Training', 'Saved Models', 'best_model')
model = PPO.load(model_path, env=env)

In [38]:
evaluate_policy(model, env, n_eval_episodes=10, render=False)



(np.float64(420.0), np.float64(119.24344845734713))

In [39]:
env.close()

# 8. Changing Policies

In [None]:
# Custom MLP policy
net_arch=[dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])] # Custom architecture with 4 hidden layers of 128 neurons each for both policy and value networks
print(net_arch)

[{'pi': [128, 128, 128, 128], 'vf': [128, 128, 128, 128]}]


In [41]:
model = PPO('MlpPolicy', env, verbose = 1, policy_kwargs={'net_arch': net_arch})

Using cpu device




In [42]:
model.learn(total_timesteps=20000, callback=eval_callback)

-----------------------------
| time/              |      |
|    fps             | 5373 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 3162        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014752792 |
|    clip_fraction        | 0.215       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | -0.00154    |
|    learning_rate        | 0.0003      |
|    loss                 | 2.81        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.024      |
|    value_loss           | 18.9        |
-----------------------------------------
----------------------------------



Eval num_timesteps=10000, episode_reward=457.40 +/- 74.24
Episode length: 457.40 +/- 74.24
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 457         |
|    mean_reward          | 457         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.011494327 |
|    clip_fraction        | 0.151       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.571      |
|    explained_variance   | 0.491       |
|    learning_rate        | 0.0003      |
|    loss                 | 14.8        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.019      |
|    value_loss           | 41.9        |
-----------------------------------------
------------------------------
| time/              |       |
|    fps             | 2383  |
|    iterations      | 5     |
|    time_elapsed    | 4     

<stable_baselines3.ppo.ppo.PPO at 0x144211350>

# 9. Using an Alternate Algorithm

In [43]:
# Import DQN (Deep Q-Network) algorithm
from stable_baselines3 import DQN

In [44]:
model = DQN('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

Using cpu device


In [45]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.967    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5018     |
|    time_elapsed     | 0        |
|    total_timesteps  | 70       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.927    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3332     |
|    time_elapsed     | 0        |
|    total_timesteps  | 153      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.492    |
|    n_updates        | 13       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.871    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 

<stable_baselines3.dqn.dqn.DQN at 0x142d22190>

In [46]:
dqn_path = os.path.join('Training', 'Saved Models', 'DQN_model')

In [47]:
model.save(dqn_path)

In [None]:
model = DQN.load(dqn_path, env=env)

In [48]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(np.float64(21.1), np.float64(4.635730794599704))

In [49]:
env.close()