In [None]:
!pip install 'stable-baselines3[extra]'
!pip install gymnasium

In [2]:
import os
import gymnasium as gym # https://gymnasium.farama.org/

from IPython import display
from stable_baselines3 import PPO # first algorithm - https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html
from stable_baselines3.common.vec_env import DummyVecEnv # vectorise environments
from stable_baselines3.common.evaluation import evaluate_policy # easier to test how the model is actually performing

Learnings so far: `gym` is now called `gymnasium`; put quotes around `'stable-baselines3[extra]'` when installing it with pip; `CartPole-v0` is deprecated (use `CartPole-v1`); `env.step()` now returns 5 values instead of 4 (`done` has been replaced by `terminated` and `truncated`); and `render_mode` is set in the `gym.make` call.

In [3]:
# Id of the Gymnasium environment used throughout this notebook.
environment_name = 'CartPole-v1'
# The render mode is chosen when the environment is created; "human" requests an on-screen window.
env = gym.make(environment_name, render_mode="human")

In [None]:
# Play a few episodes with purely random actions to see how the environment behaves.
# An episode is one full game within the environment. Some environments have a
# fixed maximum length (CartPole-v1 is truncated after 500 steps); others are
# continuous, e.g. play until no more lives are left.
episodes = 5
for episode in range(1, episodes + 1):
    # Gymnasium's reset() returns a pair: (initial observation, info dict).
    state, info = env.reset()
    terminated = False
    truncated = False
    score = 0

    # Action Space - actions you can take in your environment
    # Observation Space - what your observations look like, a partial view. Here we have a Box environment

    # The episode is over as soon as it is EITHER terminated (pole fell / cart
    # left the track) OR truncated (step limit reached). The previous condition
    # `not terminated or not truncated` kept stepping until both were true.
    # No explicit env.render() call here: the render mode is fixed when the
    # environment is created with gym.make.
    while not (terminated or truncated):
        action = env.action_space.sample()  # It's a discrete two action space, 0 or 1 Discrete(2)
        n_state, reward, terminated, truncated, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
# Stand-alone clean-up cell: closes the environment (the loop cell above also calls env.close() on its last line).
env.close()

In [None]:
# Display the environment's action space, i.e. the set of actions an agent may take.
env.action_space

Action space = what you can do. Here it's Discrete(2) - you can push the cart left (0) or right (1)
Observation space = what you can see. Here it's a Box with 4 values describing the cart and the pole (cart position, cart velocity, pole angle, pole angular velocity)

Type of Algorithms - there's a ton

Model free only uses current state values to make a prediction - model based tries to figure out a model of the environment


Model-Based vs Model-Free: learning based on predictions of the next state/reward vs. learning from real samples

Stable baselines (what we are working with) focuses on Model Free 

Algorithm choice can be based on the action space - PPO works with all of them, but some algorithms, for example TD3, only work with Box action spaces

Treat these algorithms as commodities - good to know in detail how they are put together and/or how they work

Training metrics - what you get back during training

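Evaluation - all to do with episode length and reward (ep_len_mean, ep_rew_mean)
Time - fps, iterations, time_elapsed, total_timesteps
Loss - entropy_loss, policy_gradient_loss and value_loss
Other - explained_variance: the fraction of the variance in the returns that the value function explains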

You need pytorch for GPU acceleration

In [8]:
# Directory that receives the training logs (passed to PPO as tensorboard_log).
log_path = os.path.join(
    'Training',
    'Logs',
)

In [None]:
# Display the log directory path to check it was assembled as expected.
log_path

In [None]:
# Build the vectorised training environment. The factory creates its own
# Gymnasium env rather than closing over the name `env`, which is rebound to the
# DummyVecEnv on this very line (the old closure only worked because
# DummyVecEnv calls the factory inside its constructor).
env = DummyVecEnv([lambda: gym.make(environment_name)])
# PPO agent with a multilayer-perceptron policy; training metrics are logged under log_path.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

In [None]:
# Train the agent for 20,000 environment timesteps.
model.learn(total_timesteps=20000)

In [None]:
# Location (without file extension) where the trained PPO model is saved and reloaded from.
PPO_Path = os.path.join(
    'Training',
    'Saved Models',
    'PPO_Model_Cartpole',
)

In [None]:
# Persist the trained model to disk at PPO_Path.
model.save(PPO_Path)

In [None]:
# Drop the in-memory model so the following cell demonstrates reloading it from disk.
del model

In [None]:
# Reload the saved model from PPO_Path and attach the current environment to it.
model = PPO.load(PPO_Path, env=env)

ep_len_mean = how long the episodes lasted on average
ep_rew_mean = how much reward the agent got per episode on average

In [None]:
# Run the model for 10 evaluation episodes; the cell output is the (mean reward, std of reward) pair.
# NOTE(review): render=True presumably only shows anything if `env` was created with a
# render_mode - the gym.make call that builds the vectorised env does not pass one. Confirm.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
# Close the environment once evaluation is finished.
env.close()

In [None]:
# Watch the trained agent play a few episodes.
# An episode is one full game within the environment. Some environments have a
# fixed maximum length (CartPole-v1 is truncated after 500 steps); others are
# continuous, e.g. play until no more lives are left.
episodes = 5

# When the notebook is run top to bottom, `env` is a DummyVecEnv at this point:
# its reset() returns only the observations and its step() returns 4 values, so
# the Gymnasium-style unpacking below would fail on it. Use a dedicated, plain
# Gymnasium environment that renders on screen instead.
test_env = gym.make(environment_name, render_mode="human")
try:
    for episode in range(1, episodes + 1):
        obs, info = test_env.reset()
        terminated = False
        truncated = False
        score = 0

        # Action Space - actions you can take in your environment
        # Observation Space - what your observations look like, a partial view. Here we have a Box environment

        # Stop as soon as the episode is terminated OR truncated.
        while not (terminated or truncated):
            # Now using the model here. predict() returns (action, hidden state);
            # the hidden state is only meaningful for recurrent policies, so it is discarded.
            action, _ = model.predict(obs)
            # For a single observation the action comes back as a 0-d array; pass a plain int.
            obs, reward, terminated, truncated, info = test_env.step(int(action))
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always release the render window, even if an episode raised.
    test_env.close()

In [None]:
# Close the environment once the test episodes are finished.
env.close()

7. Viewing Logs in Tensorboard

In [None]:
# Log directory of one specific training run. The run name 'PPO_1' is hard-coded;
# the captured training output elsewhere in this notebook shows later runs being
# logged to PPO_2 and PPO_3, so adjust it to the run you want to inspect.
training_log_path = os.path.join(
    log_path,
    'PPO_1',
)

In [None]:
# Launch TensorBoard on the selected run; IPython substitutes {training_log_path} with the Python variable's value.
# The cell keeps running for as long as the TensorBoard server is up - interrupt the kernel to stop it.
!tensorboard --logdir={training_log_path}

The core metric to pay attention to is the average reward. This is an indication of how well the model is going to perform in that environment using that reward function. The second is the average episode length (this is important in continuous environments).

To get a better model:
1. Train for Longer
2. Hyperparameter Tuning

8. Adding Callbacks

In [4]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [5]:
# Directory in which the evaluation callback stores the best model found so far.
save_path = os.path.join(
    'Training',
    'Saved Models',
    'PPO_Model_Cartpole',
)

In [6]:
# Ends training once an evaluation's mean reward is above the threshold.
stop_callback = StopTrainingOnRewardThreshold(verbose=1, reward_threshold=200)

# Periodically evaluates the agent (every 10,000 steps here), saves the best
# model under save_path and, on each new best result, invokes stop_callback.
eval_callback = EvalCallback(
    env,
    eval_freq=10000,
    best_model_save_path=save_path,
    callback_on_new_best=stop_callback,
    verbose=1,
)

In [9]:
# Start from a fresh, untrained PPO agent so the callback-driven training below begins from scratch.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [10]:
# Train for up to 20,000 timesteps; eval_callback may end training early once the reward threshold is exceeded.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.3     |
|    ep_rew_mean     | 22.3     |
| time/              |          |
|    fps             | 45       |
|    iterations      | 1        |
|    time_elapsed    | 45       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 28.4        |
|    ep_rew_mean          | 28.4        |
| time/                   |             |
|    fps                  | 45          |
|    iterations           | 2           |
|    time_elapsed         | 90          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008685252 |
|    clip_fraction        | 0.103       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00316     |



Eval num_timesteps=10000, episode_reward=460.80 +/- 78.40
Episode length: 460.80 +/- 78.40
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 461         |
|    mean_reward          | 461         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.008257891 |
|    clip_fraction        | 0.0779      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.608      |
|    explained_variance   | 0.238       |
|    learning_rate        | 0.0003      |
|    loss                 | 28          |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0188     |
|    value_loss           | 64.5        |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 460.80  is above the threshold 200


<stable_baselines3.ppo.ppo.PPO at 0x1600e09a0>

9. Changing Policies

In [16]:
# Custom network architecture: four hidden layers of 128 units each for both
# the policy network ('pi') and the value-function network ('vf').
net_arch = {
    'pi': [128, 128, 128, 128],
    'vf': [128, 128, 128, 128],
}

In [17]:
# Fresh PPO agent whose MLP policy uses the custom layer sizes from net_arch.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=net_arch),
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [18]:
# Train the custom-architecture agent for up to 20,000 timesteps, reusing the same evaluation callback.
# NOTE(review): eval_callback was already used for an earlier training run in this notebook; it
# presumably keeps its best-reward record between runs, so early stopping here would only trigger
# on a result that beats the earlier best - confirm, or create a fresh callback.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.3     |
|    ep_rew_mean     | 21.3     |
| time/              |          |
|    fps             | 45       |
|    iterations      | 1        |
|    time_elapsed    | 44       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 28.4        |
|    ep_rew_mean          | 28.4        |
| time/                   |             |
|    fps                  | 45          |
|    iterations           | 2           |
|    time_elapsed         | 90          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014556032 |
|    clip_fraction        | 0.25        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.68       |
|    explained_variance   | 0.00475     |

New best mean reward!
Stopping training because the mean reward 500.00  is above the threshold 200


<stable_baselines3.ppo.ppo.PPO at 0x161fd4280>