## Imports

In [1]:
import os
from pathlib import Path
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

## Load Environment

In [2]:
# load the env from gym
env_name = "CartPole-v0"
env = gym.make(env_name)

In [3]:
# test env by using random valid actions from the action space
nb_episodes = 5
for ep in range(1, nb_episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        
    print(f"episode: {ep},\tscore:{score}")


episode: 1,	score:10.0
episode: 2,	score:13.0
episode: 3,	score:22.0
episode: 4,	score:25.0
episode: 5,	score:13.0


In [4]:
env.close()

## Train Model

In [5]:
# prepare tensorboard log
log_path = os.path.join("..", "training", "logs")
Path(log_path).mkdir(parents=True, exist_ok=True)

In [6]:
# prepare save location
save_models_path = os.path.join("..", "training", "saved_models")
Path(save_models_path).mkdir(parents=True, exist_ok=True)

In [7]:
# set up callbacks (optional)
# stop when we have 200 reward
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# evaluate callbacks periodically.
# when we have a new best model, check the stop callback and save best model
eval_callback = EvalCallback(
    env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=save_models_path,
    verbose=1
)

In [8]:
# create env and agent
env = gym.make(env_name)#, render_mode=None)
env = DummyVecEnv([lambda: env])
model = PPO(policy="MlpPolicy", env=env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [9]:
# train
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to ..\training\logs\PPO_3
-----------------------------
| time/              |      |
|    fps             | 597  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 733          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0050162263 |
|    clip_fraction        | 0.0698       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.687       |
|    explained_variance   | -0.0244      |
|    learning_rate        | 0.0003       |
|    loss                 | 8.06         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00956     |
|    value_loss           | 50.3         |
-------------------------



Eval num_timesteps=10000, episode_reward=200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
New best mean reward!
Stopping training because the mean reward 200.00  is above the threshold 200


<stable_baselines3.ppo.ppo.PPO at 0x1c20c98fcd0>

## Save and Reload Model

In [10]:
# prepare save location
ppo_save_path = os.path.join(save_models_path, "ppo_cart_pole")

In [11]:
# save model
model.save(ppo_save_path)

In [12]:
# remove model from memory
del model

In [13]:
# reload model from saved files
model = PPO.load(ppo_save_path, env=env)

## Evaluate Policy

In [14]:
score_avg, score_std = evaluate_policy(model, env, n_eval_episodes=10, render=True)
env.close()

In [15]:
print(f"Score (avg): {score_avg}")
print(f"Score (std): {score_std}")

Score (avg): 192.1999969482422
Score (std): 17.353961944580078


## Test Model

In [16]:
# test trained model actions in env
nb_episodes = 5
for ep in range(1, nb_episodes+1):
    obs = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        
    print(f"episode: {ep},\tscore:{score}")


episode: 1,	score:[154.]
episode: 2,	score:[144.]
episode: 3,	score:[120.]
episode: 4,	score:[68.]
episode: 5,	score:[102.]


In [17]:
env.close()

## Tensorboard Logs

Run the following from the terminal:
`tensorboard --logdir=<log-path>`

See the below cell to get the log path

In [18]:
print(f"tensorboard --logdir=\"{log_path}\"")

tensorboard --logdir="..\training\logs"
