## Imports

In [1]:
import os
from pathlib import Path
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

## Load Environment

In [2]:
# Environment id handed to gym.make; reused later when the training environment is built
env_name = "CartPole-v0"
# Instantiate the environment that the random-action test runs against
env = gym.make(env_name)

In [3]:
# Smoke-test the environment: play a few episodes with actions sampled
# uniformly from the action space and report the accumulated reward of each.
nb_episodes = 5
for ep in range(1, nb_episodes + 1):
    state = env.reset()
    score, done = 0, False

    # step until the environment signals the end of the episode
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score = score + reward

    print(f"episode: {ep},\tscore:{score}")


episode: 1,	score:13.0
episode: 2,	score:14.0
episode: 3,	score:37.0
episode: 4,	score:20.0
episode: 5,	score:12.0


In [4]:
# Close the environment used for the random-action test; a fresh one is created for training
env.close()

## Train Model

In [5]:
# Directory for TensorBoard logs; created (with any missing parents) before training starts
log_path = os.path.join("../training", "logs")
os.makedirs(log_path, exist_ok=True)

In [6]:
# Build the training environment and the PPO agent.
# The factory given to DummyVecEnv constructs the environment itself instead of
# closing over `env`: that name is rebound to the vectorized wrapper by this very
# statement, so a closure over it would hand back the wrapper (not a raw gym env)
# if the factory were ever invoked again.
env = DummyVecEnv([lambda: gym.make(env_name)])
# verbose=1 prints training progress; TensorBoard logs are written under log_path
model = PPO(policy="MlpPolicy", env=env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [7]:
# Train the PPO agent with a budget of 20000 timesteps; progress is logged under log_path
model.learn(total_timesteps=20000)

Logging to ../training\logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 1938 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 1205      |
|    iterations           | 2         |
|    time_elapsed         | 3         |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 0.0126093 |
|    clip_fraction        | 0.115     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.686    |
|    explained_variance   | 0.0022    |
|    learning_rate        | 0.0003    |
|    loss                 | 7.21      |
|    n_updates            | 10        |
|    policy_gradient_loss | -0.0165   |
|    value_loss           | 46.3      |
---------------------------------------
------------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x205ec7abee0>

## Save and Reload Model

In [8]:
# Directory that holds saved agents; created up front (with any missing parents)
save_models_path = os.path.join("../training", "saved_models")
os.makedirs(save_models_path, exist_ok=True)
# Target path (no file extension given here) for the CartPole PPO agent
ppo_save_path = os.path.join(save_models_path, "ppo_cart_pole")

In [9]:
# Persist the trained agent to ppo_save_path
model.save(ppo_save_path)

In [10]:
# Drop the in-memory agent so the next cell has to restore it from the saved file
del model

In [11]:
# Restore the agent from ppo_save_path and attach the existing vectorized env to it
model = PPO.load(ppo_save_path, env=env)