## Imports

In [1]:
import os
from pathlib import Path
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

## Load Environment

In [4]:
# instantiate the environment registered with gym under this id
env_name = "CartPole-v0"
env = gym.make(id=env_name)

In [5]:
# smoke-test the env: play a handful of episodes, taking a randomly sampled
# action from the action space at every step, and print each episode's score
nb_episodes = 5
for ep in range(1, nb_episodes + 1):
    state, done, score = env.reset(), False, 0

    # render and step until the env reports the episode is over
    while True:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    print(f"episode: {ep},\tscore:{score}")


episode: 1,	score:25.0
episode: 2,	score:26.0
episode: 3,	score:27.0
episode: 4,	score:13.0
episode: 5,	score:14.0


In [6]:
# done with the random-action test: close the env that was rendered above
# (presumably this also tears down the render window — confirm for the gym version in use)
env.close()

## Train Model

In [7]:
# make sure the TensorBoard log folder exists (missing parents are created,
# and an already existing folder is left alone)
log_path = os.path.join("../training", "logs")
os.makedirs(log_path, exist_ok=True)

In [8]:
# create env and agent
# Build the gym env inside the factory instead of closing over the global `env`:
# that name is rebound to the DummyVecEnv by this very statement, so a factory
# written as `lambda: env` would return the vectorized wrapper (not a gym env)
# if it were ever invoked after the assignment.
env = DummyVecEnv([lambda: gym.make(env_name)])
# PPO agent with the "MlpPolicy" policy; verbose=1 prints training info and
# tensorboard_log points the training logs at log_path.
model = PPO(policy="MlpPolicy", env=env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [9]:
# train the agent for a budget of 20000 timesteps; because the model was built
# with tensorboard_log=log_path, the run is logged under that folder
model.learn(total_timesteps=20000)

Logging to ../training\logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 1866 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1166        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.003419831 |
|    clip_fraction        | 0.0823      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | 1.73e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.35        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.012      |
|    value_loss           | 56.2        |
-----------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x1530a3e7c40>

## Save and Reload Model

In [10]:
# prepare save location
# save_model_path is the path handed to model.save(), i.e. the model *file*, so
# only its containing folder has to exist. Calling mkdir on save_model_path
# itself would leave a stray empty "ppo_cart_pole" directory next to the saved model.
save_model_path = os.path.join("../training", "saved_models", "ppo_cart_pole")
Path(save_model_path).parent.mkdir(parents=True, exist_ok=True)

In [11]:
# persist the trained agent to disk at save_model_path
# NOTE(review): stable-baselines3 presumably appends ".zip" when the path has no
# extension — confirm the resulting file name
model.save(save_model_path)

In [12]:
# drop the in-memory agent so the name `model` is no longer bound; presumably a
# later cell reloads it from save_model_path to show the saved copy works — confirm
del model