## Complete model for CartPole-v0
### Definitions

In [4]:
import os
import gym
import torch

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Gym environment id; later cells pass it to gym.make().
env_name = 'CartPole-v0'

# Plain (un-wrapped) environment used by the random-action rollout below.
my_env = gym.make(env_name)

# Directory later passed to PPO as its `tensorboard_log` location.
log_path = os.path.join('Training', 'Logs')

In [5]:
# Echo the environment id as this cell's output.
env_name

'CartPole-v0'

### Running environment

In [6]:
# Baseline: play `episodes` episodes with randomly sampled actions and print
# the total reward collected in each one.
episodes = 20
try:
    for episode in range(1, episodes + 1):
        state = my_env.reset()
        done = False
        score = 0

        while not done:
            my_env.render()
            # No policy is involved here: the action is sampled from the
            # environment's action space.
            action = my_env.action_space.sample()
            n_state, reward, done, info = my_env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Close the environment even if the loop raises or is interrupted, so the
    # render window is not left behind.
    my_env.close()

Episode:1 Score:15.0
Episode:2 Score:70.0
Episode:3 Score:9.0
Episode:4 Score:21.0
Episode:5 Score:17.0
Episode:6 Score:29.0
Episode:7 Score:31.0
Episode:8 Score:10.0
Episode:9 Score:37.0
Episode:10 Score:23.0
Episode:11 Score:44.0
Episode:12 Score:16.0
Episode:13 Score:27.0
Episode:14 Score:39.0
Episode:15 Score:10.0
Episode:16 Score:63.0
Episode:17 Score:33.0
Episode:18 Score:19.0
Episode:19 Score:13.0
Episode:20 Score:23.0


### Training the RL model

In [7]:
# Build the vectorised training environment. The factory creates its own
# environment instead of closing over `my_env`, which is the very name being
# rebound on this line (a later call of the old factory would have returned
# the VecEnv wrapper itself).
my_env = DummyVecEnv([lambda: gym.make(env_name)])

# PPO with an MLP policy; training metrics are written under `log_path`.
model = PPO('MlpPolicy', my_env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=20000)

Using cpu device
Logging to Training/Logs/PPO_5
-----------------------------
| time/              |      |
|    fps             | 2872 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 2093        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009218934 |
|    clip_fraction        | 0.105       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.000274   |
|    learning_rate        | 0.0003      |
|    loss                 | 7.55        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0158     |
|    value_loss           | 51.1        |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7f5cd9e27430>

### Evaluating the model

In [8]:
# Evaluate the trained model over five rendered episodes; the call's return
# value is displayed as the cell output.
evaluate_policy(
    model,
    my_env,
    n_eval_episodes=5,
    render=True,
)



(200.0, 0.0)

In [9]:
# Close the environment after the rendered evaluation.
# NOTE(review): `my_env` is reset and stepped again in the next cell after
# this close() — confirm the environment tolerates being reused.
my_env.close()

### Running the model

In [10]:
# Roll out the trained policy and print each episode's total reward.
# `my_env` is a DummyVecEnv at this point, so step() returns one entry per
# wrapped environment. Index 0 selects the single wrapped environment so that
# the loop condition and the score are plain scalars (the score prints as
# e.g. 200.0 rather than [200.]).
episodes = 5
for episode in range(1, episodes + 1):
    obs = my_env.reset()
    done = False
    score = 0
    while not done:
        my_env.render()
        # predict() returns (action, state); the second value is not needed.
        action, _ = model.predict(obs)
        obs, rewards, dones, info = my_env.step(action)
        score += float(rewards[0])
        done = bool(dones[0])
    print('Episode:{} Score:{}'.format(episode, score))


Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]


In [11]:
# Close the environment once the rendered rollouts are finished.
my_env.close()

### Viewing Logs on Tensorboard

In [None]:
training_log_path = os.path.join(log_path, 'PPO_5')
!tensorboard --logdir={training_log_path} --load_fast=True