## Import dependencies

In [2]:
!pip install stable-baselines3[extra]
!pip install pyglet



In [4]:
import os
import gym # Work with env
from stable_baselines3 import PPO # Algorithm 
from stable_baselines3.common.vec_env import DummyVecEnv # 
from stable_baselines3.common.evaluation import evaluate_policy # Evaluate policy

import time

## Load Environment

In [7]:
environment_name = 'CartPole-v0'
env = gym.make(environment_name)

In [8]:
episodes = 5
for episode in range(1,episodes+1):
  state = env.reset()
  done = False
  score = 0
  while not done:
    env.render()
    action = env.action_space.sample()
    n_state, reward, done, info = env.step(action)
    score += reward
  time.sleep(5)
  
  print("Episode: {} Score:{}".format(episode, score))
env.close()

Episode: 1 Score:14.0
Episode: 2 Score:17.0
Episode: 3 Score:12.0
Episode: 4 Score:14.0
Episode: 5 Score:20.0


## Train an RL Model

In [4]:
# Directory need to exist
log_path = os.path.join('RL','logs') 
print(log_path)

RL/logs


In [7]:
PPO??

[0;31mInit signature:[0m
[0mPPO[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpolicy[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mType[0m[0;34m[[0m[0mstable_baselines3[0m[0;34m.[0m[0mcommon[0m[0;34m.[0m[0mpolicies[0m[0;34m.[0m[0mActorCriticPolicy[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0menv[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mgym[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mEnv[0m[0;34m,[0m [0mstable_baselines3[0m[0;34m.[0m[0mcommon[0m[0;34m.[0m[0mvec_env[0m[0;34m.[0m[0mbase_vec_env[0m[0;34m.[0m[0mVecEnv[0m[0;34m,[0m [0mstr[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlearning_rate[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mCallable[0m[0;34m[[0m[0;34m[[0m[0mfloat[0m[0;34m][0m[0;34m,[0m [0mfloat[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;36m0.0003[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_steps[0m[0;34m:[0m [0mint[0m [0

In [5]:
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [6]:
model.learn(total_timesteps=20000)

Logging to RL/logs/PPO_3
-----------------------------
| time/              |      |
|    fps             | 3232 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 2479       |
|    iterations           | 2          |
|    time_elapsed         | 1          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00818749 |
|    clip_fraction        | 0.0949     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.686     |
|    explained_variance   | -0.0166    |
|    learning_rate        | 0.0003     |
|    loss                 | 7.48       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.013     |
|    value_loss           | 52.8       |
----------------------------------------
---------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7f2e8c0c1130>

## Save & Reload Model

In [7]:
PPO_Path = os.path.join('RL', 'saved_models', 'PPO_Model_Cartpole')
model.save(PPO_Path)

In [8]:
del model

In [9]:
model = PPO.load(PPO_Path, env=env)

## Evaluation

In [10]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(200.0, 0.0)

In [11]:
env.close()

## Testing Model

In [19]:
episodes = 5
for episode in range(1,episodes+1):
  obs = env.reset()
  done = False
  score = 0
  while not done:
    env.render()
    action, _ = model.predict(obs)
    obs, reward, done, info = env.step(action)
    score += reward
  
  print("Episode: {} Score:{}".format(episode, score))
#env.close()

Episode: 1 Score:[200.]
Episode: 2 Score:[200.]
Episode: 3 Score:[200.]
Episode: 4 Score:[200.]
Episode: 5 Score:[167.]


In [20]:
env.close()

## Viewing Logs in Tensorboard

In [22]:
training_log_path = os.path.join(log_path, 'PPO_2')
training_log_path

'RL/logs/PPO_2'

In [None]:
!tensorboard --logdir={training_log_path} 
# localhost:6006/

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.9.1 at http://localhost:6006/ (Press CTRL+C to quit)
