Importing Dependencies

In [1]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy 

Loading Environment

In [2]:
# Gym environment id; kept in one variable so that later gym.make() calls build the same task.
environment_name = "CartPole-v0"
# Plain (non-vectorised) environment instance built from that id.
env = gym.make(environment_name)

In [3]:
# Echo the environment id to confirm which task is loaded.
environment_name

'CartPole-v0'

In [4]:
# Sanity-check the environment by playing a few episodes with random actions.
episodes = 5  # number of episodes to play
# Use a separate loop variable: reusing `episodes` would overwrite the episode count.
for episode in range(1, episodes + 1):
    state = env.reset()  # start a new episode; returns the initial observation
    done = False
    score = 0
    while not done:
        env.render()  # show the graphical view of the environment; does not work in Google Colab
        action = env.action_space.sample()  # random action drawn from the Discrete(2) action space
        # step() returns the next observation, the reward for this step,
        # a boolean telling whether the episode has ended, and an info dict.
        n_state, reward, done, info = env.step(action)
        score += reward  # accumulate the total reward of the episode
    print('Episode:{},Score:{}'.format(episode,score))

Episode:1,Score:17.0
Episode:2,Score:32.0
Episode:3,Score:20.0
Episode:4,Score:16.0
Episode:5,Score:11.0


In [5]:
# The random rollout is finished, so release the environment.
env.close() # close the environment

Understanding Environment

Every environment exposes two kinds of spaces: an action space and an observation space.

In [6]:
# The set of actions the agent may take; the output below shows Discrete(2).
env.action_space

Discrete(2)

In [7]:
# The observations the agent receives; the output below is a Box of 4 float32 values
# with per-dimension lower and upper bounds.
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [8]:
# Draw one random action from the action space.
env.action_space.sample()

1

In [9]:
# Draw one random point from the observation space (sampled from the Box, not produced by env.step()).
env.observation_space.sample()

array([-1.6069075e+00,  3.2598491e+38,  1.5562764e-01,  6.3927645e+36],
      dtype=float32)

Training

There are two kinds of reinforcement learning algorithms: model-based and model-free.
Here, we are using a model-free RL algorithm.
Model-free - learns directly from interaction, choosing actions from the current state without building a model of the environment.
Model-based - learns or uses a model of the environment to predict future states and plan ahead.

In [10]:
# Define the log path where we are going to save our TensorBoard logs.
# Create the directories first if they do not already exist.
log_path = os.path.join('Training','Logs')

In [11]:
# Show the resulting log directory path.
log_path

'Training\\Logs'

In [12]:
# Instantiate the algorithm.
# DummyVecEnv takes a list of factory callables. Build the environment inside the
# factory rather than closing over the `env` name that this same statement rebinds:
# the old `lambda: env` only worked because the factory happens to be called immediately.
env = DummyVecEnv([lambda: gym.make(environment_name)])  # single-environment vectorised wrapper
# Define the agent: PPO with a multi-layer perceptron (MLP) policy.
# Training metrics are written as TensorBoard logs under log_path.
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)
# 'MlpPolicy' is the MLP (Multi Layer Perceptron) policy; a CNN policy is also supported
# and is the one intended for the later, image-based projects.
# NOTE(review): the original note also listed an LSTM policy as available in Stable
# Baselines - confirm which package/version provides it.

Using cpu device


In [13]:
# Train the agent; total_timesteps controls how long the model is trained.
# (PPO, DummyVecEnv and evaluate_policy are already imported in the first cell,
# so they are not re-imported here.)
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_5
-----------------------------
| time/              |      |
|    fps             | 2241 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1346        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007869525 |
|    clip_fraction        | 0.089       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | 0.00219     |
|    learning_rate        | 0.0003      |
|    loss                 | 7.78        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0133     |
|    value_loss           | 57.6        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x10ecbbd6c48>

Saving and Reloading the Model

In [14]:
# Location under which the trained PPO agent is saved and later reloaded.
PPO_path = os.path.join('Training','Saved Models','PPO_Model_Cartpole')

In [15]:
# Write the trained agent to disk at PPO_path.
model.save(PPO_path)

In [16]:
# Remove the in-memory agent so that the following reload genuinely comes from disk.
del model

In [17]:
# Reload the saved agent and attach it to the current environment.
model = PPO.load(PPO_path,env=env)

In [18]:
# Continue training the reloaded agent.
# Note: the log below reports total_timesteps of 2048 although 1000 were requested -
# presumably PPO always completes a whole rollout batch; verify against the PPO docs.
model.learn(total_timesteps=1000)

Logging to Training\Logs\PPO_6
-----------------------------
| time/              |      |
|    fps             | 1965 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------


<stable_baselines3.ppo.ppo.PPO at 0x10ecc56ef88>

Evaluation of RL Agent

In [19]:
# Here we evaluate our trained policy.
evaluate_policy(model,env,n_eval_episodes=10,render=True) # n_eval_episodes: how many episodes to evaluate
# The result below, (200.0, 0.0), is the average reward per episode followed by its standard deviation.
# The author treats an average score of 200 or higher as "solved" for this environment
# (NOTE(review): confirm the official solved threshold for CartPole-v0).
# Some environments have a fixed score at which they count as solved; others are continuous,
# where a higher score is simply better.
# Here: 200 average reward over the evaluated episodes,
# with a standard deviation of 0.



(200.0, 0.0)

In [20]:
# Evaluation is finished, so close the environment.
env.close()

Test Model

In [21]:
# Testing our model: let the trained agent choose the actions instead of sampling them randomly.
episodes = 5  # number of test episodes
# Use a separate loop variable: reusing `episodes` would overwrite the episode count.
for episode in range(1, episodes + 1):
    obs = env.reset()  # initial observation of the new episode
    done = False
    score = 0
    while not done:
        env.render()
        # Pass the current observation through the model to get its action.
        # predict() returns the action plus a state value that is not needed here, hence `_`.
        action, _ = model.predict(obs)
        # Take that action; reward is 1 for every step, including the termination step.
        obs, reward, done, info = env.step(action)
        # env is the vectorised wrapper, so reward arrives as an array and
        # the score prints as e.g. [200.] rather than 200.0.
        score += reward
    print('Episode:{},Score:{}'.format(episode,score))

Episode:1,Score:[200.]
Episode:2,Score:[200.]
Episode:3,Score:[200.]
Episode:4,Score:[200.]
Episode:5,Score:[200.]


In [22]:
# Close the environment after testing.
env.close() # Remark: the trained agent scores 200 in every episode, versus 11-32 for the random actions earlier

Viewing the TensorBoard Logs

In [23]:
# Path to one training run's TensorBoard logs, for a more detailed view of training.
# NOTE(review): 'PPO_1' is hard-coded, whereas the runs recorded in this notebook logged to
# PPO_5, PPO_6, ... - point this at the run you actually want to inspect.
training_log_path = os.path.join(log_path,'PPO_1')

In [24]:
# Show the resulting run directory.
training_log_path

'Training\\Logs\\PPO_1'

In [30]:
# Launch TensorBoard on the selected run; the ^C in the output shows it was stopped manually.
!tensorboard --logdir={training_log_path} 
# `!` is the Jupyter syntax for running a command-line command from a notebook cell.
# Alternatively, run the same command from a command prompt.

^C


Adding a Callback to the Training Stage

In [31]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
# EvalCallback - runs during the training stage.
# StopTrainingOnRewardThreshold - acts like a checker: once the model passes a certain reward threshold, training of the RL model is stopped.

In [32]:
# Directory handed to EvalCallback as the place to save the best model.
save_path = os.path.join('Training','Saved Models')

In [33]:
# Stop training once the reward threshold is passed.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200,verbose=1)
# Evaluate on a dedicated environment. Evaluating on the training environment would
# reset it in the middle of a rollout and disturb the data the agent is collecting.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])
eval_callback = EvalCallback(eval_env, # separate evaluation environment
                            callback_on_new_best=stop_callback, # every time a new best model is found,
                            # the stop callback is run
                            eval_freq=10000, # evaluate every 10000 steps
                            best_model_save_path =save_path, # where the best model is saved
                            verbose=1)
# The model is saved every time a new best model is found.
# Every 10000 steps the agent is evaluated; if it passes the threshold,
# training of the RL model is stopped.

In [34]:
# Create a new PPO model and train it with the evaluation / early-stopping callback attached.
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)
model.learn(total_timesteps=20000,callback=eval_callback)

Using cpu device
Logging to Training\Logs\PPO_8
-----------------------------
| time/              |      |
|    fps             | 2035 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1268        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007932501 |
|    clip_fraction        | 0.0992      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.000488    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.54        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0161     |
|    value_loss           | 49.7        |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x10ed30c4e08>

Change Policy

In [35]:
# Use a different neural-network architecture by changing the policy.
# This amounts to changing the number of layers and the number of units per layer.
net_arch = [dict(pi=[128,128,128,128],vf=[128,128,128,128])]
# `pi` defines the custom actor network: four layers of 128 units each.
# `vf` defines the value-function network: likewise four layers of 128 units each.
# NOTE(review): newer Stable-Baselines3 releases reportedly expect a plain dict here
# instead of a list containing a dict - confirm against the installed version.

In [36]:
# Build a PPO agent whose policy uses the custom architecture, passed through policy_kwargs.
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path,policy_kwargs={'net_arch':net_arch})

Using cpu device


In [37]:
# Train the custom-architecture agent with the same callback object as the previous PPO run.
# NOTE(review): eval_callback is reused from the earlier training run - it presumably keeps
# state such as its best score from that run; consider creating a fresh callback. Verify.
model.learn(total_timesteps=20000,callback=eval_callback)

Logging to Training\Logs\PPO_9
-----------------------------
| time/              |      |
|    fps             | 1106 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 668         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014690958 |
|    clip_fraction        | 0.237       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | -0.0109     |
|    learning_rate        | 0.0003      |
|    loss                 | 3.54        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0278     |
|    value_loss           | 18.5        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x10ed2ff3a08>

Using an Alternate Algorithm

In [38]:
# Here we switch to the DQN algorithm, keeping the same policy type, environment and log directory.
from stable_baselines3 import DQN
model = DQN('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

Using cpu device


In [39]:
# Train the DQN agent for 20000 timesteps.
model.learn(total_timesteps=20000)

Logging to Training\Logs\DQN_2
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.95     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6307     |
|    time_elapsed     | 0        |
|    total_timesteps  | 105      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.886    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 7489     |
|    time_elapsed     | 0        |
|    total_timesteps  | 239      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.835    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 7454     |
|    time_elapsed     | 0        |
|    total_timesteps  | 347      |
----------------------------------
------------------------

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 108      |
|    fps              | 6667     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2290     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 112      |
|    fps              | 6562     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2374     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 116      |
|    fps              | 6555     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2466     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 216      |
|    fps              | 6498     |
|    time_elapsed     | 0        |
|    total_timesteps  | 4771     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 220      |
|    fps              | 6514     |
|    time_elapsed     | 0        |
|    total_timesteps  | 4854     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 224      |
|    fps              | 6525     |
|    time_elapsed     | 0        |
|    total_timesteps  | 4941     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 324      |
|    fps              | 6891     |
|    time_elapsed     | 1        |
|    total_timesteps  | 7203     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 328      |
|    fps              | 6916     |
|    time_elapsed     | 1        |
|    total_timesteps  | 7306     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 332      |
|    fps              | 6966     |
|    time_elapsed     | 1        |
|    total_timesteps  | 7477     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 432      |
|    fps              | 7237     |
|    time_elapsed     | 1        |
|    total_timesteps  | 9694     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 436      |
|    fps              | 7251     |
|    time_elapsed     | 1        |
|    total_timesteps  | 9786     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 440      |
|    fps              | 7277     |
|    time_elapsed     | 1        |
|    total_timesteps  | 9928     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 540      |
|    fps              | 7050     |
|    time_elapsed     | 1        |
|    total_timesteps  | 12111    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 544      |
|    fps              | 7056     |
|    time_elapsed     | 1        |
|    total_timesteps  | 12196    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 548      |
|    fps              | 7076     |
|    time_elapsed     | 1        |
|    total_timesteps  | 12330    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 648      |
|    fps              | 7249     |
|    time_elapsed     | 2        |
|    total_timesteps  | 14712    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 652      |
|    fps              | 7246     |
|    time_elapsed     | 2        |
|    total_timesteps  | 14804    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 656      |
|    fps              | 7250     |
|    time_elapsed     | 2        |
|    total_timesteps  | 14890    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 756      |
|    fps              | 7228     |
|    time_elapsed     | 2        |
|    total_timesteps  | 17189    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 760      |
|    fps              | 7232     |
|    time_elapsed     | 2        |
|    total_timesteps  | 17293    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 764      |
|    fps              | 7237     |
|    time_elapsed     | 2        |
|    total_timesteps  | 17451    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 864      |
|    fps              | 7195     |
|    time_elapsed     | 2        |
|    total_timesteps  | 19581    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 868      |
|    fps              | 7198     |
|    time_elapsed     | 2        |
|    total_timesteps  | 19653    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 872      |
|    fps              | 7196     |
|    time_elapsed     | 2        |
|    total_timesteps  | 19711    |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x10ed30e4388>

In [40]:
model.save(PPO_path)

In [41]:
m = DQN.load(PPO_path,env=env)