# Installing Dependencies

Stable Baselines3 is a reinforcement-learning (RL) library that lets you work with model-free algorithms. Stable Baselines3 runs on PyTorch (the earlier Stable Baselines library ran on TensorFlow).

In [1]:
!pip install stable-baselines3[extra]



In [1]:
import os  # operating-system library: makes it easier to define the paths for saving the model and writing logs
import gym  # OpenAI Gym: lets us build environments and work with pre-existing environments
from stable_baselines3 import PPO  # the RL algorithm used in this notebook
# Stable Baselines lets you vectorize your environments, i.e. train the agent on several environments at the same time, which boosts training speed.
from stable_baselines3.common.vec_env import DummyVecEnv  # not really vectorization: a wrapper around the env that makes it easier to work with Stable Baselines
from stable_baselines3.common.evaluation import evaluate_policy  # tests how a model is performing: average reward over a given number of episodes

In [3]:
from platform import python_version

# Show which Python version the kernel is running.
print(python_version())

3.7.4


# Load Environment

In [3]:
pip install pyglet

Collecting pyglet
  Using cached https://files.pythonhosted.org/packages/48/c2/5898d5cce5d5ce7e74b5a515f2d107a82f2c4d0d4505c0ca119cb34c6b01/pyglet-1.5.19-py3-none-any.whl
Installing collected packages: pyglet
Successfully installed pyglet-1.5.19
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Load the environment: look it up by its Gym id and instantiate it.
environment_name = 'CartPole-v0'
env = gym.make(environment_name)

In [9]:
# Baseline test: play a few episodes choosing every action at random.
episodes = 5  # number of episodes to play
for episode in range(1, episodes + 1):
    # reset() starts a new episode and returns the initial observation.
    # A trained agent would use observations to pick actions; this loop ignores them.
    state = env.reset()
    done = False  # becomes True once the environment reports the episode is over
    score = 0

    # Each action pushes the cart to the left or to the right.
    while True:
        env.render()  # draw the current state of the environment
        action = env.action_space.sample()  # random action, NOT informed by the observation
        # step() applies the action and returns the next observation, the reward
        # for this step, whether the episode has ended, and an info object.
        n_state, reward, done, info = env.step(action)
        score += reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))
# NOTE: env.close() is not called here, so the render window stays open.
    

Episode:1 Score:11.0
Episode:2 Score:27.0
Episode:3 Score:20.0
Episode:4 Score:15.0
Episode:5 Score:35.0


In [8]:
# reset() returns the initial observation of a new episode (an array of 4 numbers here).
env.reset() 

array([-0.02500592,  0.0207521 , -0.04510446, -0.01304501])

In [5]:
env.action_space  # Discrete(2): two possible actions, encoded as 0 or 1

Discrete(2)

In [6]:
# Draw one random action from the action space.
env.action_space.sample() 

1

There are two different spaces within any environment: the action space and the observation space

In [8]:
# Show the observation space: its lower bounds, upper bounds, shape and dtype.
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [9]:
env.observation_space.sample()  # also drawn at random, like action_space.sample()

array([ 1.9326792e+00,  1.3847042e+38,  3.3825469e-01, -2.5008390e+38],
      dtype=float32)

# Understanding the Environment

Remember, there are two parts to our environment: the action space and the observation space.

# What do the action space values mean?

Type = Discrete(2)
0: Push cart to the left, 1: push cart to the right

Reward is 1 for every step taken

In [13]:
env.action_space  # Discrete(2): the actions are the integers 0 and 1

Discrete(2)

In [14]:
env.action_space.sample()  # random draw: 0 (push cart left) or 1 (push cart right)

0

# What do these observation spaces values mean?

You can take a look at the openAI documentation to find out, but here they are: 

Type: Box(4)
Num 0: Cart Position, [-4.8, 4.8]
Num 1: Cart Velocity, (-Inf, Inf)
Num 2: Pole Angle, [-24 degrees, 24 degrees]
Num 3: Pole Angular Velocity, (-Inf, Inf)

In [11]:
# Box with 4 float32 entries; the output lists the lower bounds first, then the upper bounds.
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [15]:
env.observation_space.sample()  # random draw within the bounds of the observation space

array([-4.5754972e+00,  2.6312240e+38,  2.7803665e-01,  1.0333603e+38],
      dtype=float32)

# Training our agent for real 

In [2]:
# Location for the TensorBoard logs: good for monitoring and referencing how the model is performing.
# The path is relative to the folder you are working in: a folder called Training with a folder called Logs inside it.
# os.path.join only builds the path string; it does not create the folders itself.
log_path = os.path.join('Training', 'Logs')



In [3]:
log_path  # display the joined path; the separator depends on the operating system

'Training\\Logs'

In [7]:
# Instantiate the algorithm (PPO).
# DummyVecEnv takes a list of factory functions. Create the environment inside the
# factory rather than closing over the name `env`: the original `lambda: env` referred
# to the same variable that this statement rebinds to the wrapper, so it only worked
# if the factory was called before the assignment completed.
env = DummyVecEnv([lambda: gym.make(environment_name)])  # wrap the environment

# 'MlpPolicy': the policy is a neural network.
# verbose=1: log out results while training.
# tensorboard_log: directory that receives the TensorBoard logs.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


# You need to look at the documentation

In [10]:
# Train the agent for 20,000 timesteps; progress is logged because the model was created with verbose = 1.
model.learn(total_timesteps=20000) 

Logging to Training\Logs\PPO_2
-----------------------------
| time/              |      |
|    fps             | 2770 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1850         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0067054364 |
|    clip_fraction        | 0.0542       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.548       |
|    explained_variance   | 0.378        |
|    learning_rate        | 0.0003       |
|    loss                 | 85.2         |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.0053      |
|    value_loss           | 112          |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x181bb583e48>

# Save and Reload Model

In [11]:
# Path (relative to the working folder) used for saving and reloading the trained model.
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO_Model_Cartpole')

In [12]:
# Write the trained model to disk at PPO_Path.
model.save(PPO_Path)



In [13]:
# Remove the model from memory; the copy saved on disk is unaffected.
del model

In [16]:
# `model` was deleted in the cell above, so load it back from disk before training
# further; without this, a top-to-bottom run of the notebook fails here with a NameError.
model = PPO.load(PPO_Path, env=env)

# Continue training the reloaded model. The log reports 2048 timesteps rather than
# 1000, presumably because PPO always collects a full rollout (2048 steps here)
# before it checks the timestep budget.
model.learn(total_timesteps=1000)

Logging to Training\Logs\PPO_3
-----------------------------
| time/              |      |
|    fps             | 2784 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------


<stable_baselines3.ppo.ppo.PPO at 0x181bb63ca08>

In [15]:
# After deleting the model you can "recover" it by reloading the saved file back into memory.
# The environment is passed in so the loaded model is attached to it.
model = PPO.load(PPO_Path, env= env)

# Evaluation

Using PPO on this environment, we consider it solved if the agent gets, on average, a reward of 200 or higher (200 is the maximum per episode here).
Let's test our model to see how it's performing.

In [17]:
# Evaluate the model over 10 episodes, rendering them as they run.
# The result is a pair: (average reward, standard deviation of the reward).
# Look at the evaluate_policy documentation for the other options.
evaluate_policy(model, env, n_eval_episodes = 10, render = True)



(200.0, 0.0)

Check out the output: the first element is the average reward per episode and the second is the standard deviation of the reward across the evaluation episodes.


Reward for CartPole is calculated as 1 point for every step that the pole remains upright (with a max of 200 steps). The episode ends if the pole is more than 12 degrees from vertical or the cart moves more than 2.4 units from the center.

In [18]:
env.close()  # close the environment now that evaluation is finished

# Testing Agent

In [20]:
# Test the trained agent. This is the same loop as the random-action test earlier in
# the notebook, with one difference: each action now comes from model.predict(obs)
# instead of env.action_space.sample().
episodes = 5  # number of test episodes
try:
    for episode in range(1, episodes + 1):
        # reset() starts a new episode and returns the initial observation.
        obs = env.reset()
        done = False  # becomes truthy once the episode is over
        score = 0

        # Each action pushes the cart to the left or to the right.
        while not done:
            env.render()  # draw the current state of the environment
            # NOW USING THE MODEL: it picks the action from the current observation.
            # The second return value is not needed here.
            action, _states = model.predict(obs)
            # `env` is the DummyVecEnv wrapper at this point, so reward and done come
            # back as one-element arrays; that is why the score prints as e.g. [200.].
            obs, reward, done, info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always release the render window, even if the loop is interrupted.
    env.close()

Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]
