<a href="https://colab.research.google.com/github/shubham3796/ML_Introductory/blob/master/working_ppo_custom_env.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Stable Baselines only supports tensorflow 1.x for now
%tensorflow_version 1.x
!pip install stable-baselines[mpi]==2.10.2
!pip install stable_baselines3
import stable_baselines3

import numpy as np
import gym
from gym import spaces
from stable_baselines.common.env_checker import check_env
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

class GoLeftEnv(gym.Env):
  """
  Custom Environment that follows gym interface.
  This is a simple env where the agent must learn to go always left. 
  """
  # Because of google colab, we cannot implement the GUI ('human' render mode)
  metadata = {'render.modes': ['console']}
  # Define constants for clearer code
  LEFT = 0
  RIGHT = 1

  def __init__(self, grid_size=10):
    super(GoLeftEnv, self).__init__()

    # Size of the 1D-grid
    self.grid_size = grid_size
    # Initialize the agent at the right of the grid
    self.agent_pos = grid_size - 1

    # Define action and observation space
    # They must be gym.spaces objects
    # Example when using discrete actions, we have two: left and right
    n_actions = 2
    self.action_space = spaces.Discrete(n_actions)
    # The observation will be the coordinate of the agent
    # this can be described both by Discrete and Box space
    self.observation_space = spaces.Box(low=0, high=self.grid_size,
                                        shape=(1,), dtype=np.float32)

  def reset(self):
    """
    Important: the observation must be a numpy array
    :return: (np.array) 
    """
    # Initialize the agent at the right of the grid
    self.agent_pos = self.grid_size - 1
    # here we convert to float32 to make it more general (in case we want to use continuous actions)
    return np.array([self.agent_pos]).astype(np.float32)

  def step(self, action):
    if action == self.LEFT:
      self.agent_pos -= 1
    elif action == self.RIGHT:
      self.agent_pos += 1
    else:
      raise ValueError("Received invalid action={} which is not part of the action space".format(action))

    # Account for the boundaries of the grid
    self.agent_pos = np.clip(self.agent_pos, 0, self.grid_size)

    # Are we at the left of the grid?
    done = bool(self.agent_pos == 0)

    # Null reward everywhere except when reaching the goal (left of the grid)
    reward = 1 if self.agent_pos == 0 else 0

    # Optionally we can pass additional info, we are not using that for now
    info = {}

    return np.array([self.agent_pos]).astype(np.float32), reward, done, info

  def render(self, mode='console'):
    if mode != 'console':
      raise NotImplementedError()
    # agent is represented as a cross, rest as a dot
    print("." * self.agent_pos, end="")
    print("x", end="")
    print("." * (self.grid_size - self.agent_pos))

  def close(self):
    pass






# Validate the environment
env = GoLeftEnv()
# If the environment don't follow the interface, an error will be thrown
check_env(env, warn=True)



In [2]:
# Test the environment

env = GoLeftEnv(grid_size=10)

obs = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

GO_LEFT = 0
# Hardcoded best agent: always go left!
n_steps = 20
for step in range(n_steps):
  print("Step {}".format(step + 1))
  obs, reward, done, info = env.step(GO_LEFT)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  if done:
    print("Goal reached!", "reward=", reward)
    break

.........x.
Box(0.0, 10.0, (1,), float32)
Discrete(2)
0
Step 1
obs= [8.] reward= 0 done= False
........x..
Step 2
obs= [7.] reward= 0 done= False
.......x...
Step 3
obs= [6.] reward= 0 done= False
......x....
Step 4
obs= [5.] reward= 0 done= False
.....x.....
Step 5
obs= [4.] reward= 0 done= False
....x......
Step 6
obs= [3.] reward= 0 done= False
...x.......
Step 7
obs= [2.] reward= 0 done= False
..x........
Step 8
obs= [1.] reward= 0 done= False
.x.........
Step 9
obs= [0.] reward= 1 done= True
x..........
Goal reached! reward= 1


In [3]:
#Once your environment follow the gym interface, it is quite easy to plug in any algorithm from stable-baselines
from stable_baselines import PPO2
from stable_baselines.common.cmd_util import make_vec_env

# Instantiate the env
env = GoLeftEnv(grid_size=10)
# wrap it
env = make_vec_env(lambda: env, n_envs=1)

In [20]:
#Train the agent
#model = ACKTR('MlpPolicy', env, verbose=1).learn(5000)
model = PPO("MlpPolicy", env, verbose=1)
n_steps = 25000
model.learn(total_timesteps=n_steps)
model.save("GoLeftEnv")

del model # remove to demonstrate saving and loading

model = PPO.load("GoLeftEnv")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 106      |
|    ep_rew_mean     | 1        |
| time/              |          |
|    fps             | 1573     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 59.1        |
|    ep_rew_mean          | 1           |
| time/                   |             |
|    fps                  | 1146        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.017297283 |
|    clip_fraction        | 0.229       |
|    clip_range           | 0.2         |
|    entropy_loss   

In [21]:
# Test the trained agent
obs = env.reset()

for step in range(n_steps):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
.x.........
x..........
x..........
x..........
x..........
.x.........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........
x..........