In [1]:
!pip install gym
import gym

Collecting gym
[?25l  Downloading https://files.pythonhosted.org/packages/9b/50/ed4a03d2be47ffd043be2ee514f329ce45d98a30fe2d1b9c61dea5a9d861/gym-0.10.5.tar.gz (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 7.3MB/s 
Collecting pyglet>=1.2.0 (from gym)
[?25l  Downloading https://files.pythonhosted.org/packages/1c/fc/dad5eaaab68f0c21e2f906a94ddb98175662cc5a654eee404d59554ce0fa/pyglet-1.3.2-py2.py3-none-any.whl (1.0MB)
[K    100% |████████████████████████████████| 1.0MB 10.3MB/s 
Building wheels for collected packages: gym
  Running setup.py bdist_wheel for gym ... [?25l- \ | / done
[?25h  Stored in directory: /content/.cache/pip/wheels/cb/14/71/f4ab006b1e6ff75c2b54985c2f98d0644fffe9c1dddc670925
Successfully built gym
Installing collected packages: pyglet, gym
Successfully installed gym-0.10.5 pyglet-1.3.2
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [2]:
import numpy as np
import random
import tensorflow as tf
# tf.enable_eager_execution()

seed_val = 111
np.random.seed(seed_val)
random.seed(seed_val+1)
tf.set_random_seed(seed_val+2)
env.seed(seed_val+3)

[114]

In [4]:
import multiprocessing as mp
import os
os.system("taskset -p 0xff %d" % os.getpid())

0

In [0]:
def preprocess(img):
    return np.reshape(img, (1,4))

def discounted_rewards(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1],dones[::-1]): #revert the rewards to start discounting from behind
        r = reward + gamma*r*(1.-done) #return = (reward) if done else (reward + gamma*return_tplus1)
        discounted.append(r)
    return discounted[::-1] #revert the discounts to normal order

In [0]:
def worker(remote, env_fun):
    env = env_fun
    while True:
        cmd, data = remote.recv()
        if cmd == 'get_spaces':
            remote.send((env.action_space, env.observation_space))
        elif cmd == 'step':
            ob, rew, done, info = env.step(data)
            if done:
                ob = env.reset()
            remote.send((ob, rew, done))
        elif cmd == 'reset':
            ob = env.reset()
            remote.send(ob)
        else:
            raise NotImplementedError

class SubProcVecEnv():
    """
    :param: env_funcs - list of agent environment functions
    """
    def __init__(self, env_funcs):
        self.nenvs = len(env_funcs)
        self.remotes, self.work_remotes = zip(*[mp.Pipe() for _ in range(self.nenvs)])
        self.ps = [mp.Process(target=worker, args=(work_remote, env_fn))
                   for (work_remote, env_fn) in zip(self.work_remotes, env_funcs)]
        for p in self.ps:
            p.start()
            
        self.remotes[0].send(('get_spaces', None)) # Ask about the env space details
        self.action_space, self.observation_space = self.remotes[0].recv()
        
    def step(self, actions):
        assert len(actions) == len(self.remotes)
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
            
        results = [remote.recv() for remote in self.remotes]
        obs, rewards, dones = zip(*results)
        return np.stack(obs), np.stack(rewards), np.stack(dones)
    
    def reset(self):
        for remote in self.remotes:
            remote.send(('reset', None))
            
        obs = [remote.recv() for remote in self.remotes]
        
        return np.stack(obs)

In [0]:
class A2C():
    def __init__(self, num_envs):
        ## setup model
        ## setup environment
        self.nenv = num_envs
        self.env = self.makeallenvironments(num_envs)
        self.env.reset()
    
    @staticmethod
    def makeallenvironments(num_envs=4):
        return SubProcVecEnv([gym.make('CartPole-v0') for _ in range(num_envs)])

In [0]:
class Policy():
    def __init__(self, sess, obs_space, ac_space, nbatch, nsteps=1, reuse=False):
        self.sess = sess
        input_shape = (nbatch,) + obs_space.shape
        num_actions = agent.env.action_space.n
        
        self.X_input = tf.placeholder(tf.float32, input_shape, name="Ob") #input observation state
        
        with tf.variable_scope("model", reuse=reuse):
            h = self._build_model(self.X_input)
            
            self.policy_logits = tf.layers.dense(h, units=num_actions, name="policylogits")
            self.value_fn = tf.layers.dense(h, units=1, name="valuefn")
            action = tf.squeeze(tf.multinomial(logits=self.policy_logits, num_samples = 1))
            
        def select_action(obs):
            a, vf = self.sess.run((action, self.value_fn), {self.X_input: obs})
            return a, vf
        
        def value(obs):
            v = self.sess.run(self.value_fn, {self.X_input: obs})
            return v
            
        self.select_action = select_action
        self.value = value
    
    def _build_model(self, X_input):
        h1 = tf.layers.dense(X_input, units=100, activation=tf.nn.relu, name="layer1")
        h2 = tf.layers.dense(X_input, units=10, activation=tf.nn.relu, name="layer2")
        return h2


In [0]:
class Runner:
    def __init__(self, env, policy_model, nsteps=5, gamma=0.99):
        self.env = env
        self.model = policy_model
        self.gamma = gamma
        nenvs = env.nenvs
        self.batch_ob_shape = (nenvs * nsteps,) + env.observation_space.shape
        self.nsteps = nsteps
        self.initial_state = None
        self.states = self.initial_state
        # Init env variables
        self.obs = np.zeros((nenvs,) + env.observation_space.shape, dtype=np.float32)
        self.dones = [False for _ in range(nenvs)]
    
    def run(self):
        mb_states = self.states
        mb_obs, mb_actions, mb_rewards, mb_values, mb_dones = [], [], [], [], []
        for n in range(self.nsteps):
            # get the actions to take a step
            actions, values = self.model.select_action(self.obs)
            mb_obs.append(np.copy(self.obs)) #start states
            mb_actions.append(actions) #actions
            mb_values.append(values) #Value of the states
            mb_dones.append(self.dones) #Done status of the episode before taking action
            
            # take a step
            obs, rewards, dones = self.env.step(actions)
            self.dones = dones
            mb_rewards.append(rewards)
            
            # check which agents returned with done=True
            for n, done in enumerate(dones):
                if done:
                    obs[n] = obs[n] * 0 # Reset obs to zeros if action led to completion of episode
            self.obs = obs
        
        mb_dones.append(self.dones) #last done stores whether episode ended after taking the last action
        mb_obs = np.asarray(mb_obs).swapaxes(1,0).reshape(self.batch_ob_shape) #first n rows are first obs of n agents and so on..
        mb_rewards = np.asarray(mb_rewards).swapaxes(1,0)
        mb_actions = np.asarray(mb_actions, dtype=np.uint8).swapaxes(1,0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1,0)
        mb_values = np.asarray(mb_values).swapaxes(1,0)
        mb_masks = mb_dones[:,:-1] # Stores done values of episode before the action is taken
        mb_dones = mb_dones[:,1:] # Stores done values of episode after the action is taken
        last_values = self.model.value(self.obs).tolist()
        
        # calculate returns for each agent
        for n, (rewards, done, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist() #helps in appending list later on
            dones = done.tolist()
            if dones[-1] == 0: #if last action led to end of episode, use last_value to estimate return
                rewards = discounted_rewards(rewards + value, dones + [False], self.gamma)[:-1] #removing the return corresponding to value as it was just needed to calculate other returns
            else: #rollout complete
                rewards = discounted_rewards(rewards, dones, self.gamma)
            mb_rewards[n] = rewards #mb_rewards now stores discounted returns rather than just rewards
        
        mb_actions = mb_actions.flatten()
        mb_rewards = mb_rewards.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        
        return mb_obs, mb_actions, mb_rewards, mb_values, mb_masks

In [0]:
class Model:
    def __init__(self, sess, obs_space, ac_space, nenvs=3, nsteps=5):
        nbatch = nenvs * nsteps # batch size. nenvs: no. of agents, nsteps: no. of steps to be taken by an agent
        A = tf.placeholder(tf.int32, [nbatch])# actions
        ADV = tf.placeholder(tf.float32, [nbatch]) # advantage
        R = tf.placeholder(tf.float32, [nbatch]) # returns
#         LR = # learning rate
        
        step_model = Policy(sess, obs_space, ac_space, nenvs, nsteps=1, reuse=False)
        train_model = Policy(sess, obs_space, ac_space, nbatch=nenvs*nsteps, nsteps=nsteps, reuse=True)
        
        # get loss
#         A_onehot = tf.one_hot(A, depth=2)
        neglogp_ac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy_logits, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogp_ac)
        vf_loss = tf.reduce_mean(tf.squared_difference(tf.squeeze(train_model.value_fn), R))
#         entropy = tf.reduce_mean(tf.softmax())
        loss = pg_loss + vf_loss * 0.5
        
        # get grads
        params = tf.trainable_variables(scope="model")
        grads = tf.gradients(loss, params)
        grads, grad_norm = tf.clip_by_global_norm(grads, clip_norm=0.5)
        grads_and_vars = list(zip(grads, params))
        # apply grads
        trainer = tf.train.RMSPropOptimizer(learning_rate=0.001)
        _train = trainer.apply_gradients(grads_and_vars)
        # update learning rate
        
        def train(obs, actions, returns, values, masks):
            advs = returns - values
            td_map = {A: actions, train_model.X_input: obs, ADV: advs, R: returns}
            policy_loss, value_loss, _ = sess.run([pg_loss, vf_loss, _train], td_map)
            return policy_loss, value_loss
        
        self.train = train
        self.select_action = step_model.select_action #Give access to Policy class methods so that Runner can use them
        self.value = step_model.value 
        self.step_model = step_model
        self.train_model = train_model

# Initialize Session and Classes

In [25]:
tf.reset_default_graph() 
sess = tf.Session()
agent = A2C(num_envs=3)
model = Model(sess, nenvs=agent.nenv, nsteps=5, obs_space=agent.env.observation_space, ac_space=agent.env.action_space)
init = tf.global_variables_initializer()
sess.run(init)
runner = Runner(env=agent.env, policy_model=model, nsteps=5)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


# Train

In [26]:
for update in range(int(1e4)):
    mb_obs, mb_actions, mb_returns, mb_values, mb_masks = runner.run()
    policy_loss, value_loss = model.train(mb_obs, mb_actions, mb_returns, mb_values, mb_masks)
    if update % 100 == 0:
        print(update, policy_loss, value_loss, max(mb_returns))

0 2.0814865 10.76047 4.846010781352588
100 2.2015047 10.429535 4.952228981825924
200 1.7732319 8.841066 5.269997582357253
300 1.4925781 6.050584 5.746770421736947
400 2.126329 10.914888 6.14216848078202
500 1.7906512 8.629291 6.707398127504541
600 1.3612577 7.3886385 6.654279523937549
700 1.7631238 10.254335 7.343393514855129
800 2.1942298 10.100694 8.160577580737689
900 1.1893611 7.4110503 8.356927828183547
1000 1.3835261 11.233779 9.903813710716552
1100 1.7351397 9.980422 10.275102835718943
1200 0.33484414 10.652118 10.392599864951679
1300 1.7845998 8.589515 11.311508457208038
1400 1.541416 8.521596 11.926182147846248
1500 -0.15182273 12.427348 12.876987636517347
1600 -1.0193843 19.525433 13.343023331652
1700 1.9802113 8.640136 15.618260604939469
1800 1.6983973 7.971852 16.247355012138456
1900 1.6486589 7.529094 17.08967160133626
2000 1.6100909 7.750879 18.184345515472668
2100 1.6157478 7.650494 18.708125748415867
2200 1.9313197 9.985378 19.84891457560318
2300 -0.6559055 46.526306 20

6800 0.6477199 1.371029 61.60565953190271
6900 0.752258 1.5933479 62.68654440955661
7000 0.6369572 1.4801755 63.6614303771915
7100 0.6401216 1.0358013 65.3443966094509
7200 0.61413115 1.2463948 66.40852858774245
7300 0.46533698 0.74049455 67.55147682665464
7400 0.5215283 1.2279035 69.64236055910843
7500 0.46475324 0.7742249 70.52580210616593
7600 0.4065706 0.59356594 71.5878227402763
7700 0.52943254 0.8862467 72.95894851580427
7800 0.4139941 0.5564574 74.47543054331359
7900 0.45541364 0.75134075 75.79546324075167
8000 0.46109068 0.6137286 77.4727266670783
8100 -9.184757 1505.3105 79.04075161233119
8200 0.34591994 0.4018933 80.12651942687155
8300 0.27118397 0.3255874 81.35517663077603
8400 0.41612715 0.5381705 82.8743069078597
8500 -13.64532 1223.5844 85.84861767641445
8600 0.3028769 0.36651742 87.54741536229385
8700 0.2713418 0.21657147 88.23724447128998
8800 -2.5510657 1023.48175 89.29754399209595
8900 0.09270546 0.05999767 91.89314212550799
9000 0.111214854 0.048658468 93.20882153597

# List of Trainable Variables in the Model

In [22]:
tf.trainable_variables()

[<tf.Variable 'model/layer1/kernel:0' shape=(4, 100) dtype=float32_ref>,
 <tf.Variable 'model/layer1/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'model/layer2/kernel:0' shape=(4, 10) dtype=float32_ref>,
 <tf.Variable 'model/layer2/bias:0' shape=(10,) dtype=float32_ref>,
 <tf.Variable 'model/policylogits/kernel:0' shape=(10, 2) dtype=float32_ref>,
 <tf.Variable 'model/policylogits/bias:0' shape=(2,) dtype=float32_ref>,
 <tf.Variable 'model/valuefn/kernel:0' shape=(10, 1) dtype=float32_ref>,
 <tf.Variable 'model/valuefn/bias:0' shape=(1,) dtype=float32_ref>]

# Test

In [27]:
cur_state = agent.env.reset()
rewards_run = []
for i in range(20):
    total=0
    while True:
        ac, val = model.select_action(cur_state)
        next_state, rewards, dones = agent.env.step(ac)
        total+=rewards[0]
        cur_state = next_state
        if dones[0]==True:
            break
    print(total)
    rewards_run.append(total)
np.mean(rewards_run)

197.0
200.0
198.0
181.0
134.0
200.0
182.0
200.0
188.0
175.0
136.0
197.0
170.0
200.0
152.0
200.0
200.0
122.0
181.0
191.0


180.2