In [1]:
import os
import sys
new_path = r'/usr/local/lib/python2.7/dist-packages'
sys.path.append(new_path)

import gym
env = gym.make('CartPole-v0')

In [2]:
import numpy as np
import random
import tensorflow as tf
# tf.enable_eager_execution()

seed_val = 111
np.random.seed(seed_val)
random.seed(seed_val+1)
tf.set_random_seed(seed_val+2)
env.seed(seed_val+3)

  from ._conv import register_converters as _register_converters


[114]

In [3]:
import multiprocessing as mp
os.system("taskset -p 0xff %d" % os.getpid())

0

In [4]:
def preprocess(img):
    return np.reshape(img, (1,4))

def discounted_rewards(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1],dones[::-1]): #revert the rewards to start discounting from behind
        r = reward + gamma*r*(1.-done) #return = (reward) if done else (reward + gamma*return_tplus1)
        discounted.append(r)
    return discounted[::-1] #revert the discounts to normal order

In [5]:
def worker(remote, env_fun):
    env = env_fun
    while True:
        cmd, data = remote.recv()
        if cmd == 'get_spaces':
            remote.send((env.action_space, env.observation_space))
        elif cmd == 'step':
            ob, rew, done, info = env.step(data)
            if done:
                ob = env.reset()
            remote.send((ob, rew, done))
        elif cmd == 'reset':
            ob = env.reset()
            remote.send(ob)
        else:
            raise NotImplementedError

class SubProcVecEnv():
    """
    :param: env_funcs - list of agent environment functions
    """
    def __init__(self, env_funcs):
        self.nenvs = len(env_funcs)
        self.remotes, self.work_remotes = zip(*[mp.Pipe() for _ in range(self.nenvs)])
        self.ps = [mp.Process(target=worker, args=(work_remote, env_fn))
                   for (work_remote, env_fn) in zip(self.work_remotes, env_funcs)]
        for p in self.ps:
            p.start()
            
        self.remotes[0].send(('get_spaces', None)) # Ask about the env space details
        self.action_space, self.observation_space = self.remotes[0].recv()
        
    def step(self, actions):
        assert len(actions) == len(self.remotes)
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
            
        results = [remote.recv() for remote in self.remotes]
        obs, rewards, dones = zip(*results)
        return np.stack(obs), np.stack(rewards), np.stack(dones)
    
    def reset(self):
        for remote in self.remotes:
            remote.send(('reset', None))
            
        obs = [remote.recv() for remote in self.remotes]
        
        return np.stack(obs)

In [6]:
class A2C():
    def __init__(self, num_envs):
        ## setup model
        ## setup environment
        self.nenv = num_envs
        self.env = self.makeallenvironments(num_envs)
        self.env.reset()
    
    @staticmethod
    def makeallenvironments(num_envs=4):
        return SubProcVecEnv([gym.make('CartPole-v0') for _ in range(num_envs)])

In [7]:
class Policy():
    def __init__(self, sess, obs_space, ac_space, nbatch, nsteps=1, reuse=False):
        self.sess = sess
        input_shape = (nbatch,) + obs_space.shape
        num_actions = agent.env.action_space.n
        
        self.X_input = tf.placeholder(tf.float32, input_shape, name="Ob") #input observation state
        
        with tf.variable_scope("model", reuse=reuse):
            h = self._build_model(self.X_input)
            
            self.policy_logits = tf.layers.dense(h, units=num_actions, name="policylogits")
            self.value_fn = tf.layers.dense(h, units=1, name="valuefn")
            action = tf.squeeze(tf.multinomial(logits=self.policy_logits, num_samples = 1))
            
        def select_action(obs):
            a, vf = self.sess.run((action, self.value_fn), {self.X_input: obs})
            return a, vf
        
        def value(obs):
            v = self.sess.run(self.value_fn, {self.X_input: obs})
            return v
            
        self.select_action = select_action
        self.value = value
    
    def _build_model(self, X_input):
        h1 = tf.layers.dense(X_input, units=50, activation=tf.nn.relu, name="layer1")
        h2 = tf.layers.dense(X_input, units=50, activation=tf.nn.relu, name="layer2")
        return h2


In [8]:
class Runner:
    def __init__(self, env, policy_model, nsteps=5, gamma=0.99):
        self.env = env
        self.model = policy_model
        self.gamma = gamma
        nenvs = env.nenvs
        self.batch_ob_shape = (nenvs * nsteps,) + env.observation_space.shape
        self.nsteps = nsteps
        self.initial_state = None
        self.states = self.initial_state
        # Init env variables
        self.obs = np.zeros((nenvs,) + env.observation_space.shape, dtype=np.float32)
        self.dones = [False for _ in range(nenvs)]
    
    def run(self):
        mb_states = self.states
        mb_obs, mb_actions, mb_rewards, mb_values, mb_dones = [], [], [], [], []
        for n in range(self.nsteps):
            # get the actions to take a step
            actions, values = self.model.select_action(self.obs)
            mb_obs.append(np.copy(self.obs)) #start states
            mb_actions.append(actions) #actions
            mb_values.append(values) #Value of the states
            mb_dones.append(self.dones) #Done status of the episode before taking action
            
            # take a step
            obs, rewards, dones = self.env.step(actions)
            self.dones = dones
            mb_rewards.append(rewards)
            
            # check which agents returned with done=True
            for n, done in enumerate(dones):
                if done:
                    obs[n] = obs[n] * 0 # Reset obs to zeros if action led to completion of episode
            self.obs = obs
        
        mb_dones.append(self.dones) #last done stores whether episode ended after taking the last action
        mb_obs = np.asarray(mb_obs).swapaxes(1,0).reshape(self.batch_ob_shape) #first n rows are first obs of n agents and so on..
        mb_rewards = np.asarray(mb_rewards).swapaxes(1,0)
        mb_actions = np.asarray(mb_actions, dtype=np.uint8).swapaxes(1,0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1,0)
        mb_values = np.asarray(mb_values).swapaxes(1,0)
        mb_masks = mb_dones[:,:-1] # Stores done values of episode before the action is taken
        mb_dones = mb_dones[:,1:] # Stores done values of episode after the action is taken
        last_values = self.model.value(self.obs).tolist()
        
        # calculate returns for each agent
        for n, (rewards, done, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist() #helps in appending list later on
            dones = done.tolist()
            if dones[-1] == 0: #if last action led to end of episode, use last_value to estimate return
                rewards = discounted_rewards(rewards + value, dones + [False], self.gamma)[:-1] #removing the return corresponding to value as it was just needed to calculate other returns
            else: #rollout complete
                rewards = discounted_rewards(rewards, dones, self.gamma)
            mb_rewards[n] = rewards #mb_rewards now stores discounted returns rather than just rewards
        
        mb_actions = mb_actions.flatten()
        mb_rewards = mb_rewards.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        
        return mb_obs, mb_actions, mb_rewards, mb_values, mb_masks

In [9]:
class Model:
    def __init__(self, sess, obs_space, ac_space, nenvs=3, nsteps=5, env=env):
        nbatch = nenvs * nsteps # batch size. nenvs: no. of agents, nsteps: no. of steps to be taken by an agent
        A = tf.placeholder(tf.int32, [nbatch])# actions
        ADV = tf.placeholder(tf.float32, [nbatch]) # advantage
        R = tf.placeholder(tf.float32, [nbatch]) # returns
#         LR = # learning rate
        
        step_model = Policy(sess, obs_space, ac_space, nenvs, nsteps=1, reuse=False)
        train_model = Policy(sess, obs_space, ac_space, nbatch=nenvs*nsteps, nsteps=nsteps, reuse=True)
        
        # get loss
#         A_onehot = tf.one_hot(A, depth=2)
        neglogp_ac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy_logits, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogp_ac)
        vf_loss = tf.reduce_mean(tf.squared_difference(tf.squeeze(train_model.value_fn), R))
#         entropy = tf.reduce_mean(tf.softmax())
        loss = pg_loss + vf_loss * 0.5
        
        # get grads
        params = tf.trainable_variables(scope="model")
        grads = tf.gradients(loss, params)
        grads, grad_norm = tf.clip_by_global_norm(grads, clip_norm=0.5)
        grads_and_vars = list(zip(grads, params))
        # apply grads
        trainer = tf.train.RMSPropOptimizer(learning_rate=0.001)
        _train = trainer.apply_gradients(grads_and_vars)
        # update learning rate
        
        def train(obs, actions, returns, values, masks):
            advs = returns - values
            td_map = {A: actions, train_model.X_input: obs, ADV: advs, R: returns}
            policy_loss, value_loss, _ = sess.run([pg_loss, vf_loss, _train], td_map)
            return policy_loss, value_loss
        
        def test():
            cur_state = env.reset()
            rewards_run = []
            for i in range(20):
                total=0
                while True:
                    ac, val = self.select_action(cur_state)
                    next_state, rewards, dones = env.step(ac)
                    total+=rewards[0]
                    cur_state = next_state
                    if dones[0]==True:
                        break
            #     print(total)
                rewards_run.append(total)
            return np.mean(rewards_run)
        
        self.train = train
        self.test = test
        self.select_action = step_model.select_action #Give access to Policy class methods so that Runner can use them
        self.value = step_model.value 
        self.step_model = step_model
        self.train_model = train_model

# Initialize Session and Classes

In [10]:
tf.reset_default_graph() 
sess = tf.Session()
agent = A2C(num_envs=4)
model = Model(sess, nenvs=agent.nenv, nsteps=5, obs_space=agent.env.observation_space, ac_space=agent.env.action_space, env=agent.env)
init = tf.global_variables_initializer()
sess.run(init)
runner = Runner(env=agent.env, policy_model=model, nsteps=5)

# Train

In [11]:
for update in range(1*int(1e4)):
    mb_obs, mb_actions, mb_returns, mb_values, mb_masks = runner.run()
    policy_loss, value_loss = model.train(mb_obs, mb_actions, mb_returns, mb_values, mb_masks)
    if update % 100 == 0:
        print(update, policy_loss, value_loss, model.test())

0 2.0486693 10.851652 24.4
100 2.019149 10.476603 19.45
200 1.4260519 7.6669946 24.9
300 1.964711 10.820755 23.45
400 1.734512 14.58198 19.7
500 1.8278265 9.542002 18.55
600 1.4158782 12.327335 13.05
700 -0.54426223 28.9071 20.05
800 -0.21735148 36.524704 19.4
900 1.7283449 7.9389467 23.9
1000 1.7048286 7.861163 30.3
1100 -0.49795842 41.931664 33.0
1200 1.5832342 7.882852 47.85
1300 1.4206033 7.0707293 30.55
1400 1.3592732 5.885133 39.6
1500 -0.5926124 66.804535 31.75
1600 -2.1639009 139.0397 30.85
1700 0.51164323 43.425964 34.45
1800 -0.82179654 138.67229 36.35
1900 -1.8122896 151.7062 28.25
2000 1.3380375 4.597879 43.8
2100 0.19510005 124.89845 57.35
2200 1.3803068 5.2232943 40.2
2300 1.004555 3.0246568 28.2
2400 1.0422978 2.9466472 47.2
2500 1.068466 2.8858752 43.95
2600 0.9632722 2.6428998 46.35
2700 0.6283408 1.5167764 42.1
2800 0.78928244 1.9304583 40.75
2900 0.71027195 2.1154985 62.3
3000 0.55342114 1.0682518 50.0
3100 0.501516 1.189613 66.4
3200 0.43467325 0.5889653 76.25
3300 

# List of Trainable Variables in the Model

In [12]:
tf.trainable_variables()

[<tf.Variable 'model/layer1/kernel:0' shape=(4, 50) dtype=float32_ref>,
 <tf.Variable 'model/layer1/bias:0' shape=(50,) dtype=float32_ref>,
 <tf.Variable 'model/layer2/kernel:0' shape=(4, 50) dtype=float32_ref>,
 <tf.Variable 'model/layer2/bias:0' shape=(50,) dtype=float32_ref>,
 <tf.Variable 'model/policylogits/kernel:0' shape=(50, 2) dtype=float32_ref>,
 <tf.Variable 'model/policylogits/bias:0' shape=(2,) dtype=float32_ref>,
 <tf.Variable 'model/valuefn/kernel:0' shape=(50, 1) dtype=float32_ref>,
 <tf.Variable 'model/valuefn/bias:0' shape=(1,) dtype=float32_ref>]

# Test

In [30]:

cur_state = agent.env.reset()
rewards_run = []
for i in range(20):
    total=0
    while True:
        ac, val = model.select_action(cur_state)
        next_state, rewards, dones = agent.env.step(ac)
        total+=rewards[0]
        cur_state = next_state
        if dones[0]==True:
            break
#     print(total)
    rewards_run.append(total)
np.mean(rewards_run)

200.0
200.0
200.0
200.0
200.0
200.0
200.0
200.0
200.0
200.0
200.0
200.0
200.0
200.0
200.0
200.0
200.0
200.0
200.0
200.0


200.0