In [1]:
import gym
import numpy as np
import tensorflow as tf

In [311]:
# Create our policy network
class PolicyNetwork:
    '''
    Given a state, it outputs the action probabilities
    '''
    def __init__(self,state_size,action_size,learning_rate=1e-5,name='PolicyNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.name = name
        self.build_model()
    
    def build_model(self):
        # Define placeholders
        with tf.variable_scope(self.name,reuse=tf.AUTO_REUSE):
            with tf.name_scope(self.name + '_placeholders'):
                self.advantage = tf.placeholder(shape=[None],name='advantage',dtype=tf.float32)
                self.actions_ = tf.placeholder(shape=[None,self.action_size],name='actions_',dtype=tf.float32)
                self.state = tf.placeholder(shape=[None,self.state_size],name='state',dtype=tf.float32)
            
            # Build the network
            # We just have a 3 full connected layers...That's it!
#             self.fc_1 = tf.layers.dense(self.state,128,name='fc_1'\
#                                         ,kernel_initializer=tf.contrib.layers.xavier_initializer())
#             self.fc_2 = tf.layers.dense(self.fc_1,256,name='fc_2'
#                                        ,kernel_initializer=tf.contrib.layers.xavier_initializer())
            self.output = tf.layers.dense(self.state,action_size,name='output',activation=None
                                       ,kernel_initializer=tf.zeros_initializer())
            
            # Action probability distribution
            self.actions = tf.nn.sigmoid(self.output)
            
            # Take this as a supervised learning problem, we have the labels as the actions
            self.neg_log_probs = tf.nn.softmax_cross_entropy_with_logits(logits=self.output,labels=self.actions_)
            
            self.loss = tf.reduce_mean(self.neg_log_probs*self.advantage)
            
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
            
    def predict(self,sess,state):
        '''
        Predict the action probabilities
        
        state : the numpy array of the state passed
        '''
        action_probs = sess.run(self.actions,feed_dict={self.state:state})
        return action_probs
    
    def update(self,sess,state,actions,advantage):
        '''
        Make the optimizer update
        state : the state for which we make the update
        '''
        _,loss = sess.run([self.optimizer,self.loss],feed_dict={self.state:state,\
                                                                self.actions_:actions,\
                                                                self.advantage:advantage})
        return loss

In [312]:
class ValueEstimatorNetwork:
    '''
    This computes the generalized advantage approximated by the TD error on the State-Value function
    TD-error = r + gamma*V(s_t+1) - V(s_t)
    '''
    def __init__(self,state_size,learning_rate=1e-5,gamma=0.99,name = 'AdvantageNetwork'):
        self.state_size = state_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.name = name
        self.build_model()
        
    def build_model(self):
        with tf.variable_scope(self.name):
                # Define placeholders
                with tf.name_scope('Placeholders'):
                    self.state = tf.placeholder(shape=[None,self.state_size],name='state',dtype=tf.float32)
                    self.target = tf.placeholder(shape=[None],name='target',dtype=tf.float32)
                
                # Build the network
#                 self.fc_1 = tf.layers.dense(self.state,128,name='fc_1',\
#                                            kernel_initializer=tf.contrib.layers.xavier_initializer())
                self.output = tf.layers.dense(self.state,1,name='output',\
                                             activation=None,kernel_initializer=tf.zeros_initializer())
                
                # output is the value estimate
                self.loss = tf.square(self.output - self.target)
                
                self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
    
    def predict(self,sess,state):
        '''
        Predict the value estimate
        
        state : the numpy array of the state passed
        '''
        value_estimate = sess.run(self.output,feed_dict={self.state:state})
        return value_estimate
    
    def update(self,sess,state,target):
        '''
        Make the optimizer update
        state : the state for which we make the update
        Update made : V(s) = r + gamma*V(s')
        '''
        _,loss = sess.run([self.optimizer,self.loss],feed_dict={self.state:state,self.target:target})
        return loss

In [313]:
class ActorCriticNetwork:
    def __init__(self,env,state_size,action_size,actor_lr=1e-5,critic_lr=1e-5,\
                 gamma=0.99,num_episodes=1000,episode_length=1000,\
                 render=False,name='ACNetwork'):
        self.env = env
        self.state_size = state_size
        self.action_size = action_size
        self.name = name
        self.num_episodes = num_episodes
        self.episode_length = episode_length
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.render = render
        
        self.build_model()
    
    def build_model(self):
        '''
        Actor Network is the Policy Network
        Critic Network is the Value Estimator Network
        '''
        tf.reset_default_graph()
        self.actor_network = PolicyNetwork(self.state_size,self.action_size,learning_rate=self.actor_lr,name='ActorNetwork')
        self.critic_network = ValueEstimatorNetwork(self.state_size,learning_rate=self.critic_lr,name='CriticNetwork')
    
    def train(self):
        '''
        Train the actor critic network
        '''
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(tf.local_variables_initializer())
        
        all_rewards = []
        
        for episode in range(self.num_episodes):
            state = self.env.reset()
            
            # Episode Statistics
            total_reward = 0.0 
            
            while True:
                if self.render is True:
                    self.env.render()

                # Choose an action
                action_probs = np.array(self.actor_network.predict(self.sess,state.reshape(1,self.state_size))).reshape(self.action_size)

                # Apparently numpy gives an error of probabilites not summing upto one otherwise
                action_probs/=sum(action_probs)
#                 print(action_probs)
                action = np.random.choice(np.arange(len(action_probs)),p=action_probs)
                
#                 print(action)
                actions_one_hot = np.zeros((1,self.action_size))
                actions_one_hot[0,action] = 1.0
                
                # Take a step in the environment
                next_state,reward,done,_ = self.env.step(action)
                
                # Compute The TD-Error to be used as the Advantage
                V_s = np.array(self.critic_network.predict(self.sess,state.reshape(1,self.state_size)).reshape(1))
                V_s_next = np.array(self.critic_network.predict(self.sess,next_state.reshape(1,self.state_size)).reshape(1))
                
                V_target = reward + self.gamma*V_s_next
                TD_error = reward + self.gamma*V_s_next - V_s
                                
                # Update the critic network
                self.critic_network.update(self.sess,state.reshape(1,self.state_size),V_target)
                
                # Update the actor network
                self.actor_network.update(self.sess,state.reshape(1,self.state_size),actions_one_hot,TD_error)
       
                # Update the statistics 
                total_reward+=reward
                
                state = next_state
                
                if done:
                    all_rewards.append(total_reward)
                    break
            
            mean_reward = np.sum(all_rewards)/(1.0*(episode + 1))
            print('Episode : {}\nTotal Reward : {}\nMean Reward : {}'.format(episode,total_reward,mean_reward))

In [314]:
env = gym.make('CartPole-v0')

# For the cartpole environment
state_size = 4
action_size = 2

In [315]:
actorCriticNet = ActorCriticNetwork(env,state_size,action_size,actor_lr=0.01,critic_lr=0.1)

In [316]:
actorCriticNet.train()

Episode : 0
Total Reward : 20.0
Mean Reward : 20.0
Episode : 1
Total Reward : 22.0
Mean Reward : 21.0
Episode : 2
Total Reward : 12.0
Mean Reward : 18.0
Episode : 3
Total Reward : 10.0
Mean Reward : 16.0
Episode : 4
Total Reward : 19.0
Mean Reward : 16.6
Episode : 5
Total Reward : 35.0
Mean Reward : 19.666666666666668
Episode : 6
Total Reward : 22.0
Mean Reward : 20.0
Episode : 7
Total Reward : 15.0
Mean Reward : 19.375
Episode : 8
Total Reward : 12.0
Mean Reward : 18.555555555555557
Episode : 9
Total Reward : 17.0
Mean Reward : 18.4
Episode : 10
Total Reward : 34.0
Mean Reward : 19.818181818181817
Episode : 11
Total Reward : 10.0
Mean Reward : 19.0
Episode : 12
Total Reward : 16.0
Mean Reward : 18.76923076923077
Episode : 13
Total Reward : 14.0
Mean Reward : 18.428571428571427
Episode : 14
Total Reward : 13.0
Mean Reward : 18.066666666666666
Episode : 15
Total Reward : 11.0
Mean Reward : 17.625
Episode : 16
Total Reward : 9.0
Mean Reward : 17.11764705882353
Episode : 17
Total Reward :

Episode : 136
Total Reward : 10.0
Mean Reward : 10.481751824817518
Episode : 137
Total Reward : 9.0
Mean Reward : 10.471014492753623
Episode : 138
Total Reward : 10.0
Mean Reward : 10.467625899280575
Episode : 139
Total Reward : 11.0
Mean Reward : 10.471428571428572
Episode : 140
Total Reward : 10.0
Mean Reward : 10.46808510638298
Episode : 141
Total Reward : 9.0
Mean Reward : 10.45774647887324
Episode : 142
Total Reward : 11.0
Mean Reward : 10.461538461538462
Episode : 143
Total Reward : 9.0
Mean Reward : 10.45138888888889
Episode : 144
Total Reward : 8.0
Mean Reward : 10.434482758620689
Episode : 145
Total Reward : 10.0
Mean Reward : 10.431506849315069
Episode : 146
Total Reward : 9.0
Mean Reward : 10.421768707482993
Episode : 147
Total Reward : 10.0
Mean Reward : 10.41891891891892
Episode : 148
Total Reward : 9.0
Mean Reward : 10.409395973154362
Episode : 149
Total Reward : 9.0
Mean Reward : 10.4
Episode : 150
Total Reward : 11.0
Mean Reward : 10.403973509933774
Episode : 151
Total 

Episode : 265
Total Reward : 10.0
Mean Reward : 10.037593984962406
Episode : 266
Total Reward : 8.0
Mean Reward : 10.02996254681648
Episode : 267
Total Reward : 10.0
Mean Reward : 10.029850746268657
Episode : 268
Total Reward : 9.0
Mean Reward : 10.026022304832713
Episode : 269
Total Reward : 8.0
Mean Reward : 10.018518518518519
Episode : 270
Total Reward : 9.0
Mean Reward : 10.014760147601477
Episode : 271
Total Reward : 9.0
Mean Reward : 10.011029411764707
Episode : 272
Total Reward : 9.0
Mean Reward : 10.007326007326007
Episode : 273
Total Reward : 10.0
Mean Reward : 10.007299270072993
Episode : 274
Total Reward : 10.0
Mean Reward : 10.007272727272728
Episode : 275
Total Reward : 9.0
Mean Reward : 10.003623188405797
Episode : 276
Total Reward : 8.0
Mean Reward : 9.99638989169675
Episode : 277
Total Reward : 9.0
Mean Reward : 9.992805755395683
Episode : 278
Total Reward : 9.0
Mean Reward : 9.989247311827956
Episode : 279
Total Reward : 10.0
Mean Reward : 9.989285714285714
Episode : 2

KeyboardInterrupt: 

In [11]:
env = gym.make('Pendulum-v0')

In [12]:
env.observation_space

Box(3,)

In [13]:
env.action_space

Box(1,)

In [14]:
state = env.reset()
state

array([ 0.86728292,  0.49781556, -0.26410778])