## Advantage Actor Critic to Play Cart Pole

We play the cart pole game using actor critic method for reinforcement learning

### Importing Relevant Libraries

In [1]:
import numpy as np
import tensorflow as tf
import gym

In [2]:
np.random.seed(2)
tf.set_random_seed(2) # reproducible

In [3]:
## Hyperparameters
OUTPUT_GRAPH = False
MAX_EPISODE = 3000
DISPLAY_REWARD_THRESHOLD = 200 # Renders the environment if total episode reward is greater than this threshold
MAX_EP_STEPS = 1000 # Maximum time step in one episode
RENDER = False # Rendering wastes time
GAMMA = 0.9 # Reward discount error in TD error
LR_A = 0.001 # learning rate for actor
LR_C = 0.01 # learnig rate for critic

### Creating environment

In [5]:
env = gym.make('CartPole-v0')
env.seed(1) # to ensure reproducibility
env = env.unwrapped

In [6]:
N_F = env.observation_space.shape[0]
N_A = env.action_space.n

### Defining the Actor

Below we define the actor neural network

In [46]:
class Actor(object) :
    def __init__(self,sess,n_features,n_actions,lr=0.001) :
        self.sess = sess
        
        self.s = tf.placeholder(tf.float32,[1,n_features],"state")
        self.a = tf.placeholder(tf.int32,None,"act")
        self.td_error = tf.placeholder(tf.float32,None,"td_error") # TD Error
        
        with tf.variable_scope('Actor') :
            l1 = tf.layers.dense(inputs=self.s,
                                 units=20,
                                 activation=tf.nn.relu,
                                 kernel_initializer=tf.random_normal_initializer(0.,.1),
                                 bias_initializer=tf.constant_initializer(0.1),
                                 name="l1")
            
            self.acts_prob = tf.layers.dense(inputs=l1,
                                             units=n_actions,
                                             activation=tf.nn.softmax,
                                             kernel_initializer=tf.random_normal_initializer(0.,.1),
                                             bias_initializer=tf.constant_initializer(0.1),
                                             name="acts_prob")
            
        with tf.variable_scope('exp_v') :
            log_prob = tf.log(self.acts_prob[0,self.a])
            self.exp_v = tf.reduce_mean(log_prob*self.td_error) # Advantage (TD_error) guided loss
            
        with tf.variable_scope('train') :
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v) # minimize(-exp_v) = maximize(exp_v)
            
    def learn(self,s,a,td) :
        s = s[np.newaxis,:]
        feed_dict = {self.s : s, self.a : a, self.td_error : td}
        _, exp_v = self.sess.run([self.train_op,self.exp_v],feed_dict)
        return exp_v
    
    def choose_action(self,s) :
        s = s[np.newaxis,:]
        probs = self.sess.run(self.acts_prob,{self.s:s}) # get probabilities for all actions
        return np.random.choice(np.arange(probs.shape[1]),p=probs.ravel()) # return a int
            

### Defining the Critic

Below we define the critic neural network

In [47]:
class Critic(object) :
    def __init__(self,sess,n_features,lr=0.01) :
        self.sess = sess
        
        self.s = tf.placeholder(tf.float32,[1,n_features],"state")
        self.v_ = tf.placeholder(tf.float32,[1,1],"v_next")
        self.r = tf.placeholder(tf.float32,None,'r')
        
        with tf.variable_scope('Critic') :
            
            l1 = tf.layers.dense(inputs=self.s,
                                 units=20,
                                 activation=tf.nn.relu,
                                 kernel_initializer=tf.random_normal_initializer(0.,.1),
                                 bias_initializer=tf.constant_initializer(0.1),
                                 name='l1')
            
            self.v = tf.layers.dense(inputs=l1,units=1,
                                     activation=None,
                                     kernel_initializer=tf.random_normal_initializer(0.,.1),
                                     bias_initializer=tf.constant_initializer(0.1),
                                     name="V")
            
        with tf.variable_scope('squared_TD_error') :
            self.td_error = self.r + GAMMA*self.v_ - self.v
            self.loss = tf.square(self.td_error) # TD_error = (r + gamma*V_next) - V_eval
            
        with tf.variable_scope('train') :
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
            
    def learn(self,s,r,s_) :
        
        s,s_ = s[np.newaxis,:], s_[np.newaxis,:]
        
        v_ = self.sess.run(self.v,{self.s:s_})
        td_error, _ = self.sess.run([self.td_error,self.train_op],{self.s:s, self.v_ : v_, self.r : r})
        
        return td_error

In [51]:
tf.reset_default_graph()
sess = tf.Session()

actor = Actor(sess,n_features=N_F,n_actions=N_A,lr=LR_A)
critic = Critic(sess,n_features=N_F,lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor

sess.run(tf.global_variables_initializer())

In [52]:
if OUTPUT_GRAPH :
    tf.summary.FileWriter("logs/",sess.graph)
    
for i_episode in range(MAX_EPISODE) :
    s = env.reset()
    t = 0
    track_r = []
    
    while True :
        if RENDER :
            env.render()
            
        a = actor.choose_action(s)
        s_, r, done, info = env.step(a)
        
        if done :
            r = -20
            
        track_r.append(r)
        
        td_error = critic.learn(s,a,s_) # gradient  = grad[r + gamma*V(s_) - V(s)]
        actor.learn(s,a,td_error) # true gradient = grad[logPi(s,a)*td_error]
        
        s = s_
        t += 1
        
        if done or t >= MAX_EP_STEPS :
            
            ep_rs_sum = sum(track_r)
            
            if 'running_reward' not in globals() :
                running_reward = ep_rs_sum
            else :
                running_reward = running_reward*0.95 + ep_rs_sum*0.05
                
            if running_reward > DISPLAY_REWARD_THRESHOLD :
                RENDER = True # Rendering
                
            print("episode:",i_episode," reward:",int(running_reward))
            break

episode: 0  reward: -6
episode: 1  reward: -5
episode: 2  reward: -5
episode: 3  reward: -3
episode: 4  reward: -3
episode: 5  reward: -3
episode: 6  reward: -3
episode: 7  reward: -4
episode: 8  reward: -4
episode: 9  reward: -4
episode: 10  reward: -4
episode: 11  reward: -5
episode: 12  reward: -5
episode: 13  reward: -5
episode: 14  reward: -6
episode: 15  reward: -6
episode: 16  reward: -6
episode: 17  reward: -6
episode: 18  reward: -6
episode: 19  reward: -7
episode: 20  reward: -7
episode: 21  reward: -7
episode: 22  reward: -7
episode: 23  reward: -7
episode: 24  reward: -8
episode: 25  reward: -8
episode: 26  reward: -8
episode: 27  reward: -8
episode: 28  reward: -9
episode: 29  reward: -9
episode: 30  reward: -9
episode: 31  reward: -9
episode: 32  reward: -9
episode: 33  reward: -9
episode: 34  reward: -9
episode: 35  reward: -9
episode: 36  reward: -9
episode: 37  reward: -9
episode: 38  reward: -9
episode: 39  reward: -10
episode: 40  reward: -10
episode: 41  reward: -10

episode: 323  reward: -11
episode: 324  reward: -11
episode: 325  reward: -11
episode: 326  reward: -11
episode: 327  reward: -11
episode: 328  reward: -11
episode: 329  reward: -11
episode: 330  reward: -11
episode: 331  reward: -11
episode: 332  reward: -11
episode: 333  reward: -11
episode: 334  reward: -11
episode: 335  reward: -11
episode: 336  reward: -11
episode: 337  reward: -11
episode: 338  reward: -11
episode: 339  reward: -11
episode: 340  reward: -11
episode: 341  reward: -11
episode: 342  reward: -11
episode: 343  reward: -11
episode: 344  reward: -11
episode: 345  reward: -11
episode: 346  reward: -11
episode: 347  reward: -11
episode: 348  reward: -11
episode: 349  reward: -11
episode: 350  reward: -11
episode: 351  reward: -10
episode: 352  reward: -10
episode: 353  reward: -10
episode: 354  reward: -10
episode: 355  reward: -10
episode: 356  reward: -10
episode: 357  reward: -10
episode: 358  reward: -10
episode: 359  reward: -10
episode: 360  reward: -10
episode: 361

episode: 638  reward: -11
episode: 639  reward: -11
episode: 640  reward: -11
episode: 641  reward: -11
episode: 642  reward: -11
episode: 643  reward: -11
episode: 644  reward: -11
episode: 645  reward: -11
episode: 646  reward: -11
episode: 647  reward: -11
episode: 648  reward: -11
episode: 649  reward: -11
episode: 650  reward: -11
episode: 651  reward: -10
episode: 652  reward: -10
episode: 653  reward: -10
episode: 654  reward: -10
episode: 655  reward: -10
episode: 656  reward: -10
episode: 657  reward: -10
episode: 658  reward: -10
episode: 659  reward: -10
episode: 660  reward: -10
episode: 661  reward: -10
episode: 662  reward: -10
episode: 663  reward: -10
episode: 664  reward: -10
episode: 665  reward: -10
episode: 666  reward: -10
episode: 667  reward: -10
episode: 668  reward: -10
episode: 669  reward: -10
episode: 670  reward: -10
episode: 671  reward: -10
episode: 672  reward: -10
episode: 673  reward: -10
episode: 674  reward: -10
episode: 675  reward: -10
episode: 676

episode: 956  reward: -11
episode: 957  reward: -11
episode: 958  reward: -11
episode: 959  reward: -11
episode: 960  reward: -11
episode: 961  reward: -11
episode: 962  reward: -11
episode: 963  reward: -11
episode: 964  reward: -11
episode: 965  reward: -11
episode: 966  reward: -11
episode: 967  reward: -11
episode: 968  reward: -11
episode: 969  reward: -11
episode: 970  reward: -11
episode: 971  reward: -11
episode: 972  reward: -11
episode: 973  reward: -11
episode: 974  reward: -11
episode: 975  reward: -11
episode: 976  reward: -11
episode: 977  reward: -11
episode: 978  reward: -11
episode: 979  reward: -11
episode: 980  reward: -11
episode: 981  reward: -11
episode: 982  reward: -11
episode: 983  reward: -11
episode: 984  reward: -11
episode: 985  reward: -11
episode: 986  reward: -11
episode: 987  reward: -11
episode: 988  reward: -11
episode: 989  reward: -11
episode: 990  reward: -11
episode: 991  reward: -11
episode: 992  reward: -11
episode: 993  reward: -11
episode: 994

episode: 1269  reward: -11
episode: 1270  reward: -11
episode: 1271  reward: -11
episode: 1272  reward: -11
episode: 1273  reward: -11
episode: 1274  reward: -11
episode: 1275  reward: -11
episode: 1276  reward: -11
episode: 1277  reward: -11
episode: 1278  reward: -11
episode: 1279  reward: -11
episode: 1280  reward: -11
episode: 1281  reward: -11
episode: 1282  reward: -11
episode: 1283  reward: -11
episode: 1284  reward: -11
episode: 1285  reward: -11
episode: 1286  reward: -11
episode: 1287  reward: -11
episode: 1288  reward: -11
episode: 1289  reward: -11
episode: 1290  reward: -11
episode: 1291  reward: -11
episode: 1292  reward: -11
episode: 1293  reward: -11
episode: 1294  reward: -11
episode: 1295  reward: -11
episode: 1296  reward: -11
episode: 1297  reward: -11
episode: 1298  reward: -11
episode: 1299  reward: -11
episode: 1300  reward: -11
episode: 1301  reward: -11
episode: 1302  reward: -11
episode: 1303  reward: -11
episode: 1304  reward: -11
episode: 1305  reward: -11
e

episode: 1576  reward: -11
episode: 1577  reward: -11
episode: 1578  reward: -11
episode: 1579  reward: -11
episode: 1580  reward: -11
episode: 1581  reward: -11
episode: 1582  reward: -11
episode: 1583  reward: -11
episode: 1584  reward: -11
episode: 1585  reward: -11
episode: 1586  reward: -11
episode: 1587  reward: -11
episode: 1588  reward: -11
episode: 1589  reward: -11
episode: 1590  reward: -11
episode: 1591  reward: -11
episode: 1592  reward: -11
episode: 1593  reward: -11
episode: 1594  reward: -11
episode: 1595  reward: -11
episode: 1596  reward: -11
episode: 1597  reward: -11
episode: 1598  reward: -11
episode: 1599  reward: -11
episode: 1600  reward: -11
episode: 1601  reward: -11
episode: 1602  reward: -11
episode: 1603  reward: -11
episode: 1604  reward: -11
episode: 1605  reward: -11
episode: 1606  reward: -11
episode: 1607  reward: -11
episode: 1608  reward: -11
episode: 1609  reward: -11
episode: 1610  reward: -11
episode: 1611  reward: -11
episode: 1612  reward: -11
e

episode: 1880  reward: -11
episode: 1881  reward: -11
episode: 1882  reward: -11
episode: 1883  reward: -11
episode: 1884  reward: -11
episode: 1885  reward: -11
episode: 1886  reward: -11
episode: 1887  reward: -11
episode: 1888  reward: -11
episode: 1889  reward: -11
episode: 1890  reward: -11
episode: 1891  reward: -11
episode: 1892  reward: -11
episode: 1893  reward: -11
episode: 1894  reward: -11
episode: 1895  reward: -11
episode: 1896  reward: -11
episode: 1897  reward: -11
episode: 1898  reward: -11
episode: 1899  reward: -11
episode: 1900  reward: -11
episode: 1901  reward: -11
episode: 1902  reward: -11
episode: 1903  reward: -11
episode: 1904  reward: -11
episode: 1905  reward: -11
episode: 1906  reward: -11
episode: 1907  reward: -11
episode: 1908  reward: -11
episode: 1909  reward: -11
episode: 1910  reward: -11
episode: 1911  reward: -11
episode: 1912  reward: -11
episode: 1913  reward: -11
episode: 1914  reward: -11
episode: 1915  reward: -11
episode: 1916  reward: -11
e

episode: 2188  reward: -11
episode: 2189  reward: -11
episode: 2190  reward: -11
episode: 2191  reward: -11
episode: 2192  reward: -11
episode: 2193  reward: -11
episode: 2194  reward: -11
episode: 2195  reward: -11
episode: 2196  reward: -11
episode: 2197  reward: -11
episode: 2198  reward: -11
episode: 2199  reward: -11
episode: 2200  reward: -11
episode: 2201  reward: -11
episode: 2202  reward: -11
episode: 2203  reward: -11
episode: 2204  reward: -11
episode: 2205  reward: -11
episode: 2206  reward: -11
episode: 2207  reward: -11
episode: 2208  reward: -11
episode: 2209  reward: -11
episode: 2210  reward: -11
episode: 2211  reward: -11
episode: 2212  reward: -11
episode: 2213  reward: -11
episode: 2214  reward: -11
episode: 2215  reward: -11
episode: 2216  reward: -11
episode: 2217  reward: -11
episode: 2218  reward: -11
episode: 2219  reward: -11
episode: 2220  reward: -11
episode: 2221  reward: -11
episode: 2222  reward: -11
episode: 2223  reward: -11
episode: 2224  reward: -11
e

episode: 2498  reward: -11
episode: 2499  reward: -11
episode: 2500  reward: -11
episode: 2501  reward: -11
episode: 2502  reward: -11
episode: 2503  reward: -11
episode: 2504  reward: -11
episode: 2505  reward: -11
episode: 2506  reward: -11
episode: 2507  reward: -11
episode: 2508  reward: -11
episode: 2509  reward: -11
episode: 2510  reward: -11
episode: 2511  reward: -11
episode: 2512  reward: -11
episode: 2513  reward: -11
episode: 2514  reward: -11
episode: 2515  reward: -11
episode: 2516  reward: -11
episode: 2517  reward: -11
episode: 2518  reward: -11
episode: 2519  reward: -11
episode: 2520  reward: -11
episode: 2521  reward: -11
episode: 2522  reward: -11
episode: 2523  reward: -11
episode: 2524  reward: -11
episode: 2525  reward: -11
episode: 2526  reward: -11
episode: 2527  reward: -11
episode: 2528  reward: -11
episode: 2529  reward: -11
episode: 2530  reward: -11
episode: 2531  reward: -11
episode: 2532  reward: -11
episode: 2533  reward: -11
episode: 2534  reward: -11
e

episode: 2807  reward: -11
episode: 2808  reward: -11
episode: 2809  reward: -11
episode: 2810  reward: -11
episode: 2811  reward: -11
episode: 2812  reward: -11
episode: 2813  reward: -11
episode: 2814  reward: -11
episode: 2815  reward: -11
episode: 2816  reward: -11
episode: 2817  reward: -11
episode: 2818  reward: -11
episode: 2819  reward: -11
episode: 2820  reward: -11
episode: 2821  reward: -11
episode: 2822  reward: -11
episode: 2823  reward: -11
episode: 2824  reward: -11
episode: 2825  reward: -11
episode: 2826  reward: -11
episode: 2827  reward: -11
episode: 2828  reward: -11
episode: 2829  reward: -11
episode: 2830  reward: -11
episode: 2831  reward: -11
episode: 2832  reward: -11
episode: 2833  reward: -11
episode: 2834  reward: -11
episode: 2835  reward: -11
episode: 2836  reward: -11
episode: 2837  reward: -11
episode: 2838  reward: -11
episode: 2839  reward: -11
episode: 2840  reward: -11
episode: 2841  reward: -11
episode: 2842  reward: -11
episode: 2843  reward: -11
e