In [1]:
%matplotlib inline

This assignment has three sections: the first section introduces and implements a standard actor-critic algorithm; the second section implements Semi-gradient TD; the third section compares the actor-critic algorithm with Semi-gradient TD on the Mountain Car experiment environment.

# Actor–critic Algorithms
Actor-critic algorithms use the gradient of the value function to update the policy parameters. The general form of actor-critic is shown as follows.
![alt text](algorithm.png)

In [None]:
import math

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Discrete action set. The policy picks an index into this list and the
# selected value is passed as the `action` argument of MountainCar.takeAction.
ACTIONS = [-1, 0, 1]
# value function
class ValueFunction:
    """Linear state-value estimate over binary features.

    The value of a state is the sum of the weights whose feature is active
    (strictly positive) in the feature vector.

    Args:
        featuresize: number of weights / length of the feature vectors.
        steprate: step size applied to every gradient passed to ``update``.
    """

    def __init__(self, featuresize, steprate):
        # weights for features, drawn uniformly from [0, 1)
        self.theta = np.random.rand(featuresize)
        self.steprate = steprate

    def value(self, phi):
        """Return the sum of the weights whose entry in ``phi`` is > 0.

        ``phi`` may be any sequence or array; an empty ``phi`` yields 0.0.
        """
        active = np.asarray(phi) > 0
        # Slice so that a feature vector shorter than theta is still accepted.
        return float(np.sum(self.theta[:active.size][active]))

    def predict(self, phi):
        """Alias of ``value`` so callers may use either name."""
        return self.value(phi)

    def update(self, gradient):
        """Add ``steprate * gradient`` to the weights in place.

        Args:
            gradient: array-like (or scalar) broadcastable to the weights.

        Returns:
            The Euclidean norm of the step that was applied.
        """
        step = self.steprate * np.asarray(gradient, dtype=float)
        self.theta += step
        return np.linalg.norm(step)
class PolicyEstimator:
    """Softmax policy over three actions, linear in the state features.

    Written against the TensorFlow 1.x graph API (``tf.placeholder``,
    ``tf.contrib``). NOTE(review): ``tf`` must be available in the enclosing
    namespace; the visible part of this file never imports it.

    Each instance builds its network in a private graph and runs it in its
    own session, so an estimator can be created and used without the caller
    installing a default session, and several estimators can coexist.

    Args:
        featuresize: length of the state feature vector fed to the network.
        steprate: learning rate of the Adam optimizer.
        car: unused; kept so existing call sites stay valid. The state is
            supplied per call through ``actions`` / ``update`` instead of
            being read once from the car at construction time.
    """

    def __init__(self, featuresize, steprate, car):
        self._graph = tf.Graph()
        with self._graph.as_default(), tf.variable_scope("Policy"):
            # The state is a feature vector that must actually feed the
            # network, so it is a float vector placeholder rather than an
            # unused scalar.
            self.state = tf.placeholder(tf.float32, [featuresize], "state")
            self.action = tf.placeholder(dtype=tf.int32, name="action")
            self.target = tf.placeholder(dtype=tf.float32, name="target")
            # MLP: a single linear layer producing one logit per action
            self.output_layer = tf.contrib.layers.fully_connected(
                inputs=tf.expand_dims(self.state, 0),
                num_outputs=3,
                activation_fn=None,
                weights_initializer=tf.zeros_initializer)
            self.action_probs = tf.squeeze(tf.nn.softmax(self.output_layer))
            self.sel_action_prob = tf.gather(self.action_probs, self.action)
            # Loss: negative log-probability of the chosen action, weighted
            # by the target supplied by the caller
            self.loss = -tf.log(self.sel_action_prob) * self.target
            # Optimizer
            self.optimizer = tf.train.AdamOptimizer(learning_rate=steprate)
            self.train_op = self.optimizer.minimize(
                self.loss,
                global_step=tf.contrib.framework.get_global_step())
            # Created last so it also covers the optimizer's own variables.
            init_op = tf.global_variables_initializer()
        self._session = tf.Session(graph=self._graph)
        self._session.run(init_op)

    def actions(self, state):
        """Return the action probabilities for the feature vector ``state``."""
        return self._session.run(self.action_probs, {self.state: state})

    def update(self, state, target, action):
        """Run one optimizer step and return ``[train_op result, loss]``.

        Args:
            state: feature vector of the state the action was taken in.
            target: scalar weight on the log-probability.
            action: index of the action that was taken.
        """
        feed_dict = {
            self.state: state,
            self.target: target,
            self.action: action,
        }
        return self._session.run([self.train_op, self.loss], feed_dict)
    
def ActorCritic(tile_size):
    """Run 500 steps of one-step actor-critic on a fresh MountainCar.

    At every step an action index is sampled from the policy estimator's
    probabilities, the TD error of the linear critic is computed, and that
    error drives both the critic update and the policy update.

    Args:
        tile_size: forwarded to ``MountainCar``; the feature vectors have
            ``tile_size * 10 * 10`` entries.

    Returns:
        Array of length 500 holding the car position after each step.
    """
    car = MountainCar(tile_size)
    learningrate = 0.1
    gamma = 0.8
    steps = 500
    featuresize = tile_size * 10 * 10
    positions = np.zeros(steps)
    valueFunction = ValueFunction(featuresize, learningrate)
    policyEstimator = PolicyEstimator(featuresize, learningrate, car)
    for i in range(steps):
        state0 = car.getState()
        actions = policyEstimator.actions(state0)
        action = np.random.choice(np.arange(len(actions)), p=actions)
        # takeAction returns only the reward; the new position is read from
        # the car itself.
        reward = car.takeAction(ACTIONS[action])
        state1 = car.getState()
        positions[i] = car.position
        target = reward + gamma * valueFunction.value(state1)
        td_error = target - valueFunction.value(state0)
        # update value function: semi-gradient step along the features of
        # the visited state (update() applies the step size itself)
        valueFunction.update(td_error * state0)
        # update policy estimator for the state the action was taken in
        policyEstimator.update(state0, td_error, action)
    return positions

# Semi-gradient TD
Update rule, where $\phi = \phi(S)$ is the feature vector of the current state $S$:
$ \theta_{k+1} \gets \theta_{k} + \alpha [R + \gamma \hat{v}(S', \theta_{k}) - \hat{v}(S, \theta_{k})]\phi $


In [None]:
#define Semi-gradient TD
def SemiTD(tile_size):
    """Run 500 steps of semi-gradient TD(0) on a fresh MountainCar.

    After every step the linear value function is moved along the features
    of the previous state, scaled by the TD error.

    Args:
        tile_size: forwarded to ``MountainCar``; the feature vectors have
            ``tile_size * 10 * 10`` entries.

    Returns:
        Array of length 500 holding the car position after each step.
    """
    car = MountainCar(tile_size)
    learningrate = 0.1
    valueFunction = ValueFunction(tile_size * 10 * 10, learningrate)
    gamma = 0.8
    steps = 500
    positions = np.zeros(steps)
    for i in range(steps):
        state0 = car.getState()
        # NOTE(review): getAction is not defined in the visible part of this
        # file; it must be provided elsewhere before this function is called.
        action = getAction(car, valueFunction)
        # takeAction returns only the reward; the new position is read from
        # the car itself.
        reward = car.takeAction(action)
        positions[i] = car.position
        state1 = car.getState()
        delta = gamma * valueFunction.value(state1) + reward - valueFunction.value(state0)
        # For a linear value function the gradient w.r.t. the weights is the
        # feature vector of the updated state.
        valueFunction.update(delta * state0)
    return positions

# Experiment
I evaluated different **gradient-based TD methods** on the classic RL environment **Mountain Car**[1]. Each run starts near the bottom of the mountain (position _-0.5_) with zero velocity. At each iteration one of three actions [_reverse, coast, forward_] is selected, and the reward is always _-1_. The position and velocity are encoded using **tile coding** with 4 or 8 tilings, each a 10\*10 grid of tiles.
![alt text](MC.png "Mountain Car")

In [None]:
# Bounds used by MountainCar to clamp position and velocity after each step.
# The two minima are also the origin of the first tiling in TileCoding.genCode.
POSITION_MIN = -1.2
POSITION_MAX = 0.6
VELOCITY_MIN = -0.07
VELOCITY_MAX = 0.07
# Number of independent runs averaged for each plotted curve.
RUN = 100
# NOTE(review): not referenced in the visible code; the experiment section
# assigns its own `tile_size` — confirm whether this constant is still needed.
TILE_SIZE = 4
# Tile Coding
class TileCoding:
    """Binary tile-coding features for a 2-D point.

    ``tiling_num`` tilings, each a ``tiling_size`` x ``tiling_size`` grid of
    tiles measuring ``tile_length`` x ``tile_width``, are laid over the space.
    Each tiling is shifted from the previous one by ``x_movement`` /
    ``y_movement``.

    Args:
        tiling_num: number of tilings.
        tiling_size: number of tiles along each axis of one tiling.
        tile_length: extent of one tile along x.
        tile_width: extent of one tile along y.
        space_length: extent of the encoded space along x.
        space_width: extent of the encoded space along y.
        x_origin: x coordinate where the first tiling starts; defaults to the
            module constant ``POSITION_MIN``.
        y_origin: y coordinate where the first tiling starts; defaults to the
            module constant ``VELOCITY_MIN``.
    """

    def __init__(self, tiling_num, tiling_size, tile_length, tile_width,
                 space_length, space_width, x_origin=None, y_origin=None):
        self.tiling_num = tiling_num
        self.tiling_size = tiling_size
        self.tile_length = tile_length
        self.tile_width = tile_width
        # A tiling is larger than the space; the excess is split evenly into
        # one (negative) shift per tiling.
        self.x_movement = -(tiling_size * tile_length - space_length)/tiling_num
        self.y_movement = -(tiling_size * tile_width - space_width)/tiling_num
        self.x_origin = x_origin
        self.y_origin = y_origin

    def genCode(self, x, y):
        """Return the feature list for the point ``(x, y)``.

        The list has ``tiling_num * tiling_size**2`` entries: one contiguous
        segment per tiling, with a single 1 in each segment marking the tile
        that contains the point. Points outside a tiling are clamped to its
        nearest border tile.
        """
        tiles_per_tiling = self.tiling_size * self.tiling_size
        tileCode = [0] * (self.tiling_num * tiles_per_tiling)
        origin_x = POSITION_MIN if self.x_origin is None else self.x_origin
        origin_y = VELOCITY_MIN if self.y_origin is None else self.y_origin
        last = self.tiling_size - 1
        for i in range(self.tiling_num):
            grid_x = int(math.floor((x - origin_x)/self.tile_length))
            grid_y = int(math.floor((y - origin_y)/self.tile_width))
            # Keep the tile inside this tiling so the index can neither spill
            # into a neighbouring segment nor wrap around via a negative index.
            grid_x = min(max(grid_x, 0), last)
            grid_y = min(max(grid_y, 0), last)
            # The i * tiles_per_tiling offset gives every tiling its own
            # segment of the vector.
            tileCode[i * tiles_per_tiling + grid_x * self.tiling_size + grid_y] = 1
            origin_x += self.x_movement
            origin_y += self.y_movement
        return tileCode
# MountainCar with Tile Coding
class MountainCar:
    """Mountain Car dynamics whose state is exposed as tile-coded features.

    The car starts at position -0.5 with zero velocity. Position and velocity
    are clamped to the module-level POSITION_* / VELOCITY_* bounds after
    every step.
    """

    def __init__(self, tile_size):
        self.resetState()
        # tile_size tilings of 10 x 10 tiles (0.25 x 0.02 each) over a
        # 1.8 x 0.14 space
        self.tile = TileCoding(tile_size, 10, 0.25, 0.02, 1.8, 0.14)

    def resetState(self):
        """Put the car back at its start position with zero velocity."""
        self.position, self.velocity = -0.5, 0.0

    def isEnd(self):
        """Return True once the car has reached position 0.6 or beyond."""
        return bool(self.position >= 0.6)

    def _advance(self, action):
        """Return the clamped (position, velocity) one step ahead.

        Does not modify the car.
        """
        raw_velocity = self.velocity + 0.001 * action - 0.0025 * np.cos(3 * self.position)
        next_velocity = min(max(VELOCITY_MIN, raw_velocity), VELOCITY_MAX)
        next_position = min(max(POSITION_MIN, self.position + next_velocity), POSITION_MAX)
        return next_position, next_velocity

    def takeAction(self, action):
        """Apply ``action`` (one of 1, 0, -1) and return the reward.

        The reward is always -1.0; the new position is available afterwards
        as ``self.position``.
        """
        self.position, self.velocity = self._advance(action)
        return -1.0

    def testAction(self, action):
        """Return the features of the state ``action`` would lead to.

        ``action`` is one of 1, 0, -1; the car itself is left unchanged.
        """
        next_position, next_velocity = self._advance(action)
        return np.asarray(self.tile.genCode(next_position, next_velocity))

    def getState(self):
        """Return the tile-coded features of the current state as an array."""
        features = self.tile.genCode(self.position, self.velocity)
        return np.asarray(features)
# Run the comparison for 8 and then 4 tilings. Each curve is the mean
# position over RUN independent runs of the corresponding learner.
for tile_size in (8, 4):
    STD_positions = [SemiTD(tile_size) for _ in range(RUN)]
    AC_positions = [ActorCritic(tile_size) for _ in range(RUN)]
    plt.plot(np.mean(STD_positions, axis=0), label='Semi-gradient TD')
    plt.plot(np.mean(AC_positions, axis=0), label='ActorCritic')
    # Each run returns one position per step of its 500-step loop, so the
    # x-axis counts steps within a run rather than episodes.
    plt.xlabel('Step')
    plt.ylabel('Position')
    plt.title('size of tiles = %d' % tile_size)
    plt.legend(loc='best')
    plt.show()

![alt text](car0.png "SemiTD vs ActorCritic when size of tiles = 8")
![alt text](car1.png "SemiTD vs ActorCritic when size of tiles = 4")

I ran the experiments separately in a TensorFlow environment. The figures above are plotted from the output of the SemiTD and ActorCritic methods after 20 trials. When the size of tiles is 8, actor-critic is more stable than SemiTD. However, when the size of tiles is 4, SemiTD is more stable than actor-critic. This may be because, when the size of tiles is 8, the policy estimator can estimate the action probabilities better.

# Reference
[1] Bhatnagar, S., Sutton, R., Ghavamzadeh, M., & Lee, M. (2009). Natural actor-critic algorithms. Automatica, 45(11).
[2] Konda, V. R., & Tsitsiklis, J. N. (1999, November). Actor-Critic Algorithms. In NIPS (Vol. 13, pp. 1008-1014).