In [8]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as mpatches

In [9]:
import tile_coding

In [10]:
def tf_reset():
    try:
        sess.close()
    except:
        pass
    tf.reset_default_graph()
    return tf.Session()

In [16]:
class MountainCar():
    def __init__():
        self.position_min = tf.constant(-1.2, dtype=tf.float32)
        self.position_max = tf.constant(0.5, dtype=tf.float32)
        self.position_range = self.position_max - self.position_min
        self.position = tf.random.uniform([], .4, .6)
        
        self.velocity_min = tf.constant(-0.07, dtype=tf.float32)
        self.velocity_max = tf.constant(0.07, dtype=tf.float32)
        self.velocity_range = self.velocity_max - self.velocity_min
        self.velocity = tf.constant(0.0, dtype=tf.float32)
        
        self.time = 0
        self.iht = IHT(2048)
        self.num_tilings = 8
        
    def bound_position(self, x):
        return tf.clip_by_value(x , self.position_min, self.position_max)
        
    def bound_velocity(self, x):
        return tf.clip_by_value(x , self.velocity_min, self.velocity_max)
    
    def step(self, action):
        self.position = bound_position(self.position + self.update_velocity())
        self.time += 1
        return self.get_reward(), self.get_state()
    
    def update_velocity(self, action):
        self.velocity = bound_velocity(self.velocity + 0.001 * action - 0.0025 * tf.math.cos(3 * self.position))
        
    def get_possible_actions(self):
        return tf.constant([-1.0, 0,0, 1.0],  dtype=tf.float32)
    
    def get_state(self):
        return self.position, self.velocity
    
    def get_feature_vector(self, a):
        # Rescale position (x, velocity) to (0-10, 0-10)
        state = ((self.position / self.position_range) * 10.0,(self.velocity / self.velocity_range) * 10.0, a * 5.0)
        return tiles(self.iht, self.num_tilings, state)
        
    def get_reward(self):
        if self.position == self.position_max:
            return 1.0
        return -1.0
        
    def is_terminal(self):
        return self.get_reward() == 1.0

In [17]:
def create_model(input_dim, output_dim):
    # create inputs
    input_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1])
    output_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1])

    # create variables
    W0 = tf.get_variable(name='W0', shape=[1, input_dim], initializer=tf.contrib.layers.xavier_initializer())
    W1 = tf.get_variable(name='W1', shape=[input_dim, input_dim], initializer=tf.contrib.layers.xavier_initializer())
    W2 = tf.get_variable(name='W2', shape=[input_dim, 1], initializer=tf.contrib.layers.xavier_initializer())

    b0 = tf.get_variable(name='b0', shape=[input_dim], initializer=tf.constant_initializer(0.))
    b1 = tf.get_variable(name='b1', shape=[input_dim], initializer=tf.constant_initializer(0.))
    b2 = tf.get_variable(name='b2', shape=[output_dim], initializer=tf.constant_initializer(0.))

    weights = [W0, W1, W2]
    biases = [b0, b1, b2]
    activations = [tf.nn.relu, tf.nn.relu, tf.nn.softmax]

    # create computation graph
    layer = input_ph
    for W, b, activation in zip(weights, biases, activations):
        layer = tf.matmul(layer, W) + b
        if activation is not None:
            layer = activation(layer)
    output_pred = layer
    
    return input_ph, output_ph, output_pred    

In [22]:
class LinearEstimator():
    def __init__(env, n):
        self.env = env
        self.w = tf.get_variable(name='w', shape=(env.get_feature_vector().shape), dtype=tf.bool, initializer=tf.constant_initializer(0.))
        
    def get_best_action(s):
        # compute the function values
        f = lambda a: self.q_hat(s, a)
        q = tf.map_fn(f.q_hat, self.env.get_possible_actions());

        # determine the index of the input tensor that maximizes the function
        index = sess.run(tf.argmax(q, axis = 0));
        
    def q_hat(s, a):
        tf.tensordot(self.w, self.env.get_feature_vector(a), 1)
        

In [21]:
def episodic_semi_gradient_n_step_sarsa(env, estimator, alpha=.5, epsilon=.1, discount_factor=.9, n=8):
    num_episodes = 100
    for ep in range(num_episodes):
        S = tf.get_variable(name='S', shape=((n-1) + env.get_state().shape), dtype=tf.bool, initializer=tf.constant_initializer(var_init_value))
        A = tf.get_variable(name='A', shape=(n - 1), dtype=tf.float32, initializer = [0] * (n - 1))
        R = tf.get_variable(name='R', shape=(n - 1), dtype=tf.float32, initializer = [0.0] * (n - 1))
        S[0] = env.get_state()
        A[0] = estimator.get_best_action(S[0])
        R[0] = 0
        t = 0
        tau = 0
        T = np.iinfo(im.dtype).max
        while tau != T - 1:
            if t < T:
                R[(t + 1) % (n + 1)], S[(t + 1) % (n + 1)] = act(A[t])
                if env.is_terminal():
                    T = t + 1
                else:
                    A[(t + 1) % (n + 1)] = estimator.get_best_action(q_hat(S[(t + 1) % (n + 1)]))
            tau = t - n + 1
            if tau >= 0:
                if tau + n < T:
                    G = np.sum([np.power(discount_factor,i - tau - 1) * R[i % n + 1] for i in range(tau + 1, np.min(tau+n, T))])
                else:
                    G = G + np.power(discount_factor, n) * q_hat(S[(t + n) % (n + 1)], A[(t + n) % (n + 1)])
                update_weights(alpha)
            t += 1

In [25]:
env = MountainCar()
n = 8
episodic_semi_gradient_n_step_sarsa(env, LinearEstimator(env, n), n=8)

TypeError: __init__() takes 0 positional arguments but 1 was given