In [1]:
# imports
import gym
from gym import wrappers
import mujoco_py
from collections import deque
import random
import numpy as np
import tensorflow as tf
import tflearn

running build_ext
Instructions for updating:
non-resource variables are not supported in the long term
curses is not supported on this machine (please install/reinstall curses for an optimal experience)


In [105]:
class ReplayBuffer(object):
    
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.count = 0
        self.buf = deque()
        
    def add(self, state, action, reward, done, state2):
        # Adds a given experience to the replay buffer
        
        exp = (state, action, reward, done, state2)
        
        if self.count >= self.buf_size:
            self.buf.popleft()
            
        self.buf.append(exp)
        
    def size(self):
        return self.count
    
    def sample(self, size):
        batch = []
        
        if self.count < size:
            batch = random.sample(self.buf, self.count)
        else:
            batch = random.sample(self.buf, size)
            
        states = np.array([])
        actions = np.array([])
        rewards = np.array([])
        dones = np.array([])
        state2s = np.array([])
            
        for exp in batch:
            states = np.append(states, exp[0])
            actions = np.append(states, exp[1])
            rewards = np.append(states, exp[2])
            dones = np.append(states, exp[3])
            state2s = np.append(states, exp[4])
            
        return states, actions, rewards, dones, state2s
    
    def clear(self):
        self.buf.clear()
        self.count = 0

In [190]:
class ActorNetwork(object):
    """
    A tensorflow deep neural network which will determine the actions for the agent to take, given only state
    """
    
    def __init__(self, state_dim, action_dim, action_bound, learning_rate, p, batch_size):
        
        # Initalize Hyperparameters from input
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.p = p
        self.batch_size = batch_size
        
        # Main Actor
        self.inputs, self.out, self.scaled_out = self.create_actor()
        self.network_params = tf.compat.v1.trainable_variables()
        
        # Target Actor
        self.target_inputs, self.target_out, self.target_scaled_out = self.create_actor()
        self.target_network_params = tf.compat.v1.trainable_variables()[len(self.network_params):]
        
        
        # TFLearn Operation for updating our target network during training
        # self.update_target_network_params = \
            
        
        # Get the gradient of the actions, this will be used to optimize our loss
        self.action_grad = tf.compat.v1.placeholder(tf.float32, [None, self.action_dim])
        
        self.unnormalized_actor_gradients = tf.gradients(
            self.scaled_out, self.network_params, -self.action_grad)
        self.actor_gradients = list(map(lambda x: tf.divide(x, self.batch_size), self.unnormalized_actor_gradients))

        # Initialize the optimization
        self.optimize = tf.compat.v1.train.AdamOptimizer(self.learning_rate).\
            apply_gradients(zip(self.actor_gradients, self.network_params))

        self.num_trainable_vars = len(
            self.network_params) + len(self.target_network_params)
        
    def create_actor(self):
        # Use the tflearn API to create a neural network as described in the DDPG formulation
        
        # Get an inputs object
        inputs = tflearn.input_data(shape=[self.state_dim, None])
        
        # Layer1
        net = tflearn.fully_connected(inputs, 256)
        net = tflearn.layers.normalization.batch_normalization(net)
        net = tflearn.activations.relu(net)
        
        # Layer2
        net = tflearn.fully_connected(net, 256)
        net = tflearn.layers.normalization.batch_normalization(net)
        net = tflearn.activations.relu(net)
        
        # Initialize our weights to random values and define our output network
        w_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
        out = tflearn.fully_connected(
            net, self.action_dim, activation='tanh', weights_init=w_init)      
        
        # Return a scaled output which is between the action bounds
        scaled_out = tf.multiply(out, self.action_bound)
        
        return inputs, out, scaled_out
    
    
    def train(self, inputs, a_gradient):
        # Train the created neural network with given inputs and gradient        
        self.optimize(inputs, a_gradient)

    def predict(self, inputs):
        # Predict the best action for the given state using the main actor
        model = tflearn.DNN(self.scaled_out)
     
        return model.predict(inputs)

    def predict_target(self, inputs):
        # Predict the best action for the given state, using the target actor
        model = tflearn.DNN(self.target_scaled_out)
        return model.predict(inputs)

    def update_target_network(self):
        # Call the already create tflearn Operation to update the target network
        [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.p) +
                                                  tf.multiply(self.target_network_params[i], 1. - self.p))
                for i in range(len(self.target_network_params))]

    def get_num_trainable_vars(self):
        return self.num_trainable_vars
        

In [191]:
class CriticNetwork(object):
    """
    A tensorflow deep neural network which will determine the Q value for a given state and the best action from the actor network
    """

    def __init__(self, state_dim, action_dim, learning_rate, p, gamma, num_actor_vars):
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.p = p
        self.gamma = gamma

        # Main Critic
        self.inputs, self.action, self.out = self.create_critic_network()

        self.network_params = tf.compat.v1.trainable_variables()[num_actor_vars:]

        # Target Critic
        self.target_inputs, self.target_action, self.target_out = self.create_critic_network()

        self.target_network_params = tf.compat.v1.trainable_variables()[(len(self.network_params) + num_actor_vars):]
        
        # Tensorflow Operation for updating the target network
        # self.update_target_network_params = \
            

        self.predicted_q_value = tf.compat.v1.placeholder(tf.float32, [None, 1])

        # Define loss and optimization
        self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
        self.optimize = tf.compat.v1.train.AdamOptimizer(
            self.learning_rate).minimize(self.loss)

        # Get the gradient actions
        self.action_grads = tf.gradients(self.out, self.action)

    def create_critic_network(self):
        inputs = tflearn.input_data(shape=[None, self.s_dim])
        action = tflearn.input_data(shape=[None, self.a_dim])
        
        # Layer1
        net = tflearn.fully_connected(inputs, 256)
        net = tflearn.layers.normalization.batch_normalization(net)
        net = tflearn.activations.relu(net)

        # Layer2
        t1 = tflearn.fully_connected(net, 256)
        t2 = tflearn.fully_connected(action, 256)

        net = tflearn.activation(
            tf.matmul(net, t1.W) + tf.matmul(action, t2.W) + t2.b, activation='relu')

        # init our weights to random values and finish with a single linear activation function
        w_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
        out = tflearn.fully_connected(net, 1, weights_init=w_init)
        return inputs, action, out

    def train(self, inputs, action, predicted_q_value):
        # Train this network on a given set of inputs, action, and a predicted value from the target network
        
        out = self.out(inputs, action)
        optimized = self.optimize(predicted_q_value)
        
        return out, optimized

    def predict(self, inputs, action):
        # Predict based on the main critic
        model = tflearn.DNN(self.out)
        return model.predict(inputs, actions)

    def predict_target(self, inputs, action):
        # Predict based on the target critic
        model = tflearn.DNN(self.target_out)
        return model.predict(inputs, action)

    def action_gradients(self, inputs, actions):
        # Calculate action gradients
        return self.action_grads(inputs, actions)

    def update_target_network(self):
        # Update the target network using the tflearn Operation
        [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.p) \
            + tf.multiply(self.target_network_params[i], 1. - self.p))
                for i in range(len(self.target_network_params))]

In [192]:
def train(env, actor, critic):
    
    # Init constants
    BUFFER_SIZE = 10 ** 6
    NUM_EPISODES = 19000
    NUM_EPOCH = 50
    EPOCH_SIZE = NUM_EPISODES / NUM_EPOCH
    CYCLE_RATE = 50
    NUM_BATCHES = 40
    BATCH_SIZE = 256
    CYCLE_SIZE = EPOCH_SIZE / CYCLE_RATE
    BATCH_SIZE = 256
    TIME = 50
    NOISE_SCALE = .2
    
    
    tf.compat.v1.global_variables_initializer()
    
    actor.update_target_network()
    critic.update_target_network()
    
    rb = ReplayBuffer(BUFFER_SIZE)
    
    
    for episode in range(NUM_EPISODES): 
        state = env.reset()
        state = flatten_state(state)
        episode_reward = 0
        mean_max_q = 0
        
        for t in range(TIME):
            action = actor.predict(np.reshape(state, (1, actor.state_dim))) + np.random.rand((1, actor.state_dim)) * NOISE_SCALE
            
            state2, reward, done, info = env.step(action)
            state2 = flatten_state(state2)
            
            rb.add(state, action, reward, done, state2)
            
            if done:
                print(f'Completed Episode {episode}: Max-Q Value Avg : {mean_max_q}, Reward: {episode_reward}')
                break
        
        if episode % CYCLE_SIZE and rb.count >= BATCH_SIZE:
            for batch_idx in range(BATCH_NUM):
                
                states, actions, rewards, dones, state2s = rb.sample(BATCH_SIZE)
                
                target_q = critic.predict_target(state2s, actor.predict_target(state2s))
                
                y = []
                for k in range(BATCH_SIZE):
                    if dones[k]:
                        y.append(rewards[k])
                    else:
                        y.append(rewards[k] * critic.gamma * target_q[k])
                        
                q_values, opt = critic.train(states, actions, np.reshape(y, (BATCH_SIZE, 1)))
                
                mean_max_q += np.amax(q_values)
                
                action_pred = actor.predict(states)
                grads = critic.action_gradients(states, action_pred)
                actor.train(states, grads[0])
                
                actor.update_target_network()
                critic.update_target_network()
                
        state = state2
        episode_reward += reward
        
        
def flatten_state(state):
    out_state = np.array([])
    
    out_state = np.append(out_state, state['observation'])
    out_state = np.append(out_state, state['achieved_goal'])
    out_state = np.append(out_state, state['desired_goal'])
    
    return out_state

In [193]:
seed = 42

tf.keras.backend.clear_session()

env = gym.make('FetchSlide-v1')
np.random.seed(seed)
tf.random.set_seed(seed)
env.seed(seed)

state_dim = env.observation_space['observation'].shape[0] + 6
action_dim = env.action_space.shape[0]
action_bound = 1
    
actor = ActorNetwork(state_dim, action_dim, 1, .0001, .95, 256)
critic = CriticNetwork(state_dim, action_dim, .0001, .95, 1, actor.get_num_trainable_vars())
    
train(env, actor, critic)

TypeError: unsupported operand type(s) for *: 'int' and 'NoneType'

In [185]:
inputs = np.zeros(31)

inputs = inputs.reshape((1, inputs.size))
inputs = np.tile(inputs, (2, 1))

(2, 31)
