### Initialization

In [12]:
# Colab-only setup: mount Google Drive and move into the project directory on
# the shared drive, so the files listed below are reachable by relative path.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/Shareddrives/Duong-LongWarwick/FARL/StateAbstraction/hierarchical_DQN
# List the directory as a quick check that the mount and the path are correct.
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/Shareddrives/Duong-LongWarwick/FARL/StateAbstraction/hierarchical_DQN
agents	       experiment_logs		       README.md
clustering.py  hierarchicalRL.ipynb	       results
clusters       hierarchicalRL_reference.ipynb  train_dqn.py
clusters_6     make_plots.py


In [13]:
# TensorFlow is pinned to 1.15 because the agents below use TF1-only APIs
# (tf.contrib, tf.placeholder, tf.Session).
!pip install tensorflow==1.15

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Imports

In [14]:
import os

import matplotlib
matplotlib.use('Agg')

# import clustering
# import dqn
import gym
from gym.wrappers import Monitor

import matplotlib.pyplot as plt
import numpy as np
import random
import pickle
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from collections import defaultdict
import sys
import os

import tensorflow as tf

### Agent

#### replay_buffer.py

In [15]:
class ReplayBuffer(object):
    """Circular buffer of (state, action, reward, next_state, terminal) transitions.

    Every field is kept in its own object array of length ``max_size``. Once
    the buffer is full, new transitions overwrite the oldest slots.
    """

    def __init__(self, max_size, init_size, batch_size):
        """Allocates empty storage.

        Args:
            max_size: capacity of the buffer, in transitions.
            init_size: number of stored transitions required before `sample`
                returns a non-empty batch.
            batch_size: number of transitions in each sampled batch.
        """
        self.max_size = max_size
        self.init_size = init_size
        self.batch_size = batch_size

        def empty_column():
            # One slot per transition; filled lazily by `add`.
            return np.array([None] * self.max_size)

        self.states = empty_column()
        self.actions = empty_column()
        self.rewards = empty_column()
        self.next_states = empty_column()
        self.terminals = empty_column()

        # Next slot to write, and number of slots currently holding data.
        self.curr_pointer = 0
        self.curr_size = 0

    def add(self, state, action, reward, next_state, terminal):
        """Writes one transition at the current write position.

        `state` and `next_state` are stored after `np.squeeze`, i.e. without
        their singleton dimensions.
        """
        slot = self.curr_pointer
        self.states[slot] = np.squeeze(state)
        self.actions[slot] = action
        self.rewards[slot] = reward
        self.next_states[slot] = np.squeeze(next_state)
        self.terminals[slot] = terminal

        self.curr_size = min(self.max_size, self.curr_size + 1)

        # Advance the write position, wrapping to the start once the end of the
        # storage is reached.
        self.curr_pointer = slot + 1
        if self.curr_pointer >= self.max_size:
            self.curr_pointer -= self.max_size

    def sample(self):
        """Draws a batch of `batch_size` transitions.

        Returns:
            A 5-tuple (states, actions, rewards, next_states, terminals) of
            numpy arrays, or five empty lists while fewer than `init_size`
            transitions are stored. The most recently added transition is
            always the first entry of the batch; the rest are drawn uniformly
            with replacement from the filled part of the buffer.
        """
        if self.curr_size < self.init_size:
            return [], [], [], [], []

        # Slot just behind the write position holds the newest transition.
        picks = [self.curr_pointer - 1]
        picks.extend(
            random.randint(0, self.curr_size - 1) for _ in range(self.batch_size - 1))

        columns = (self.states, self.actions, self.rewards,
                   self.next_states, self.terminals)
        return tuple(np.array([column[k] for k in picks]) for column in columns)
        # return self.states[sample_indices], self.actions[sample_indices], self.rewards[sample_indices], self.next_states[sample_indices], self.terminals[sample_indices]


#### dqn.py

In [28]:
class DqnAgent(object):
    """DQN agent on a TensorFlow 1.x graph, with a replay buffer and a target network.

    The online network is built under the ``q_network`` variable scope and the
    target network under ``target_q_network``. `update` trains the former and
    periodically copies its weights into the latter.
    """

    # Discount factor for future rewards.
    DISCOUNT = 0.99
    # Max size of the replay buffer.
    REPLAY_MEMORY_SIZE = 500000
    # Batch size for updates from the replay buffer.
    BATCH_SIZE = 32
    # Initial size of replay memory prior to beginning sampling batches.
    REPLAY_MEMORY_INIT_SIZE = 5000
    # Update the target network every TARGET_UPDATE timesteps.
    TARGET_UPDATE = 1000 #10000

    def __init__(self, sess=None, learning_rate=0.00025, state_dims=[], num_actions=0,
        epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=50000, replay_memory_init_size=None,
        target_update=None):
        """Builds the graph, the replay buffer and the session.

        Args:
            sess: session to run the graph in; a new `tf.Session` is created
                when None.
            learning_rate: learning rate of the RMSProp optimizer.
            state_dims: per-example shape of the state placeholder (the batch
                dimension is added automatically).
            num_actions: number of discrete actions, i.e. Q-values per state.
            epsilon_start: exploration rate at the first step.
            epsilon_end: exploration rate once the decay has finished.
            epsilon_decay_steps: number of steps of linear epsilon decay.
            replay_memory_init_size: overrides REPLAY_MEMORY_INIT_SIZE for this
                instance when given.
            target_update: overrides TARGET_UPDATE for this instance when given.
        """
        self._learning_rate = learning_rate
        self._state_dims = state_dims
        self._num_actions = num_actions

        self._epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
        self._epsilon_decay_steps = epsilon_decay_steps

        # Instance attributes shadow the class-level defaults.
        if replay_memory_init_size is not None:
            self.REPLAY_MEMORY_INIT_SIZE = replay_memory_init_size

        if target_update is not None:
            self.TARGET_UPDATE = target_update

        self._replay_buffer = ReplayBuffer(
            self.REPLAY_MEMORY_SIZE,
            self.REPLAY_MEMORY_INIT_SIZE,
            self.BATCH_SIZE)

        self._current_time_step = 0

        # Each agent gets its own graph so several agents can coexist.
        with tf.Graph().as_default():
            self._construct_graph()
            self._saver = tf.train.Saver()
            if sess is None:
                self.sess = tf.Session()
            else:
                self.sess = sess
            self.sess.run(tf.global_variables_initializer())

    def _q_network(self, state):
        """Builds the Q-value network: a 64-unit ReLU layer then a linear output layer.

        Args:
            state: tensor fed from the state placeholder.
        Returns:
            Tensor with `num_actions` Q-values per input row.
        """
        hidden = tf.contrib.layers.fully_connected(state, 64, activation_fn=tf.nn.relu)
        q_values = tf.contrib.layers.fully_connected(
            hidden, self._num_actions, activation_fn=None)
        return q_values

    def _construct_graph(self):
        """Creates placeholders, both networks, the training op and the target-sync ops."""
        shape = [None] + list(self._state_dims)
        self._state = tf.placeholder(shape=shape, dtype=tf.float32)

        with tf.variable_scope('q_network'):
            self._q_values = self._q_network(self._state)
        with tf.variable_scope('target_q_network'):
            self._target_q_values = self._q_network(self._state)
        with tf.variable_scope('q_network_update'):
            # Each row of _picked_actions is (row in batch, action index).
            self._picked_actions = tf.placeholder(shape=[None, 2], dtype=tf.int32)
            self._td_targets = tf.placeholder(shape=[None], dtype=tf.float32)
            self._q_values_pred = tf.gather_nd(self._q_values, self._picked_actions)
            self._losses = clipped_error(self._q_values_pred - self._td_targets)
            self._loss = tf.reduce_mean(self._losses)

            self.optimizer = tf.train.RMSPropOptimizer(self._learning_rate)
            grads_and_vars = self.optimizer.compute_gradients(self._loss, tf.trainable_variables())
            grads = [gv[0] for gv in grads_and_vars]
            params = [gv[1] for gv in grads_and_vars]

            # Clip the gradients jointly to a global norm of 5.
            grads = tf.clip_by_global_norm(grads, 5.0)[0]

            clipped_grads_and_vars = list(zip(grads, params))
            self.train_op = self.optimizer.apply_gradients(clipped_grads_and_vars,
                global_step=tf.contrib.framework.get_global_step())
        with tf.name_scope('target_network_update'):
            # Pair online and target variables by sorted name so that each
            # assign copies a weight into its counterpart.
            q_network_params = [t for t in tf.trainable_variables() if t.name.startswith(
                'q_network')]
            q_network_params = sorted(q_network_params, key=lambda v: v.name)

            target_q_network_params = [t for t in tf.trainable_variables() if t.name.startswith(
                'target_q_network')]
            target_q_network_params = sorted(target_q_network_params, key=lambda v: v.name)

            self.target_update_ops = []
            for e1_v, e2_v in zip(q_network_params, target_q_network_params):
                op = e2_v.assign(e1_v)
                self.target_update_ops.append(op)

    def sample(self, state):
        """Returns an epsilon-greedy action and advances the agent's step counter.

        Epsilon follows the linear schedule built in `__init__`, indexed by the
        step counter and held at its final value once the schedule is exhausted.
        """
        self._current_time_step += 1
        epsilon = self._epsilons[min(self._current_time_step, self._epsilon_decay_steps - 1)]

        if random.random() < epsilon:
            return random.randint(0, self._num_actions - 1)

        # Only run the network when its output is actually needed.
        q_values = self.sess.run(self._q_values, {self._state: state})
        return np.argmax(q_values)

    def best_action(self, state):
        """Returns the greedy action for `state` without touching the step counter."""
        q_values = self.sess.run(self._q_values, {self._state: state})
        return np.argmax(q_values)

    def store(self, state, action, reward, next_state, terminal, eval=False, curr_reward=False):
        """Adds a transition to the replay buffer unless `eval` is set.

        `curr_reward` is accepted but not used by this method.
        """
        if not eval:
            self._replay_buffer.add(state, action, reward, next_state, terminal)

    def update(self):
        """Runs one training step on a sampled batch and syncs the target network when due.

        No training happens while the replay buffer returns an empty batch.
        The target network is refreshed whenever the step counter (advanced by
        `sample`) is a multiple of TARGET_UPDATE.
        """
        states, actions, rewards, next_states, terminals = self._replay_buffer.sample()

        if len(states) > 0:
            # Pair every action with its row number so gather_nd picks Q(s, a).
            action_indices = np.array(list(zip(np.arange(len(actions)), actions)))

            next_states_q_values = self.sess.run(
                self._target_q_values, {self._state: next_states})
            next_states_max_q_values = np.max(next_states_q_values, axis=1)

            # One-step TD target; the bootstrap term is dropped on terminal transitions.
            td_targets = rewards + (1 - terminals) * self.DISCOUNT * next_states_max_q_values

            feed_dict = {self._state: states,
                         self._picked_actions: action_indices,
                         self._td_targets: td_targets}

            _ = self.sess.run(self.train_op, feed_dict=feed_dict)

        # Update the target q-network.
        if self._current_time_step % self.TARGET_UPDATE == 0:
            self.sess.run(self.target_update_ops)

def clipped_error(x):
    """Element-wise Huber-style loss: 0.5 * x**2 where |x| < 1, otherwise |x| - 0.5."""
    magnitude = tf.abs(x)
    quadratic = 0.5 * tf.square(x)
    linear = magnitude - 0.5
    return tf.where(magnitude < 1.0, quadratic, linear)

def compute_gradients(tensor, var_list):
    """Returns d(tensor)/d(var) for every var, using zeros where TF reports no gradient.

    Args:
        tensor: tensor to differentiate.
        var_list: variables to differentiate with respect to.
    Returns:
        List of gradients aligned with `var_list`; entries for which
        `tf.gradients` yields None are replaced by `tf.zeros_like(var)`.
    """
    filled = []
    for var, grad in zip(var_list, tf.gradients(tensor, var_list)):
        if grad is None:
            grad = tf.zeros_like(var)
        filled.append(grad)
    return filled

#### Qlearning.py

In [17]:
class QLearningAgent(object):
    """Implementation of tabular Q-learning."""

    DISCOUNT = 0.95

    def __init__(self, num_states, num_actions, learning_rate, epsilon=0.1):
        """Creates a zero-initialised Q-table of shape (num_states, num_actions).

        Args:
            num_states: number of rows of the Q-table.
            num_actions: number of columns of the Q-table.
            learning_rate: step size of the Q-value update.
            epsilon: constant exploration probability used by `sample`.
        """
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.q_table = np.zeros((self.num_states, self.num_actions))
        # Most recent transition handed to `store`; consumed by `update`.
        self.curr_transition = None
        # Linear decay schedule. NOTE: `sample` does not consult it (see there).
        self.epsilon_decay_steps = 5000
        self.epsilons = np.linspace(1.0, 0.01, self.epsilon_decay_steps)
        self.curr_time_step = 0

    def get_avaiable_actions(self, state):
        """Returns every index of the one-hot `state` except the active one.

        The state is squeezed before its length is taken, so both a flat
        vector and a vector wrapped in a singleton batch dimension work.
        """
        one_hot = np.squeeze(state)
        state_index = np.where(one_hot == 1)[0][0]
        return [i for i in range(len(one_hot)) if i != state_index]

    def compute_state_index(self, state):
        """Maps a state vector to its row in the Q-table.

        A vector summing to 1 maps to the position of its first 1. Otherwise
        the position of the first 1 is offset by `num_actions`.
        """
        state_index = np.where(np.squeeze(state) == 1)[0][0]
        if np.sum(state) == 1:
            return state_index
        # State vector contains an extra bit at the end.
        return self.num_actions + state_index

    def sample(self, state):
        """Returns an epsilon-greedy action and advances the step counter.

        NOTE(review): the original code looked up a decayed epsilon from
        `self.epsilons` and then immediately overwrote it, so exploration has
        always used the constant `self.epsilon`. That behaviour is kept; wire
        in the schedule here if decay is actually wanted.
        """
        state_index = self.compute_state_index(state)
        q_values = self.q_table[state_index]
        self.curr_time_step += 1
        if random.random() < self.epsilon:
            return random.randint(0, self.num_actions - 1)
        return np.argmax(q_values)

    def best_action(self, state):
        """Returns the greedy action for `state`."""
        state_index = self.compute_state_index(state)
        q_values = self.q_table[state_index]

        return np.argmax(q_values)

    def store(self, state, action, reward, next_state, terminal, eval, curr_reward):
        """Remembers the transition for the next `update` call unless `eval` is set.

        `curr_reward` is kept alongside it and only controls the debug
        printing in `update`.
        """
        if not eval:
            self.curr_transition = [state, action, reward, next_state, terminal]
            self.curr_reward = curr_reward

    def update(self):
        """Applies one Q-learning update using the last stored transition.

        Does nothing when no transition has been stored yet, instead of
        failing on the initial None.
        """
        if self.curr_transition is None:
            return

        state, action, reward, next_state, terminal = self.curr_transition

        state_index = self.compute_state_index(state)
        next_state_index = self.compute_state_index(next_state)

        # One-step TD target; the bootstrap term is dropped on terminal transitions.
        td_target = reward + (1 - terminal) * self.DISCOUNT * np.max(self.q_table[next_state_index])

        if self.curr_reward >= 1:
            print("Updating!")
            print(state)
            print(action)
            print(reward)
            print(next_state)
            print(terminal)
            print(td_target)
            print("")

        self.q_table[state_index, action] = (
            1 - self.learning_rate) * self.q_table[state_index, action] + self.learning_rate * td_target

#### lstm_dqn

In [19]:
class LstmDqnAgent(DqnAgent):
    """Implementation of DQN with an RNN for the q-network."""

    def __init__(self, sequence_length=0, *args, **kwargs):
        """Stores `sequence_length` and defers everything else to `DqnAgent`.

        `sequence_length` is only recorded here; nothing in this class reads it.
        """
        self._sequence_length = sequence_length
        super(LstmDqnAgent, self).__init__(*args, **kwargs)

    def _q_network(self, state):
        """Builds the recurrent Q-network: embedding -> LSTM -> linear layer.

        Args:
            state: integer tensor of ids fed from the state placeholder.
                Assumes ids lie in [0, num_actions], matching the embedding
                table size - TODO confirm against the caller.
        Returns:
            Tensor with `num_actions` Q-values per input row.
        """
        embeddings = tf.get_variable('embeddings',
            [self._num_actions + 1, 128])

        embedded_ids = tf.gather(embeddings, state)

        lstm = tf.contrib.rnn.BasicLSTMCell(128)

        # dynamic_rnn returns (per-step outputs, final state); for this cell
        # the final state is a (c, h) tuple and only h feeds the output layer.
        outputs, final_state = tf.nn.dynamic_rnn(
            cell=lstm, inputs=embedded_ids, dtype=tf.float32)

        q_values = tf.contrib.layers.fully_connected(
            final_state[1], self._num_actions, activation_fn=None)

        return q_values

    def _construct_graph(self):
        """Same graph as `DqnAgent`, except that the state placeholder is int32."""
        shape = [None] + list(self._state_dims)
        self._state = tf.placeholder(shape=shape, dtype=tf.int32)

        with tf.variable_scope('q_network'):
            self._q_values = self._q_network(self._state)
        with tf.variable_scope('target_q_network'):
            self._target_q_values = self._q_network(self._state)
        with tf.variable_scope('q_network_update'):
            # Each row of _picked_actions is (row in batch, action index).
            self._picked_actions = tf.placeholder(shape=[None, 2], dtype=tf.int32)
            self._td_targets = tf.placeholder(shape=[None], dtype=tf.float32)
            self._q_values_pred = tf.gather_nd(self._q_values, self._picked_actions)
            self._losses = clipped_error(self._q_values_pred - self._td_targets)
            self._loss = tf.reduce_mean(self._losses)

            self.optimizer = tf.train.RMSPropOptimizer(self._learning_rate)

            grads_and_vars = self.optimizer.compute_gradients(self._loss, tf.trainable_variables())

            grads = [gv[0] for gv in grads_and_vars]
            params = [gv[1] for gv in grads_and_vars]

            grads = tf.clip_by_global_norm(grads, 5.0)[0]
            # apply_gradients expects a plain list of (gradient, variable)
            # pairs, as in DqnAgent; do not wrap the tensors in a numpy array.
            clipped_grads_and_vars = list(zip(grads, params))
            self.train_op = self.optimizer.apply_gradients(clipped_grads_and_vars,
                global_step=tf.contrib.framework.get_global_step())

        with tf.name_scope('target_network_update'):
            # Pair online and target variables by sorted name so that each
            # assign copies a weight into its counterpart.
            q_network_params = [t for t in tf.trainable_variables() if t.name.startswith(
                'q_network')]
            q_network_params = sorted(q_network_params, key=lambda v: v.name)

            target_q_network_params = [t for t in tf.trainable_variables() if t.name.startswith(
                'target_q_network')]
            target_q_network_params = sorted(target_q_network_params, key=lambda v: v.name)

            self.target_update_ops = []
            for e1_v, e2_v in zip(q_network_params, target_q_network_params):
                op = e2_v.assign(e1_v)
                self.target_update_ops.append(op)

#### hierarchical_dqn.py

In [20]:
class HierarchicalDqnAgent(object):
    """Two-level agent: a meta-controller picks subgoals, a controller picks primitive actions.

    The controller is trained on an intrinsic reward for reaching the current
    subgoal; the meta-controller is trained on the (scaled) extrinsic reward
    accumulated while a subgoal is being pursued.
    """

    INTRINSIC_STEP_COST = -1    # Step cost for the controller.

    INTRINSIC_TIME_OUT = 50             # Number of steps after which intrinsic episode ends.
    INTRINSIC_TIME_OUT_PENALTY = -10    # Penalty given to controller for timing out episode.

    ARTIFICIAL_PENALTY = -100   # Penalty given to the meta-controller for telling the
                                # agent to go to the same cluster it is already in.
    EXTRA_TRAVEL_PENALTY = -1   # Penalty given to meta-controller if controller agent
                                # travels through additional clusters to get to target cluster.
    PRETRAIN_EPISODES = 100

    def __init__(self,
                 learning_rates=[0.1, 0.00025],
                 state_sizes=[0, 0],
                 agent_types=['network', 'network'],
                 subgoals=None,
                 num_subgoals=0,
                 num_primitive_actions=0,
                 meta_controller_state_fn=None,
                 check_subgoal_fn=None,
                 use_extra_travel_penalty=False,
                 use_extra_bit_for_subgoal_center=False,
                 use_controller_dqn=False,
                 use_intrinsic_timeout=False,
                 use_memory=False,
                 memory_size=0,
                 pretrain_controller=False):
        """Initializes a hierarchical DQN agent.
           Args:
            learning_rates: learning rates of the meta-controller and controller agents.
            state_sizes: state sizes of the meta-controller and controller agents.
                The list is copied, never modified.
            agent_types: type of each agent - either tabular QLearning agent or Deep Q Network.
                Not read by this constructor.
            subgoals: array of subgoals for the meta-controller.
            num_subgoals: the action space of the meta-controller.
            num_primitive_actions: the action space of the controller.
            meta_controller_state_fn: function that returns the state of the meta-controller.
            check_subgoal_fn: function that checks if agent has satisfied a particular subgoal.
            use_extra_travel_penalty: whether or not to penalize the meta-controller for bad instructions.
            use_extra_bit_for_subgoal_center: whether or not to use an extra bit to indicate whether
                                              agent is at center of a particular cluster.
            use_controller_dqn: whether to use regular dqn or controller dqn for the controller.
            use_intrinsic_timeout: whether or not to intrinsically timeout the controller.
            use_memory: if True the meta-controller is an LstmDqnAgent fed the history of
                        recent cluster ids; otherwise it is a tabular QLearningAgent.
            memory_size: length of that history.
            pretrain_controller: if True, meta-controller transitions are not stored during
                                 the first PRETRAIN_EPISODES training episodes.
        """
        print("h-DQN")
        for label, value in (
                ("Use extra travel penalty:", use_extra_travel_penalty),
                ("Use extra bit for subgoal center:", use_extra_bit_for_subgoal_center),
                ("Use controller dqn:", use_controller_dqn),
                ("Use intrinsic timeout:", use_intrinsic_timeout),
                ("Use memory:", use_memory),
                ("Memory size:", memory_size),
                ("Pretrain Controller:", pretrain_controller)):
            print(label)
            print(value)

        # Work on a copy: the size is doubled below, and writing into the
        # argument would corrupt the caller's list (or the shared default).
        state_sizes = list(state_sizes)

        if not use_extra_travel_penalty:
            self.EXTRA_TRAVEL_PENALTY = 0

        if use_extra_bit_for_subgoal_center:
            self.ARTIFICIAL_PENALTY = 0
            state_sizes[0] = state_sizes[0] * 2

        if not pretrain_controller:
            self.PRETRAIN_EPISODES = 0

        if use_memory:
            print("Decaying meta-controller epsilon faster!")
            self._meta_controller = LstmDqnAgent(num_actions=num_subgoals,
                                                 state_dims=[memory_size],
                                                 sequence_length=memory_size,
                                                 replay_memory_init_size=100,
                                                 target_update=100,
                                                 epsilon_end=0.01,
                                                 epsilon_decay_steps=5000)
        else:
            self._meta_controller = QLearningAgent(num_states=state_sizes[0],
                                                   num_actions=num_subgoals,
                                                   learning_rate=learning_rates[0],
                                                   epsilon=0.1)
        if use_controller_dqn:
            self._controller = ControllerDqnAgent(learning_rate=learning_rates[1],
                num_actions=num_primitive_actions,
                state_dims=state_sizes[1],
                subgoal_dims=[num_subgoals])
        else:
            print("Epsilon end for controller is 0.01!")
            # The controller sees the environment state concatenated with the subgoal.
            self._controller = DqnAgent(learning_rate=learning_rates[1],
                num_actions=num_primitive_actions,
                state_dims=[state_sizes[1][0] + num_subgoals],
                epsilon_end=0.01) # CHANGED

        self._subgoals = subgoals
        self._num_subgoals = num_subgoals

        self._meta_controller_state_fn = meta_controller_state_fn
        self._check_subgoal_fn = check_subgoal_fn

        self._use_extra_bit_for_subgoal_center = use_extra_bit_for_subgoal_center
        self._use_controller_dqn = use_controller_dqn

        self._use_intrinsic_timeout = use_intrinsic_timeout

        self._use_memory = use_memory
        self._memory_size = memory_size

        # State of the meta-transition currently in progress (None between subgoals).
        self._meta_controller_state = None
        self._curr_subgoal = None
        self._meta_controller_reward = 0
        self._intermediate_clusters = []
        self._intermediate_dict = defaultdict(int)
        self._intermediate_clusters_dict = defaultdict(int)
        # History of cluster ids (stored as id + 1; 0 marks an empty slot).
        self._history = [0 for i in range(self._memory_size)]

        # Only used if use_extra_bit_for_subgoal_center is True.
        self._original_state = None

        self._next_meta_controller_state = None

        self._intrinsic_time_step = 0

        self._episode = 0

    def update_history(self, state):
        """Drops the oldest history entry and appends the current cluster id (+1)."""
        returned_state = state
        if self._meta_controller_state_fn:
            returned_state = self._meta_controller_state_fn(state, self._original_state)

        current_cluster_id = np.where(np.squeeze(returned_state) == 1)[0][0] + 1
        new_history = self._history[1:]
        new_history.append(current_cluster_id)
        self._history = new_history

    def get_meta_controller_state(self, state):
        """Returns what the meta-controller observes for environment state `state`.

        With memory enabled this is a copy of the cluster history; otherwise it
        is `meta_controller_state_fn(state, original_state)` when such a
        function was given, else `state` itself.
        """
        returned_state = state
        if self._meta_controller_state_fn:
            returned_state = self._meta_controller_state_fn(state, self._original_state)

        if self._use_memory:
            returned_state = self._history[:]

        return returned_state

    def get_controller_state(self, state, subgoal_index):
        """Returns `state[0]` concatenated with the chosen subgoal, as a 1-row array."""
        curr_subgoal = self._subgoals[subgoal_index]

        # Concatenate the environment state with the subgoal.
        controller_state = list(state[0])
        controller_state.extend(curr_subgoal[i] for i in range(len(curr_subgoal)))
        return np.array([controller_state])

    def intrinsic_reward(self, state, subgoal_index):
        """Reward for the controller: timeout penalty, 1 on subgoal completion, else step cost."""
        if self._use_intrinsic_timeout and self._intrinsic_time_step >= self.INTRINSIC_TIME_OUT:
            return self.INTRINSIC_TIME_OUT_PENALTY
        if self.subgoal_completed(state, subgoal_index):
            return 1
        else:
            return self.INTRINSIC_STEP_COST

    def subgoal_completed(self, state, subgoal_index):
        """Tells whether the controller's intrinsic episode is over.

        An intrinsic timeout (when enabled) always counts as completion.
        Without a `check_subgoal_fn` the state is compared to the subgoal
        directly; otherwise the check is delegated to that function.
        """
        if self._use_intrinsic_timeout and self._intrinsic_time_step >= self.INTRINSIC_TIME_OUT:
            return True

        if self._check_subgoal_fn is None:
            return state == self._subgoals[subgoal_index]

        # The chosen subgoal is the cluster flagged in the meta-controller state.
        if not self._use_memory and self._meta_controller_state[self._curr_subgoal] == 1:
            # NOTE(review): more than one bit set presumably means the extra
            # "at subgoal center" bit is on - confirm against the state fn.
            if np.sum(self._meta_controller_state) > 1:
                return False

            return self._check_subgoal_fn(state, subgoal_index, self._original_state)

        return self._check_subgoal_fn(state, subgoal_index)

    def _print_eval_summary(self, subgoal_completed, next_state, intrinsic_reward,
                            next_meta_controller_state):
        """Prints the end-of-subgoal / end-of-episode diagnostics shown during eval."""
        if subgoal_completed:
            header, clusters_label, count_label = (
                "Subgoal completed!", "Intermediate Clusters:", "Intermediate Cluster Count:")
        else:
            header, clusters_label, count_label = (
                "Terminal!", "Intermediate clusters:", "Intermediate cluster count:")

        print(header)
        print(clusters_label)
        print(self._intermediate_clusters)
        print(count_label)
        print(self._intermediate_dict)
        print("Intermediate non-beginning cluster count:")
        print(self._intermediate_clusters_dict)
        print("State:")
        print(next_state)
        print("Meta-Controller reward:")
        print(self._meta_controller_reward)
        print("Intrinsic reward:")
        print(intrinsic_reward)
        print("Cluster:")
        print(next_meta_controller_state)
        print("")
        print("")

    def store(self, state, action, reward, next_state, terminal, eval=False):
        """Stores the current transition in replay memory.
           The transition is stored in the replay memory of the controller.
           If the transition culminates in a subgoal's completion or a terminal state, a
           transition for the meta-controller is constructed and stored in its replay buffer.
           Args:
            state: current state
            action: primitive action taken
            reward: reward received from state-action pair
            next_state: next state
            terminal: extrinsic terminal (True or False)
            eval: Whether the current episode is a train or eval episode.
        """

        self._meta_controller_reward += reward
        self._intrinsic_time_step += 1

        # Compute the controller state, reward, next state, and terminal.
        intrinsic_state = self.get_controller_state(state, self._curr_subgoal)
        intrinsic_next_state = self.get_controller_state(next_state, self._curr_subgoal)
        intrinsic_reward = self.intrinsic_reward(next_state, self._curr_subgoal)
        subgoal_completed = self.subgoal_completed(next_state, self._curr_subgoal)
        intrinsic_terminal = subgoal_completed or terminal

        self._controller.store(np.copy(intrinsic_state), action,
            intrinsic_reward, np.copy(intrinsic_next_state), intrinsic_terminal, eval)

        # Check for intermediate state.
        intermediate_meta_controller_state = self.get_meta_controller_state(next_state)

        if not self._use_memory:
            intermediate_cluster_id = np.where(np.squeeze(intermediate_meta_controller_state) == 1)[0][0]
        else:
            # History entries are cluster id + 1.
            intermediate_cluster_id = intermediate_meta_controller_state[-1] - 1

        self._intermediate_dict[intermediate_cluster_id] += 1
        # Agent is traveling through a cluster that is not the starting or ending cluster.
        # TODO: flagged "FIX THIS" by the original author; the comparison below is kept as-is.
        if list(intermediate_meta_controller_state[0:self._num_subgoals]) != list(
            self._meta_controller_state[0:self._num_subgoals]) and not subgoal_completed:
            self._meta_controller_reward += self.EXTRA_TRAVEL_PENALTY

            self._intermediate_clusters.append(intermediate_cluster_id)
            self._intermediate_clusters_dict[intermediate_cluster_id] += 1

        if terminal and not eval:
            self._episode += 1

        if subgoal_completed or terminal:
            # Normalize the meta-controller reward.
            self._meta_controller_reward /= 100.0

            meta_controller_state = np.copy(self._meta_controller_state)
            if not self._use_memory:
                next_meta_controller_state = self.get_meta_controller_state(next_state)
            else:
                # Preview the history as it would look after visiting next_state,
                # without committing it to self._history.
                returned_state = self._meta_controller_state_fn(next_state, self._original_state)
                current_cluster_id = np.where(np.squeeze(returned_state) == 1)[0][0] + 1
                new_history = self._history[1:]
                new_history.append(current_cluster_id)
                next_meta_controller_state = new_history

            # Meta-controller transitions are withheld during controller pre-training.
            if self._episode >= self.PRETRAIN_EPISODES:
                self._meta_controller.store(np.copy(meta_controller_state), self._curr_subgoal,
                    self._meta_controller_reward, np.copy(next_meta_controller_state),
                    terminal, eval, reward)

            if eval:
                self._print_eval_summary(
                    subgoal_completed, next_state, intrinsic_reward, next_meta_controller_state)

            # Reset the current meta-controller state and current subgoal to be None
            # since the current subgoal is finished. Also reset the meta-controller's reward.
            self._next_meta_controller_state = np.copy(next_meta_controller_state)

            if terminal:
                self._next_meta_controller_state = None

            self._meta_controller_state = None
            self._curr_subgoal = None
            self._meta_controller_reward = 0

            self._intermediate_clusters = []
            self._intermediate_dict = defaultdict(int)
            self._intermediate_clusters_dict = defaultdict(int)

            self._original_state = None
            self._intrinsic_time_step = 0

            if terminal:
                self._history = [0 for i in range(self._memory_size)]

    def _start_subgoal(self, state, choose_subgoal):
        """Fixes the meta-controller state, picks a subgoal and applies the same-cluster penalty.

        Args:
            state: the current environment state.
            choose_subgoal: callable taking `[meta_controller_state]` and
                returning a subgoal index (the meta-controller's `sample` or
                `best_action`).
        """
        if self._use_memory:
            self.update_history(state)

        if self._next_meta_controller_state is not None and not self._use_memory:
            self._meta_controller_state = self._next_meta_controller_state
        else:
            self._meta_controller_state = self.get_meta_controller_state(state)

        self._curr_subgoal = choose_subgoal([self._meta_controller_state])

        # Artificially penalize the meta-controller for picking the subgoal to
        # be the same as the current cluster.
        if self._use_memory:
            same_cluster_instruction = (self._meta_controller_state[-1] - 1) == self._curr_subgoal
        else:
            same_cluster_instruction = self._meta_controller_state[self._curr_subgoal] == 1

        if same_cluster_instruction:
            self._meta_controller_reward = self.ARTIFICIAL_PENALTY
            self._original_state = state

    def sample(self, state):
        """Samples an action from the hierarchical DQN agent.
           Samples a subgoal if necessary from the meta-controller and samples a primitive action
           from the controller.
           Args:
            state: the current environment state.
           Returns:
            action: a primitive action.
        """
        if self._meta_controller_state is None:
            self._start_subgoal(state, self._meta_controller.sample)

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.sample(controller_state)

        return action

    def best_action(self, state):
        """Returns the greedy action from the hierarchical DQN agent.
           Gets the greedy subgoal if necessary from the meta-controller and gets
           the greedy primitive action from the controller.
           Args:
            state: the current environment state.
           Returns:
            action: the controller's greedy primitive action.
            returned_info: [meta-controller state, subgoal] when a new subgoal was
                picked during this call, otherwise None.
        """
        returned_info = None

        if self._meta_controller_state is None:
            self._start_subgoal(state, self._meta_controller.best_action)

            returned_info = [self._meta_controller_state, self._curr_subgoal]

            print("Current State:")
            print(state)
            print("Current Meta-Controller State:")
            print(self._meta_controller_state)
            print("Current subgoal picked:")
            print(self._curr_subgoal)

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.best_action(controller_state)
        return action, returned_info

    def update(self):
        """Trains the controller, and the meta-controller when a meta-transition just ended."""
        self._controller.update()
        # Only update meta-controller right after a meta-controller transition has taken place,
        # which occurs only when either a subgoal has been completed or the agent has reached a
        # terminal state.
        if self._meta_controller_state is None:
            self._meta_controller.update()

#### controller_dqn.py

In [21]:
class ControllerDqnAgent(DqnAgent):
    """DQN agent whose Q-network is conditioned on a subgoal.

    The network input is one flat vector holding the environment state followed
    by the subgoal encoding. Inside the graph the vector is split back into its
    two parts, each part gets its own first hidden layer, and the two hidden
    representations are concatenated before the Q-value output layer.
    """

    def __init__(self, subgoal_dims=None, *args, **kwargs):
        """Store the subgoal dimensions and delegate everything else to DqnAgent.

        Args:
            subgoal_dims: Sequence whose first entry is the length of the subgoal
                vector. ``None`` (the default) is treated as an empty list.
            *args: Positional arguments forwarded to ``DqnAgent.__init__``.
            **kwargs: Keyword arguments forwarded to ``DqnAgent.__init__``.
        """
        # A fresh list per instance avoids sharing one mutable default object.
        self._subgoal_dims = [] if subgoal_dims is None else subgoal_dims
        # Kept before the base constructor call, as in the original ordering
        # (presumably the base constructor builds the graph -- TODO confirm).
        super(ControllerDqnAgent, self).__init__(*args, **kwargs)

    def _q_network(self, state, subgoal):
        """Build the Q-value head for a (state, subgoal) pair.

        Args:
            state: Tensor with the environment-state part of the input.
            subgoal: Tensor with the subgoal part of the input.

        Returns:
            Tensor of Q-values with ``self._num_actions`` outputs per row.
        """
        state_layer1 = tf.contrib.layers.fully_connected(state, 64, activation_fn=tf.nn.relu)
        subgoal_layer1 = tf.contrib.layers.fully_connected(subgoal, 64, activation_fn=tf.nn.relu)

        layer1 = tf.concat([state_layer1, subgoal_layer1], axis=1)

        q_values = tf.contrib.layers.fully_connected(layer1, self._num_actions, activation_fn=None)

        return q_values

    def _construct_graph(self):
        """Create placeholders, online/target Q-networks, the train op and target-sync ops."""
        state_shape = self._state_dims[0]
        subgoal_shape = self._subgoal_dims[0]

        # One flat input placeholder; the first `state_shape` columns are the
        # state and the remaining `subgoal_shape` columns are the subgoal.
        self._state = tf.placeholder(shape=[None, state_shape + subgoal_shape],
            dtype=tf.float32)
        self._controller_state, self._subgoal = tf.split(
            self._state, [state_shape, subgoal_shape], axis=1)

        with tf.variable_scope('q_network'):
            self._q_values = self._q_network(self._controller_state, self._subgoal)
        with tf.variable_scope('target_q_network'):
            self._target_q_values = self._q_network(self._controller_state, self._subgoal)
        with tf.variable_scope('q_network_update'):
            # Each row of `_picked_actions` is an index pair into the 2-D
            # `_q_values` tensor (row, action column), consumed by gather_nd.
            self._picked_actions = tf.placeholder(shape=[None, 2], dtype=tf.int32)
            self._td_targets = tf.placeholder(shape=[None], dtype=tf.float32)

            self._q_values_pred = tf.gather_nd(self._q_values, self._picked_actions)

            # `clipped_error` is defined elsewhere in this notebook.
            self._losses = clipped_error(self._q_values_pred - self._td_targets)
            self._loss = tf.reduce_mean(self._losses)

            self.optimizer = tf.train.RMSPropOptimizer(self._learning_rate)

            grads_and_vars = self.optimizer.compute_gradients(self._loss, tf.trainable_variables())

            grads = [gv[0] for gv in grads_and_vars]
            params = [gv[1] for gv in grads_and_vars]

            # Clip by the global norm across all gradients.
            grads = tf.clip_by_global_norm(grads, 5.0)[0]

            # Pass a plain list of (gradient, variable) pairs: wrapping symbolic
            # tensors in np.array() is unnecessary and makes NumPy try to coerce
            # graph tensors into array elements.
            clipped_grads_and_vars = list(zip(grads, params))
            self.train_op = self.optimizer.apply_gradients(clipped_grads_and_vars,
                global_step=tf.contrib.framework.get_global_step())

        with tf.name_scope('target_network_update'):
            # Pair online and target variables by sorted name so that each
            # target variable is overwritten by its online counterpart.
            q_network_params = [t for t in tf.trainable_variables() if t.name.startswith(
                'q_network')]
            q_network_params = sorted(q_network_params, key=lambda v: v.name)

            target_q_network_params = [t for t in tf.trainable_variables() if t.name.startswith(
                'target_q_network')]
            target_q_network_params = sorted(target_q_network_params, key=lambda v: v.name)

            self.target_update_ops = []
            for e1_v, e2_v in zip(q_network_params, target_q_network_params):
                op = e2_v.assign(e1_v)
                self.target_update_ops.append(op)

### Functional

#### clustering.py

In [22]:
def make_clusters(env_name, n_clusters, num_episodes=1000):
    """Collect random-policy states, cluster them with K-means and persist the result.

    Episodes are rolled out with uniformly random actions. Every visited
    observation (columns 0 and 1, plotted as 'X Position' and 'Velocity') is
    recorded, standardised per column and clustered. A scatter plot and three
    pickle files (``data``, ``labels``, ``cluster_centers``) are written to the
    directory ``clusters_<n_clusters>``, which is created if missing.

    Args:
        env_name: Gym environment id passed to ``gym.make``.
        n_clusters: Number of K-means clusters.
        num_episodes: Number of random episodes to sample (default 1000).

    Returns:
        Tuple ``(data, labels, cluster_centers)``: the un-normalised
        two-column samples, their int32 cluster labels, and the cluster
        centers mapped back to un-normalised units.
    """
    env = gym.make(env_name)
    try:
        num_actions = env.action_space.n
        samples = []
        for _ in range(num_episodes):
            env.reset()
            done = False
            while not done:
                action = random.randint(0, num_actions - 1)
                next_state, reward, done, _ = env.step(action)
                # The reward is stored next to the observation but is not
                # used for clustering below.
                samples.append(np.append(next_state, reward))
    finally:
        # Release the environment even if sampling fails part-way.
        env.close()

    data = np.array(samples)
    positions = data[:, 0]
    velocities = data[:, 1]
    means = [np.mean(positions), np.mean(velocities)]
    stds = [np.std(positions), np.std(velocities)]

    # Standardise each column so both contribute comparably to the distances.
    data_normalized = np.column_stack((
        (positions - means[0]) / stds[0],
        (velocities - means[1]) / stds[1]))

    estimator = KMeans(n_clusters=n_clusters)
    estimator.fit(data_normalized)

    # Undo the standardisation so the centers live in the original units.
    cluster_centers = estimator.cluster_centers_[:, 0:2] * np.array(stds) + np.array(means)
    labels = estimator.labels_.astype(np.int32)

    cluster_dir = 'clusters_' + str(n_clusters)
    os.makedirs(cluster_dir, exist_ok=True)

    colors = ['red', 'green', 'blue', 'orange',
              'yellow', 'magenta', 'black',
              'purple', 'brown', 'white']
    fig, ax = plt.subplots()
    for label in range(n_clusters):
        members = labels == label
        ax.scatter(data[members, 0], data[members, 1], c=colors[label % len(colors)],
            label=int(label), alpha=0.5)
    ax.legend()
    ax.set_xlabel('X Position')
    ax.set_ylabel('Velocity')
    fig.savefig(cluster_dir + '/Clusters.png')
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    returned_data = data[:, 0:2].copy()

    with open(cluster_dir + '/data', 'wb') as data_file:
        pickle.dump(returned_data, data_file)
    with open(cluster_dir + '/labels', 'wb') as labels_file:
        pickle.dump(labels, labels_file)
    with open(cluster_dir + '/cluster_centers', 'wb') as cluster_centers_file:
        pickle.dump(cluster_centers, cluster_centers_file)

    return returned_data, labels, cluster_centers


def get_cluster_fn(env_name='MountainCar-v0', n_clusters=10, extra_bit=True, load_from_dir=True,
                   goal_position=0.5, goal_cluster_index=5, fallback_cluster_index=3):
    """Build the cluster-identification and subgoal-check functions.

    Cluster data is either loaded from the pickles in ``clusters_<n_clusters>``
    or regenerated with ``make_clusters``. A 1-nearest-neighbour classifier
    fitted on that data assigns a cluster to any new point.

    The prediction is then overridden ("cheating for the goal cluster area"):
    a point whose first coordinate is ``>= goal_position`` is always assigned
    ``goal_cluster_index``, and any other point predicted as
    ``goal_cluster_index`` is reassigned to ``fallback_cluster_index``.

    Args:
        env_name: Gym environment id, used only when clusters are regenerated.
        n_clusters: Number of clusters.
        extra_bit: If True, the one-hot cluster encoding gets one extra trailing
            entry that is set when the point passes the center-ratio test.
        load_from_dir: Load pickled clusters instead of regenerating them.
        goal_position: Threshold on the first coordinate for the goal override
            (default 0.5, the original hard-coded value).
        goal_cluster_index: Cluster index forced for goal points (default 5, the
            original hard-coded value). It must be smaller than ``n_clusters``
            for the one-hot encoding to be meaningful.
        fallback_cluster_index: Cluster index used for non-goal points that were
            predicted as the goal cluster (default 3, the original value).

    Returns:
        Tuple ``(identify_cluster, check_cluster, n_clusters, clusters_one_hot)``
        where ``clusters_one_hot`` is the ``n_clusters x n_clusters`` identity
        matrix (one one-hot row per cluster).
    """
    if load_from_dir:
        cluster_dir = 'clusters_' + str(n_clusters)
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # cluster files that were produced locally by make_clusters.
        with open(cluster_dir + '/data', 'rb') as data_file:
            data = pickle.load(data_file)
        with open(cluster_dir + '/labels', 'rb') as labels_file:
            labels = pickle.load(labels_file)
        with open(cluster_dir + '/cluster_centers', 'rb') as cluster_centers_file:
            cluster_centers = pickle.load(cluster_centers_file)
    else:
        data, labels, cluster_centers = make_clusters(env_name, n_clusters)

    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(data, labels)

    ratio = 0.5
    # Lower bound on the denominator of the ratio test to avoid dividing by zero.
    min_distance = np.exp(-10)

    def _predict_cluster(data_point):
        """Nearest-neighbour cluster of `data_point` with the goal-area override applied."""
        cluster_index = neigh.predict(data_point)[0]
        position = np.squeeze(data_point)[0]
        # Cheating for the goal cluster area!
        if position >= goal_position:
            cluster_index = goal_cluster_index
        elif position < goal_position and cluster_index == goal_cluster_index:
            cluster_index = fallback_cluster_index
        return cluster_index

    def _is_near_center(data_point, cluster_index, original_point):
        """True when distance-to-center / distance-to-original_point is at most `ratio`."""
        distance_to_boundary = euclidean_distance(data_point, original_point)
        distance_to_center = euclidean_distance(data_point, cluster_centers[cluster_index])
        # Built-in float replaces the removed `np.float` alias.
        return float(distance_to_center) / np.maximum(distance_to_boundary, min_distance) <= ratio

    def check_cluster(data_point, cluster_index, original_point=None):
        """Check whether `data_point` satisfies the subgoal `cluster_index`.

        Without the extra bit (or without `original_point`) this is a plain
        cluster-membership test; otherwise it is the center-ratio test.
        """
        if not extra_bit or original_point is None:
            return cluster_index == _predict_cluster(data_point)
        return _is_near_center(data_point, cluster_index, original_point)

    def identify_cluster(data_point, original_point):
        """Return the one-hot cluster encoding of `data_point`.

        With `extra_bit`, the vector has `n_clusters + 1` entries and the last
        one is set when `original_point` is given and the ratio test passes.
        """
        cluster_index = _predict_cluster(data_point)

        cluster_one_hot = np.zeros(n_clusters + 1 if extra_bit else n_clusters)
        cluster_one_hot[cluster_index] = 1

        if extra_bit and original_point is not None:
            if _is_near_center(data_point, cluster_index, original_point):
                cluster_one_hot[-1] = 1

        return cluster_one_hot

    return identify_cluster, check_cluster, n_clusters, np.eye(n_clusters)


def euclidean_distance(point1, point2):
    """Return the Euclidean distance between the first two coordinates of two points.

    Both inputs are squeezed first, so shapes such as ``(1, 2)`` and ``(2,)``
    are accepted interchangeably.
    """
    first, second = np.squeeze(point1), np.squeeze(point2)
    delta_0 = first[0] - second[0]
    delta_1 = first[1] - second[1]
    return np.sqrt(delta_0 * delta_0 + delta_1 * delta_1)

#### train.py

In [None]:
# Experiment configuration. These settings replace the former tf.flags
# command-line flags; each comment is the help text of the original flag.

# RL agent type.
agent_type = "h_dqn"
# Number of clusters to form in unsupervised training.
n_clusters = 6
# Directory of logfile.
logdir = 'experiment_logs/Cheating_Epsilon_Decay_Faster/'
# Directory of experiment files.
experiment_dir = ""
# Name of the logfile.
logfile = "log.txt"
# Name of the environment.
env_name = 'MountainCar-v0'
# Whether or not to penalize meta-controller for sending agent to non-adjacent clusters.
use_extra_travel_penalty = False
# Whether or not the meta-controller state contains an extra bit which indicates
# whether or not the agent is near the center of a particular cluster.
use_extra_bit = False
# Whether to use a controller dqn as opposed to normal dqn for the controller.
use_controller_dqn = False
# Whether or not to intrinsically timeout controller agent.
use_intrinsic_timeout = False
# Whether or not the meta-controller should use memory.
use_memory = False
# Size of the LSTM memory.
memory_size = 5
# Whether or not to pretrain the controller.
pretrain_controller = False
# Run number.
run_number = 1

def log(logfile, iteration, rewards):
    """Print and append one line of reward statistics for an iteration.

    The line has the form ``<iteration> <min> <mean> <max>``.

    Args:
        logfile: Path of the file the line is appended to.
        iteration: The current iteration.
        rewards: Array of rewards obtained in the current iteration.
    """
    stats = (iteration, np.min(rewards), np.mean(rewards), np.max(rewards))
    summary_line = '{} {} {} {}'.format(*stats)
    print(summary_line)

    with open(logfile, 'a') as handle:
        handle.write(summary_line + '\n')

def make_environment(env_name):
    """Create and return the Gym environment registered under `env_name`."""
    environment = gym.make(env_name)
    return environment

def make_agent(agent_type, env, num_clusters, use_extra_travel_penalty, use_extra_bit,
    use_controller_dqn, use_intrinsic_timeout, use_memory, memory_size, pretrain_controller):
    """Construct the agent selected by `agent_type`.

    Both agent types are built with 2 primitive actions rather than
    ``env.action_space.n``; the training loop in this notebook maps agent
    action 1 onto environment action 2 to drop the do-nothing action.

    Args:
        agent_type: ``'dqn'`` for a flat DQN agent or ``'h_dqn'`` for the
            hierarchical agent.
        env: Environment instance. Currently unused.
        num_clusters: Number of clusters (subgoals) for the hierarchical agent.
        use_extra_travel_penalty: Forwarded to ``HierarchicalDqnAgent``.
        use_extra_bit: Whether the cluster encoding carries the extra "near
            center" bit; forwarded to both ``get_cluster_fn`` and the agent.
        use_controller_dqn: Forwarded to ``HierarchicalDqnAgent``.
        use_intrinsic_timeout: Forwarded to ``HierarchicalDqnAgent``.
        use_memory: Forwarded to ``HierarchicalDqnAgent``.
        memory_size: Forwarded to ``HierarchicalDqnAgent``.
        pretrain_controller: Forwarded to ``HierarchicalDqnAgent``.

    Returns:
        A ``DqnAgent`` or a ``HierarchicalDqnAgent``.

    Raises:
        ValueError: If `agent_type` is neither ``'dqn'`` nor ``'h_dqn'``
            (previously this silently returned ``None``).
    """
    if agent_type == 'dqn':
        return DqnAgent(state_dims=[2],
                        num_actions=2)  # env.action_space.n

    if agent_type == 'h_dqn':
        # load_from_dir=False: clusters are regenerated on every call.
        meta_controller_state_fn, check_subgoal_fn, num_subgoals, subgoals = get_cluster_fn(
            n_clusters=num_clusters,
            extra_bit=use_extra_bit,
            load_from_dir=False)

        return HierarchicalDqnAgent(
            state_sizes=[num_subgoals, [2]],
            agent_types=['tabular', 'network'],
            subgoals=subgoals,
            num_subgoals=num_subgoals,
            num_primitive_actions=2,  # env.action_space.n
            meta_controller_state_fn=meta_controller_state_fn,
            check_subgoal_fn=check_subgoal_fn,
            use_extra_travel_penalty=use_extra_travel_penalty,
            use_extra_bit_for_subgoal_center=use_extra_bit,
            use_controller_dqn=use_controller_dqn,
            use_intrinsic_timeout=use_intrinsic_timeout,
            use_memory=use_memory,
            memory_size=memory_size,
            pretrain_controller=pretrain_controller)

    raise ValueError(
        "Unknown agent_type {!r}; expected 'dqn' or 'h_dqn'.".format(agent_type))

def _to_env_action(action):
    """Map the agent's action onto the environment's, skipping the do-nothing action (1 -> 2)."""
    return 2 if action == 1 else action


def _run_train_episode(env, agent):
    """Play one training episode, storing every transition and updating the agent each step."""
    state = np.expand_dims(env.reset(), axis=0)
    terminal = False
    while not terminal:
        action = agent.sample(state)
        next_state, reward, terminal, _ = env.step(_to_env_action(action))
        next_state = np.expand_dims(next_state, axis=0)

        agent.store(state, action, reward, next_state, terminal)
        agent.update()

        state = next_state


def _run_eval_episode(env_test, agent, agent_type, use_memory, heat_map):
    """Play one greedy evaluation episode and return its (clipped) total reward.

    For the hierarchical agent, every (current cluster, chosen subgoal) pair
    reported by ``agent.best_action`` is counted in `heat_map` (in place).
    """
    state = env_test.reset()
    # Make sure that at test time, the agent starts near bottom of the hill.
    while state[0] < -0.6 or state[0] > -0.4:
        state = env_test.reset()
    state = np.expand_dims(state, axis=0)

    episode_reward = 0
    terminal = False
    while not terminal:
        if agent_type == 'dqn':
            action = agent.best_action(state)
            info = None
        else:
            action, info = agent.best_action(state)

        if agent_type == 'h_dqn' and info is not None:
            meta_state = np.squeeze(info[0])
            if not use_memory:
                curr_cluster = np.where(meta_state == 1)[0][0]
            else:
                # Cast to int: the arithmetic may yield a float, which is not
                # a valid array index.
                curr_cluster = int(meta_state[-1] - 1)
            goal = info[1]
            heat_map[curr_cluster][goal] += 1

        next_state, reward, terminal, _ = env_test.step(_to_env_action(action))
        next_state = np.expand_dims(next_state, axis=0)

        # The transition is stored with the raw reward, before clipping.
        agent.store(state, action, reward, next_state, terminal, eval=True)
        if reward > 1:
            reward = 1  # For sake of comparison.

        episode_reward += reward
        state = next_state

    return episode_reward


def run(env_name='MountainCar-v0',
        agent_type='dqn',
        num_iterations=10000000,
        num_train_episodes=100,
        num_eval_episodes=100,
        num_clusters=5,
        logdir=None,
        experiment_dir=None,
        logfile=None,
        use_extra_travel_penalty=False,
        use_extra_bit=False,
        use_controller_dqn=False,
        use_intrinsic_timeout=False,
        use_memory=False,
        memory_size=5,
        pretrain_controller=False,
        run_number=1):
    """Function that executes RL training and evaluation.

    Each iteration runs `num_train_episodes` training episodes followed by
    `num_eval_episodes` greedy evaluation episodes, then pickles the evaluation
    rewards to ``<experiment dir>/eval_rewards_<iteration>``, appends their
    min/mean/max to the log file and, for ``'h_dqn'``, saves a heat map of
    (current cluster, chosen subgoal) counts as ``heatmap_<iteration>.png``.

    Args:
        env_name: Name of the environment that the agent will interact with.
        agent_type: The type RL agent that will be used for training.
        num_iterations: Number of iterations to train for.
        num_train_episodes: Number of training episodes per iteration.
        num_eval_episodes: Number of evaluation episodes per iteration.
        num_clusters: The number of clusters to use for the h-DQN unsupervised clustering.
        logdir: Directory for log file; prefixed to the experiment directory
            name. ``None`` is treated as ``''``.
        experiment_dir: Prefix of the experiment directory name; the settings
            are appended to it. ``None`` is treated as ``''``.
        logfile: File to log the agent's performance over training, relative to
            the experiment directory. ``None`` is treated as ``'log.txt'``.
        use_extra_travel_penalty: Forwarded to ``make_agent``.
        use_extra_bit: Forwarded to ``make_agent``.
        use_controller_dqn: Forwarded to ``make_agent``.
        use_intrinsic_timeout: Forwarded to ``make_agent``.
        use_memory: Forwarded to ``make_agent``; also selects how the current
            cluster is read from the meta-controller state for the heat map.
        memory_size: Forwarded to ``make_agent``.
        pretrain_controller: Forwarded to ``make_agent``.
        run_number: Run number, used only in the experiment directory name.
    """
    print(agent_type)
    print(num_clusters)
    print(use_extra_bit)

    # The declared defaults are None, which cannot be concatenated with str.
    logdir = '' if logdir is None else logdir
    experiment_dir = '' if experiment_dir is None else experiment_dir
    logfile = 'log.txt' if logfile is None else logfile

    experiment_dir += '_agent_type_' + agent_type + '_num_clusters_' + str(
        num_clusters) + '_use_extra_travel_penalty_' + str(
        use_extra_travel_penalty) + '_use_extra_bit_' + str(
        use_extra_bit) + '_use_controller_dqn_' + str(
        use_controller_dqn) + '_use_intrinsic_timeout_' + str(
        use_intrinsic_timeout) + '_use_memory_' + str(
        use_memory) + '_memory_size_' + str(
        memory_size) + '_pretrain_controller_' + str(
        pretrain_controller) + '_run_number_' + str(run_number)

    experiment_dir = logdir + experiment_dir
    logfile = experiment_dir + '/' + logfile

    # makedirs also creates missing parent directories (e.g. `logdir` itself)
    # and does not mask unrelated errors the way a bare `except:` would.
    os.makedirs(experiment_dir, exist_ok=True)

    env = make_environment(env_name)
    env_test = make_environment(env_name)
    try:
        print('Made environment!')
        agent = make_agent(agent_type, env, num_clusters, use_extra_travel_penalty, use_extra_bit,
            use_controller_dqn, use_intrinsic_timeout, use_memory, memory_size, pretrain_controller)
        print('Made agent!')

        for it in range(num_iterations):
            # Run train episodes.
            for _ in range(num_train_episodes):
                _run_train_episode(env, agent)

            # Run eval episodes.
            heat_map = np.zeros((num_clusters, num_clusters))
            eval_rewards = [
                _run_eval_episode(env_test, agent, agent_type, use_memory, heat_map)
                for _ in range(num_eval_episodes)]

            with open(experiment_dir + '/eval_rewards_' + str(it), 'wb') as f:
                pickle.dump(eval_rewards, f)

            log(logfile, it, eval_rewards)
            if agent_type == 'h_dqn':
                fig = plt.figure()
                plt.imshow(heat_map, cmap='hot', interpolation='nearest')
                plt.savefig(experiment_dir + '/heatmap_' + str(it) + '.png')
                # One figure is created per iteration; close it so figures
                # do not pile up in memory over a long run.
                plt.close(fig)
    finally:
        env.close()
        env_test.close()

# Launch training/evaluation with the notebook-level settings. `env_name` is
# not forwarded here, so `run` falls back to its own default environment.
run(
    agent_type=agent_type,
    logdir=logdir,
    experiment_dir=experiment_dir,
    logfile=logfile,
    num_clusters=n_clusters,
    use_extra_travel_penalty=use_extra_travel_penalty,
    use_extra_bit=use_extra_bit,
    use_controller_dqn=use_controller_dqn,
    use_intrinsic_timeout=use_intrinsic_timeout,
    use_memory=use_memory,
    memory_size=memory_size,
    pretrain_controller=pretrain_controller,
    run_number=run_number,
)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[0. 0. 0. 1. 0. 0.]


Current State:
[[-0.48607906  0.00857885]]
Current Meta-Controller State:
[0. 0. 0. 1. 0. 0.]
Current subgoal picked:
3
Subgoal completed!
Intermediate Clusters:
[]
Intermediate Cluster Count:
defaultdict(<class 'int'>, {3: 1})
Intermediate non-beginning cluster count:
defaultdict(<class 'int'>, {})
State:
[[-0.47678101  0.00929805]]
Meta-Controller reward:
-1.01
Intrinsic reward:
1
Cluster:
[0. 0. 0. 1. 0. 0.]


Current State:
[[-0.47678101  0.00929805]]
Current Meta-Controller State:
[0. 0. 0. 1. 0. 0.]
Current subgoal picked:
3
Subgoal completed!
Intermediate Clusters:
[]
Intermediate Cluster Count:
defaultdict(<class 'int'>, {3: 1})
Intermediate non-beginning cluster count:
defaultdict(<class 'int'>, {})
State:
[[-0.46683294  0.00994807]]
Meta-Controller reward:
-1.01
Intrinsic reward:
1
Cluster:
[0. 0. 0. 1. 0. 0.]


Current State:
[[-0.46683294  0.00994807]]
Current Meta-Controller State:
[0. 0

#### make_plots.py

In [None]:
# Suffix used in the file name of the saved figure ('plot_<index>.png').
index = 4
# Root directory; each experiment run lives in its own sub-directory below it.
directory = 'experiments_final/experiment_logs/Cheating_Epsilon_Decay_Faster/'

# 6 clusters
# Experiment directory-name prefixes to plot; '_run_number_<k>' is appended to
# each prefix when the per-run logs are loaded.
sub_dirs = ['_agent_type_dqn_num_clusters_6_use_extra_travel_penalty_False_use_extra_bit_False_use_controller_dqn_False_use_intrinsic_timeout_False_use_memory_False_memory_size_5_pretrain_controller_False', '_agent_type_h_dqn_num_clusters_6_use_extra_travel_penalty_False_use_extra_bit_False_use_controller_dqn_False_use_intrinsic_timeout_False_use_memory_False_memory_size_5_pretrain_controller_False']
# sub_dirs = ['_agent_type_dqn_num_clusters_6_use_extra_travel_penalty_False_use_extra_bit_False_use_controller_dqn_False_use_intrinsic_timeout_False_use_memory_False_memory_size_5_pretrain_controller_False', '_agent_type_h_dqn_num_clusters_6_use_extra_travel_penalty_False_use_extra_bit_False_use_controller_dqn_False_use_intrinsic_timeout_False_use_memory_True_memory_size_5_pretrain_controller_False']
# sub_dirs = ['_agent_type_dqn_num_clusters_6_use_extra_travel_penalty_False_use_extra_bit_False_use_controller_dqn_False_use_intrinsic_timeout_False_use_memory_False_memory_size_5_pretrain_controller_False', '_agent_type_h_dqn_num_clusters_6_use_extra_travel_penalty_False_use_extra_bit_False_use_controller_dqn_False_use_intrinsic_timeout_False_use_memory_True_memory_size_10_pretrain_controller_False']

# 10 clusters
# sub_dirs = ['_agent_type_dqn_num_clusters_6_use_extra_travel_penalty_False_use_extra_bit_False_use_controller_dqn_False_use_intrinsic_timeout_False_use_memory_False_memory_size_5_pretrain_controller_False', '_agent_type_h_dqn_num_clusters_10_use_extra_travel_penalty_False_use_extra_bit_False_use_controller_dqn_False_use_intrinsic_timeout_False_use_memory_False_memory_size_5_pretrain_controller_False']
# sub_dirs = ['_agent_type_dqn_num_clusters_6_use_extra_travel_penalty_False_use_extra_bit_False_use_controller_dqn_False_use_intrinsic_timeout_False_use_memory_False_memory_size_5_pretrain_controller_False', '_agent_type_h_dqn_num_clusters_10_use_extra_travel_penalty_False_use_extra_bit_False_use_controller_dqn_False_use_intrinsic_timeout_False_use_memory_True_memory_size_5_pretrain_controller_False']
# sub_dirs = ['_agent_type_dqn_num_clusters_6_use_extra_travel_penalty_False_use_extra_bit_False_use_controller_dqn_False_use_intrinsic_timeout_False_use_memory_False_memory_size_5_pretrain_controller_False', '_agent_type_h_dqn_num_clusters_10_use_extra_travel_penalty_False_use_extra_bit_False_use_controller_dqn_False_use_intrinsic_timeout_False_use_memory_True_memory_size_10_pretrain_controller_False']

# Plot, for every experiment in `sub_dirs`, the 10th-90th percentile band (across
# runs) of the mean evaluation reward, interpolated onto a common step grid.
NUM_RUNS = 4                               # runs '_run_number_1' .. '_run_number_4'
MAX_LOG_LINES = 300                        # at most this many log lines are read per run
INTERP_STEPS = range(0, 700000, 5000)      # common grid of training steps

color_index = 0
colors = ['r', 'g', 'b']

# Draw on an explicit, fresh figure instead of whatever figure happens to be
# current (e.g. one left open by an earlier cell).
fig, ax = plt.subplots()

for sub_dir in sub_dirs:
    mean_rewards = {}
    train_steps = {}
    for run_idx in range(NUM_RUNS):
        full_dir = directory + sub_dir + '_run_number_' + str(run_idx + 1)

        # `with` closes the log file; the third space-separated column of each
        # line is read as the mean reward.
        with open(full_dir + '/log.txt', 'r') as log_file:
            lines = log_file.readlines()
        mean_rewards_i = [float(line.split(' ')[2]) for line in lines[:MAX_LOG_LINES]]

        mean_rewards[run_idx] = mean_rewards_i
        print(len(mean_rewards_i))

        train_steps_i = []
        for j in range(len(mean_rewards_i)):
            # NOTE(security): pickle.load can execute arbitrary code; only
            # load result files produced by trusted runs.
            # Each pickle is expected to be a mapping with a 'train_step' key.
            with open(full_dir + '/eval_rewards_' + str(j), 'rb') as data_file:
                dump_dict = pickle.load(data_file)
            train_steps_i.append(dump_dict['train_step'])

        train_steps[run_idx] = train_steps_i

    # Linearly interpolate each run's mean rewards onto the common step grid.
    interp_means = []
    for run_idx in range(NUM_RUNS):
        run_steps = train_steps[run_idx]
        run_means = mean_rewards[run_idx]
        left = 0
        run_interp = []
        for target in INTERP_STEPS:
            # Advance until run_steps[left + 1] >= target.
            while run_steps[left + 1] < target:
                left += 1
            step_l, step_r = run_steps[left], run_steps[left + 1]
            mean_l, mean_r = run_means[left], run_means[left + 1]
            interp = (target - step_l) * mean_r + (step_r - target) * mean_l
            interp /= (step_r - step_l)
            run_interp.append(interp)
        interp_means.append(run_interp)

    steps = list(INTERP_STEPS)
    # Rows are grid steps, columns are runs.
    means_arr = np.array(interp_means).T

    ax.fill_between(
        steps,
        np.percentile(means_arr, 10, axis=1),
        np.percentile(means_arr, 90, axis=1),
        facecolor=colors[color_index % len(colors)],  # wrap around if there are more experiments than colors
        alpha=0.2)
    color_index += 1

ax.set_xlabel('Training steps')
ax.set_ylabel('Mean evaluation reward')
fig.savefig('plot_' + str(index) + '.png')
plt.close(fig)