# Planning and Learning: Prioritized Sweeping

In [1]:
import numpy as np

## Create environment

In [2]:
def create_known_environment_states():
    """Creates known environment states.

    Returns:
        num_states: int, number of states.
        num_term_states: int, number of terminal states.
        num_non_term_states: int, number of non terminal states.
    """
    num_states = 16
    num_term_states = 2
    num_non_term_states = num_states - num_term_states

    return num_states, num_term_states, num_non_term_states

In [3]:
def create_known_environment_actions(num_non_term_states):
    """Creates environment actions.

    Args:
        num_non_term_states: int, number of non terminal states.

    Returns:
        max_num_actions: int, max number of actions possible.
        num_actions_per_non_term_state: array[int], number of actions per
            non terminal state.
    """
    max_num_actions = 4

    num_actions_per_non_term_state = np.repeat(
        a=max_num_actions, repeats=num_non_term_states)

    return max_num_actions, num_actions_per_non_term_state

In [4]:
def create_known_environment():
    """Creates known environment.

    Returns:
        num_states: int, number of states.
        num_term_states: int, number of terminal states.
        num_non_term_states: int, number of non terminal states.
        max_num_actions: int, max number of actions possible.
        num_actions_per_non_term_state: array[int], number of actions per
            non terminal state.
    """
    (num_states,
     num_term_states,
     num_non_term_states) = create_known_environment_states()

    (max_num_actions,
     num_actions_per_non_term_state) = create_known_environment_actions(
        num_non_term_states)

    return (num_states,
            num_term_states,
            num_non_term_states,
            max_num_actions,
            num_actions_per_non_term_state)

In [5]:
class Environment:
    """Class to hold all environment properties.

    Fields:
        num_sp: array[int], number of successor states s' that can be reached
            from state s by taking action a.
        sp_idx: array[int], state indices of new state s' of taking action a
            from state s.
        p: array[float], transition probability to go from state s to s' by
            taking action a.
        r: array[float], reward from new state s' from state s by taking
            action a.
    """
    def __init__(self, num_states, num_non_term_states, max_num_actions):
        # Create environment state-action successor state arrrays
        self.num_sp = np.ones(
            shape=[num_states, max_num_actions], dtype=np.int64)

        self.sp_idx = np.reshape(
            a=np.array([1, 0, 14, 4,
                        2, 1, 0, 5,
                        2, 2, 1, 6,
                        4, 14, 3, 7,
                        5, 0, 3, 8,
                        6, 1, 4, 9,
                        6, 2, 5, 10,
                        8, 3, 7, 11,
                        9, 4, 7, 12,
                        10, 5, 8, 13,
                        10, 6, 9, 15,
                        12, 7, 11, 11,
                        13, 8, 11, 12,
                        15, 9, 12, 13],
                       dtype=np.int64),
            newshape=(num_non_term_states, max_num_actions, 1))
        self.p = np.reshape(
            a=np.repeat(
                a=1.0, repeats=num_non_term_states * max_num_actions * 1),
            newshape=(num_non_term_states, max_num_actions, 1))
        self.r = np.reshape(
            a=np.repeat(
                a=-1.0, repeats=num_non_term_states * max_num_actions * 1),
            newshape=(num_non_term_states, max_num_actions, 1))

## Create model

In [6]:
class Model:
    """Class to hold all model properties.

    Fields:
        num_seen_non_term_states: int, number of seen non-terminal states.
        seen_non_term_s_stack: array[int], stack to hold all seen non-terminal
            states.
        seen_non_term_s_stack_rev_lu: array[int], reverse lookup of stack
            that holds all seen non-terminal states.
        num_seen_non_term_s_a: array[int], number of seen non-terminal
            state-action pairs.
        seen_non_term_s_a_stack: array[int], stack to hold all seen
            non-terminal state-action pairs.
        seen_non_term_s_a_stack_rev_lu: array[int], reverse lookup of stack
            that holds all seen non-terminal states-action pairs.
        num_sp: array[int], number of successor states s' that can be reached
            from state s by taking action a.
        sp_idx: array[int], state indices of new state s' of taking action a
            from state s.
        p: array[float], transition probability to go from state s to s' by
            taking action a.
        r: array[float], reward from new state s' from state s by taking
            action a.
        s_a_ss_num_visits: array[int], number of visits to a particular
            (s, a, s') tuple.
        num_s_pred_s_a_pairs: array[int], number of state predecessor state
            action pairs.
        s_pred_s_a_pairs: dict, maps state indices to a
            list of actions.
    """
    def __init__(self, num_states, num_non_term_states, max_num_actions):
        # Create model state visit counters
        self.num_seen_non_term_states = 0
        self.seen_non_term_s_stack = np.zeros(
            shape=[num_non_term_states], dtype=np.int64)
        self.seen_non_term_s_stack_rev_lu = np.zeros(
            shape=[num_non_term_states], dtype=np.int64)

        # Create model state-action visit counters
        self.num_seen_non_term_s_a = np.zeros(
            shape=[num_non_term_states], dtype=np.int64)
        self.seen_non_term_s_a_stack = np.zeros(
            shape=[num_non_term_states, max_num_actions], dtype=np.int64)
        self.seen_non_term_s_a_stack_rev_lu = np.zeros(
            shape=[num_non_term_states, max_num_actions], dtype=np.int64)

        # Create model state-action successor state arrrays
        self.num_sp = np.zeros(
            shape=[num_states, max_num_actions], dtype=np.int64)

        self.sp_idx = np.array(
            object=[[[0] if s_idx == 0 and a_idx == 0 else []
                     for a_idx in range(0, max_num_actions)]
                    for s_idx in range(0, num_states)],
            dtype=np.object)
        self.p = np.array(
            object=[[[0.0] if s_idx == 0 and a_idx == 0 else []
                     for a_idx in range(0, max_num_actions)]
                    for s_idx in range(0, num_states)],
            dtype=np.object)
        self.r = np.array(
            object=[[[0.0] if s_idx == 0 and a_idx == 0 else []
                     for a_idx in range(0, max_num_actions)]
                    for s_idx in range(0, num_states)],
            dtype=np.object)
        self.s_a_ss_num_visits = np.array(
            object=[[[0] if s_idx == 0 and a_idx == 0 else []
                     for a_idx in range(0, max_num_actions)]
                    for s_idx in range(0, num_states)],
            dtype=np.object)

        del self.sp_idx[0, 0][0]
        del self.p[0, 0][0]
        del self.r[0, 0][0]
        del self.s_a_ss_num_visits[0, 0][0]

        self.num_s_pred_s_a_pairs = np.zeros(
            shape=[num_states], dtype=np.int64)
        self.s_pred_s_a_pairs = {
            state_index: []
            for state_index in range(0, num_states)
        }

    def update_model_seen_state_actions(self, s_idx, a_idx):
        """Updates what state and actions the model has seen.

        Args:
            s_idx: int, current state index.
            a_idx: int, current action index.
        """
        # Check to see if state has already been visited
        if (self.num_seen_non_term_states == 0 or
            (self.seen_non_term_s_stack_rev_lu[s_idx] == 0 and
             self.seen_non_term_s_stack[0] != s_idx)):  # if new state
            # Add to state stack
            # 1, 3, 2, 0, 4
            self.seen_non_term_s_stack[self.num_seen_non_term_states] = s_idx
            # 3, 0, 2, 1, 4
            num_seen = self.num_seen_non_term_states
            self.seen_non_term_s_stack_rev_lu[s_idx] = num_seen

            # Add to action stack
            # 2, 0, 3, 1
            action_idx = self.num_seen_non_term_s_a[s_idx]
            self.seen_non_term_s_a_stack[s_idx][action_idx] = a_idx
            # 1, 3, 0, 2
            lookup = self.num_seen_non_term_s_a[s_idx]
            self.seen_non_term_s_a_stack_rev_lu[s_idx][a_idx] = lookup

            # Increment counters
            self.num_seen_non_term_s_a[s_idx] += 1
            self.num_seen_non_term_states += 1
        else:  # if already visited state
            # Check to see if action has already been visited
            if (self.seen_non_term_s_a_stack_rev_lu[s_idx][a_idx] == 0 and
                    self.seen_non_term_s_a_stack[s_idx][0] != a_idx):
                # Add to action stack
                # 2, 0, 3, 1
                action_idx = self.num_seen_non_term_s_a[s_idx]
                self.seen_non_term_s_a_stack[s_idx][action_idx] = a_idx
                # 1, 3, 0, 2
                num_seen = self.num_seen_non_term_s_a[s_idx]
                self.seen_non_term_s_a_stack_rev_lu[s_idx][a_idx] = num_seen

                # Increment counters
                self.num_seen_non_term_s_a[s_idx] += 1

    def update_model_of_environment_from_experience(
            self, s_idx, a_idx, reward, next_s_idx):
        """Updates the model from environment experience.

        Args:
            s_idx: int, current state index.
            a_idx: int, current action index.
            reward: float, reward of taking action a_idx in state s_idx.
            next_s_idx: int, next state index.
        """
        # Update model successor arrays
        if next_s_idx in self.sp_idx[s_idx, a_idx]:
            self.suc_idx = self.sp_idx[s_idx, a_idx].index(next_s_idx)
            self.s_a_ss_num_visits[s_idx, a_idx][self.suc_idx] += 1
        else:
            self.num_sp[s_idx, a_idx] += 1
            self.sp_idx[s_idx, a_idx].append(next_s_idx)
            self.r[s_idx, a_idx].append(reward)
            self.s_a_ss_num_visits[s_idx, a_idx].append(1)

        self.s_a_ss_num_visits_sum = np.sum(
            a=np.asarray(a=self.s_a_ss_num_visits[s_idx, a_idx]))
        self.p[s_idx, a_idx] = [
            float(self.s_a_ss_num_visits[s_idx, a_idx][suc_idx]) /
            self.s_a_ss_num_visits_sum
            for suc_idx in range(0, self.num_sp[s_idx, a_idx])
        ]

        # Update model state predecessors
        if (s_idx, a_idx) not in self.s_pred_s_a_pairs[next_s_idx]:
            self.s_pred_s_a_pairs[next_s_idx].append((s_idx, a_idx))

            self.num_s_pred_s_a_pairs[next_s_idx] += 1

    def model_simulate_planning(
            self,
            num_planning_steps,
            num_non_term_states,
            max_num_actions,
            alpha,
            gamma,
            theta,
            q,
            priority_queue):
        """Uses model to simulate experience and plan best actions.

        Args:
            num_planning_steps: int, number of steps for the planning stage.
            num_non_term_states: int, number of non terminal states.
            max_num_actions: int, max number of actions possible.
            alpha: float, alpha > 0, learning rate.
            gamma: float, 0 <= gamma <= 1, amount to discount future reward.
            theta: float, small threshold for adding state-action pairs to
                priority queue.
            q: array[float], keeps track of the estimated value of each
                state-action pair Q(s, a).
            priority_queue: instance of `PriorityQueue` class, an array of
                `PriorityQueueNode`s that keep track of the state index,
                action index, and priority of state-action pairs.
        Returns:
            q: array[float], keeps track of the estimated value of each
                state-action pair Q(s, a).
            priority_queue: instance of `PriorityQueue` class, an array of
                `PriorityQueueNode`s that keep track of the state index,
                action index, and priority of state-action pairs.
        """
        for i in range(0, num_planning_steps):
            # Check if priority queue is empty
            if priority_queue.cur_p_q_size == 0:
                break  # break i loop since priority queue is empty

            # Get max priority state-action pair from queue
            s_idx, a_idx = priority_queue.pop_max_node_from_p_q()

            # Get reward
            reward, sst_idx = observe_reward(s_idx, a_idx, self)

            # Get next state
            next_s_idx = self.sp_idx[s_idx, a_idx][sst_idx]

            # Check to see if we actioned into a terminal state
            if next_s_idx >= num_non_term_states:
                q[s_idx, a_idx] += alpha * (reward - q[s_idx, a_idx])
            else:
                # Get next action, max action of next state
                next_a_idx = select_max_q_action(
                    next_s_idx, max_num_actions, q)

                # Calculate state-action-function using quintuple
                # SARSargmax(a,Q)
                delta = gamma * q[next_s_idx, next_a_idx] - q[s_idx, a_idx]
                q[s_idx, a_idx] += alpha * (reward + delta)

            # Loop for all predicted Sbar and Abar to lead to S
            for j in range(0, self.num_s_pred_s_a_pairs[s_idx]):
                pred_s_idx = self.s_pred_s_a_pairs[s_idx][j][0]
                pred_a_idx = self.s_pred_s_a_pairs[s_idx][j][1]

                # Get reward
                if s_idx in self.sp_idx[pred_s_idx, pred_a_idx]:
                    sst_idx = self.sp_idx[pred_s_idx, pred_a_idx].index(s_idx)

                # Get reward from predecessor state and action
                reward = self.r[s_idx, a_idx][sst_idx]

                # Get next action, max action of next state
                next_a_idx = select_max_q_action(s_idx, max_num_actions, q)

                # Calculate priority
                expected = gamma * q[s_idx, next_a_idx]
                delta = expected - q[pred_s_idx, pred_a_idx]
                priority = np.abs(reward + delta)

                # Check if priority is over threshold to add to priority queue
                if priority > theta:
                    priority_queue.search_and_update_p_q(
                        pred_s_idx, pred_a_idx, priority)

        return q, priority_queue

## Create priority queue

In [7]:
class PriorityQueueNode:
    """Class to create nodes of a priority queue.

    Fields:
        s_idx: int, state index.
        a_idx: int, action index.
        priority: float, priority of state-action pair node.
    """
    def __init__(self, i):
        # Create environment state-action successor state arrrays
        self.s_idx = -i
        self.a_idx = i
        self.priority = np.finfo(float).min

class PriorityQueue:
    """Class to create a priority queue.

    Fields:
        p_q: array, priority queue that contains num_non_term_states *
            max_num_actions `PriorityQueueNode`s
        cur_p_q_size: int, current number of active nodes in priority queue.
    """
    def __init__(self, num_non_term_states, max_num_actions):
        self.p_q = np.empty(
            shape=[num_non_term_states * max_num_actions], dtype=object)
        for i in range(0, num_non_term_states * max_num_actions):
            self.p_q[i] = PriorityQueueNode(i)
        self.p_q[0].priority = np.finfo(float).max

        self.cur_p_q_size = 0

    def search_and_update_p_q(self, s_idx, a_idx, priority):
        """Searches for and updates a node in the priority queue.

        Args:
            s_idx: int, state index.
            a_idx: int, action index.
            priority: float, priority of state-action pair node.
        Returns:
            q: array[float], keeps track of the estimated value of each
                state-action pair Q(s, a).
        """
        p_q_idx = -1
        p_q_idx = self.search_p_q(s_idx, a_idx)

        # Check if node was found
        if p_q_idx >= 0:
            # Check if found node has a lower priority saved than new priority
            if self.p_q[p_q_idx].priority < priority:
                self.p_q_node_increase_priority(p_q_idx, priority)
        else:
            # Node wasn't found so insert into priority queue
            self.insert_into_p_q(s_idx, a_idx, priority)

    def search_p_q(self, s_idx, a_idx):
        """Searches for a node in the priority queue.

        Args:
            s_idx: int, state index.
            a_idx: int, action index.
        Returns:
            p_q_idx: int, index of priority queue node.
        """
        p_q_idx = -1

        # Search up to all nodes in worst case
        for i in range(0, self.cur_p_q_size):
            if (self.p_q[i].s_idx == s_idx and self.p_q[i].a_idx == a_idx):
                p_q_idx = i
                break  # break i loop since we found node

        return p_q_idx

    def p_q_node_increase_priority(self, p_q_idx, new_priority):
        """Increases priority of a node in the priority queue.

        Increases priority at p_q_idx to new_priority, where it is assumed
        that new_priority is greater than priority_queue[p_q_idx].

        Args:
            p_q_idx: int, index of priority queue node.
            new_priority: float, new priority of state-action pair node.
        """
        self.p_q[p_q_idx].priority = new_priority

        while (p_q_idx != 0 and
               self.p_q[self.get_par_idx(p_q_idx)].priority <
               self.p_q[p_q_idx].priority):
            (self.p_q[p_q_idx],
             self.p_q[self.get_par_idx(p_q_idx)]) = self.swap_p_q_nodes(
                self.p_q[p_q_idx],
                self.p_q[self.get_par_idx(p_q_idx)])
            p_q_idx = self.get_par_idx(p_q_idx)

    def insert_into_p_q(self, s_idx, a_idx, priority):
        """Inserts a node into the priority queue.

        Args:
            s_idx: int, state index.
            a_idx: int, action index.
            priority: float, priority of state-action pair node.
        """
        # First insert the new node at the end
        self.cur_p_q_size += 1
        p_q_idx = self.cur_p_q_size - 1

        self.p_q[p_q_idx].s_idx = s_idx
        self.p_q[p_q_idx].a_idx = a_idx
        self.p_q[p_q_idx].priority = priority

        # Fix the max heap property if it is violated
        while (p_q_idx != 0 and
               self.p_q[self.get_par_idx(p_q_idx)].priority <
               self.p_q[p_q_idx].priority):
            self.p_q[p_q_idx], self.p_q[self.get_par_idx(p_q_idx)] = \
                self.swap_p_q_nodes(
                    self.p_q[p_q_idx], self.p_q[self.get_par_idx(p_q_idx)])
            self.p_q_idx = self.get_par_idx(p_q_idx)

    def pop_max_node_from_p_q(self):
        """Pops max node off from priority queue.

        Returns:
            s_idx: int, state index.
            a_idx: int, action index.
        """
        if self.cur_p_q_size == 1:
            self.cur_p_q_size -= 1
            return self.p_q[0].s_idx, self.p_q[0].a_idx

        # Store the maximum value, and remove it from heap
        s_idx = self.p_q[0].s_idx
        a_idx = self.p_q[0].a_idx

        self.p_q[0].s_idx = self.p_q[self.cur_p_q_size - 1].s_idx
        self.p_q[0].a_idx = self.p_q[self.cur_p_q_size - 1].a_idx
        self.p_q[0].priority = self.p_q[self.cur_p_q_size - 1].priority
        self.cur_p_q_size -= 1

        # Fix the max heap property if it is violated
        self.max_heapify_p_q(0)

        return s_idx, a_idx

    def max_heapify_p_q(self, p_q_idx):
        """Max heapifies a subtree of priority queue.

        Recursively heapifies a subtree with the root at given index, however
            assumes that the subtrees are already heapified.

        Args:
            p_q_idx: int, index of priority queue node.
        """
        l = self.get_left_idx(p_q_idx)
        r = self.get_right_idx(p_q_idx)
        biggest = p_q_idx

        if (l < self.cur_p_q_size and
                self.p_q[l].priority > self.p_q[p_q_idx].priority):
            biggest = l

        if (r < self.cur_p_q_size and
                self.p_q[r].priority > self.p_q[biggest].priority):
            biggest = r

        if biggest != p_q_idx:
            temp_s_idx = self.p_q[p_q_idx].s_idx
            temp_a_idx = self.p_q[p_q_idx].a_idx
            temp_priority = self.p_q[p_q_idx].priority

            self.p_q[p_q_idx].s_idx = self.p_q[biggest].s_idx
            self.p_q[p_q_idx].a_idx = self.p_q[biggest].a_idx
            self.p_q[p_q_idx].priority = self.p_q[biggest].priority

            self.p_q[biggest].s_idx = temp_s_idx
            self.p_q[biggest].a_idx = temp_a_idx
            self.p_q[biggest].priority = temp_priority

            self.max_heapify_p_q(biggest)

    def swap_p_q_nodes(self, x, y):
        """Swaps attributes between two `PriorityQueueNode`s.

        Args:
            x: instance of `PriorityQueueNode`.
            y: instance of `PriorityQueueNode`.
        Returns:
            x: instance of `PriorityQueueNode`.
            y: instance of `PriorityQueueNode`.
        """
        temp_s_idx = x.s_idx
        temp_a_idx = x.a_idx
        temp_priority = x.priority

        x.s_idx = y.s_idx
        x.a_idx = y.a_idx
        x.priority = y.priority

        y.s_idx = temp_s_idx
        y.a_idx = temp_a_idx
        y.priority = temp_priority

        return x, y

    def get_par_idx(self, p_q_idx):
        """Gets the parent index of given priority queue node's index.

        Args:
            p_q_idx: int, index of priority queue node.
        """
        return (p_q_idx - 1) // 2

    def get_left_idx(self, p_q_idx):
        """Gets the left child index of given priority queue node's index.

        Args:
            p_q_idx: int, index of priority queue node.
        """
        return (2 * p_q_idx + 1)

    def get_right_idx(self, p_q_idx):
        """Gets the right child index of given priority queue node's index.

        Args:
            p_q_idx: int, index of priority queue node.
        """
        return (2 * p_q_idx + 2)

## Set hyperparameters

In [8]:
def set_hyperparameters():
    """Sets hyperparameters.

    Returns:
        num_episodes: int, number of episodes to train over.
        maximum_episode_length: int, max number of timesteps for an episode.
        num_planning_steps: int, number of steps for the planning stage.
        alpha: float, alpha > 0, learning rate.
        epsilon: float, 0 <= epsilon <= 1, exploitation-exploration trade-off,
            higher means more exploration.
        gamma: float, 0 <= gamma <= 1, amount to discount future reward.
        theta: float, small threshold for adding state-action pairs to priority
            queue.
    """
    num_episodes = 10000
    maximum_episode_length = 200
    num_planning_steps = 1
    alpha = 0.1
    epsilon = 0.1
    gamma = 1.0
    theta = 0.0

    return (num_episodes,
            maximum_episode_length,
            num_planning_steps,
            alpha,
            epsilon,
            gamma,
            theta)

## Create value function and policy arrays

In [9]:
def create_value_function_arrays(num_states, max_num_actions):
    """Creates value function arrays.

    Args:
        num_states: int, number of states.
        max_num_actions: int, max number of actions possible.
    Returns:
        q: array[float], keeps track of the estimated value of each
            state-action pair Q(s, a).
    """
    return np.zeros(shape=[num_states, max_num_actions], dtype=np.float64)

In [10]:
def create_policy_arrays(num_non_term_states, max_num_actions):
    """Creates policy arrays.

    Args:
        num_non_term_states: int, number of non terminal states.
        max_num_actions: int, max number of actions possible.
    Returns:
        policy: array[float], learned stochastic policy of which
            action a to take in state s.
    """
    policy = np.repeat(
        a=1.0 / max_num_actions,
        repeats=num_non_term_states * max_num_actions)

    policy = np.reshape(
        a=policy,
        newshape=(num_non_term_states, max_num_actions))

    return policy

## Create algorithm

In [11]:
# Set random seed so that everything is reproducible
np.random.seed(seed=0)

In [12]:
def initialize_epsiode(num_non_term_states):
    """Initializes epsiode with initial state and initial action.

    Args:
        num_non_term_states: int, number of non terminal states.
    Returns:
        init_s_idx: int, initial state index from set of non terminal states.
    """
    # Randomly choose an initial state from all non-terminal states
    init_s_idx = np.random.randint(
        low=0, high=num_non_term_states, dtype=np.int64)

    return init_s_idx

In [13]:
def epsilon_greedy_policy_from_state_action_function(
        max_num_actions, q, epsilon, s_idx, policy):
    """Create epsilon-greedy policy from state-action value function.

    Args:
        max_num_actions: int, max number of actions possible.
        q: array[float], keeps track of the estimated value of each
            state-action pair Q(s, a).
        epsilon: float, 0 <= epsilon <= 1, exploitation-exploration trade-off,
            higher means more exploration.
        s_idx: int, current state index.
        policy: array[float], learned stochastic policy of which action a to
            take in state s.
    Returns:
        policy: array[float], learned stochastic policy of which action a to
            take in state s.
    """
    # Save max state-action value and find the number of actions that have the
    # same max state-action value
    max_action_value = np.max(a=q[s_idx, :])
    max_action_count = np.count_nonzero(a=q[s_idx, :] == max_action_value)

    # Apportion policy probability across ties equally for state-action pairs
    # that have the same value and zero otherwise
    if max_action_count == max_num_actions:
        max_policy_prob_per_action = 1.0 / max_action_count
        remain_prob_per_action = 0.0
    else:
        max_policy_prob_per_action = (1.0 - epsilon) / max_action_count
        remain_prob_per_action = epsilon / (max_num_actions - max_action_count)

    policy[s_idx, :] = np.where(
        q[s_idx, :] == max_action_value,
        max_policy_prob_per_action,
        remain_prob_per_action)

    return policy

In [14]:
def loop_through_episode(
        num_non_term_states,
        max_num_actions,
        environment,
        model,
        priority_queue,
        q,
        policy,
        alpha,
        epsilon,
        gamma,
        theta,
        maximum_episode_length,
        num_planning_steps,
        s_idx):
    """Loops through episode to iteratively update policy.

    Args:
        num_non_term_states: int, number of non terminal states.
        max_num_actions: int, max number of actions possible.
        environment: instance of `Environment` class that holds environment
            properties that are hidden from us, but that we can sample.
        model: instance of `Model` class that holds model properties
            that we learn through experience.
        priority_queue: instance of `PriorityQueue` class, an array of
            `PriorityQueueNode`s that keep track of the state index,
            action index, and priority of state-action pairs.
        q: array[float], keeps track of the estimated value of each
            state-action pair Q(s, a).
        policy: array[float], learned stochastic policy of which
            action a to take in state s.
        alpha: float, alpha > 0, learning rate.
        epsilon: float, 0 <= epsilon <= 1, exploitation-exploration trade-off,
            higher means more exploration.
        gamma: float, 0 <= gamma <= 1, amount to discount future reward.
        theta: float, small threshold for adding state-action pairs to priority
            queue.
        maximum_episode_length: int, max number of timesteps for an episode.
        num_planning_steps: int, number of steps for the planning stage.
        s_idx: int, current state index.
    Returns:
        q: array[float], keeps track of the estimated value of each
            state-action pair Q(s, a).
        policy: array[float], learned stochastic policy of which
            action a to take in state s.
        model: instance of `Model` class that holds model properties
            that we learn through experience.
        priority_queue: instance of `PriorityQueue` class, an array of
            `PriorityQueueNode`s that keep track of the state index,
            action index, and priority of state-action pairs.
    """
    # Loop through episode steps until termination
    for t in range(0, maximum_episode_length):
        # Get epsilon-greedy action
        a_idx, policy = select_action_from_epsilon_greedy_policy(
            max_num_actions, q, epsilon, s_idx, policy)

        # Update what state and actions the model has seen
        model.update_model_seen_state_actions(s_idx, a_idx)

        # Get reward
        reward, sst_idx = observe_reward(s_idx, a_idx, environment)

        # Get next state
        next_s_idx = environment.sp_idx[s_idx, a_idx, sst_idx]

        # Update model from environment experience
        model.update_model_of_environment_from_experience(
            s_idx, a_idx, reward, next_s_idx)

        # Check to see if we actioned into a terminal state
        if next_s_idx >= num_non_term_states:
            # Calculate priority
            priority = np.abs(reward - q[s_idx, a_idx])
        else:
            # Get next action, max action of next state
            next_a_idx = select_max_q_action(next_s_idx, max_num_actions, q)

            # Calculate priority
            delta = gamma * q[next_s_idx][next_a_idx] - q[s_idx][a_idx]
            priority = np.abs(reward + delta)

        # Check if priority is over threshold to add to priority queue
        if priority > theta:
            priority_queue.search_and_update_p_q(
                s_idx, a_idx, priority)

        # Use updated model to simulate experience in planning phase
        q, priority_queue = model.model_simulate_planning(
            num_planning_steps,
            num_non_term_states,
            max_num_actions,
            alpha,
            gamma,
            theta,
            q,
            priority_queue)

        # Check to see if we actioned into a terminal state
        if next_s_idx >= num_non_term_states:
            break  # break i loop

        # Update state to next state
        s_idx = next_s_idx

    return q, policy, model, priority_queue

In [15]:
def select_action_from_epsilon_greedy_policy(
        max_num_actions, q, epsilon, s_idx, policy):
    """Selects an action in state s_idx from epsilon-greedy policy.

    Args:
        max_num_actions: int, max number of actions possible.
        q: array[float], keeps track of the estimated value of each
            state-action pair Q(s, a).
        epsilon: float, 0 <= epsilon <= 1, exploitation-exploration trade-off,
            higher means more exploration.
        s_idx: int, current state index.
        policy: array[float], learned stochastic policy of which
            action a to take in state s.
    Returns:
        a_idx: int, current action index.
        policy: array[float], learned stochastic policy of which
            action a to take in state s.
    """
    # Choose policy for chosen state by epsilon-greedy choosing from the
    # state-action-value function
    policy = epsilon_greedy_policy_from_state_action_function(
        max_num_actions, q, epsilon, s_idx, policy)

    # Get epsilon-greedy action
    a_idx = np.random.choice(a=max_num_actions, p=policy[s_idx, :])

    return a_idx, policy

In [16]:
def observe_reward(s_idx, a_idx, system):
    """Observes the reward from the given system (environment or model).

    Args:
        s_idx: int, current state index.
        a_idx: int, current action index.
        system: either an instance of `Environment` or `Model` class that
            holds environment or model properties, respectively.
    Returns:
        reward: float, reward of taking action a_idx in state s_idx.
        sst_idx: int, successor state transition index.
    """
    sst_idx = np.random.choice(
        a=system.num_sp[s_idx, a_idx],
        p=system.p[s_idx, a_idx][:])

    reward = system.r[s_idx, a_idx][sst_idx]

    return reward, sst_idx

In [17]:
def select_max_q_action(s_idx, max_num_actions, q):
    """Selects action with max state-action-value function for given state.

    Args:
        s_idx: int, current state index.
        max_num_actions: int, max number of actions possible.
        q: array[float], keeps track of the estimated value of each
            state-action pair Q(s, a).
    Returns:
        next_a_idx: int, next action index.
    """
    max_action_value = np.max(a=q[s_idx, :])
    max_action_stack = np.extract(
        condition=q[s_idx, :] == max_action_value,
        arr=np.arange(max_num_actions))

    next_a_idx = np.random.choice(a=max_action_stack)

    return next_a_idx

In [18]:
def off_policy_planning_and_learning_prioritized_sweeping(
        num_non_term_states,
        max_num_actions,
        environment,
        model,
        priority_queue,
        q,
        policy,
        alpha,
        epsilon,
        gamma,
        theta,
        num_episodes,
        maximum_episode_length,
        num_planning_steps):
    """Loops through episodes to iteratively update policy.

    Args:
        num_non_term_states: int, number of non terminal states.
        max_num_actions: int, max number of actions possible.
        environment: instance of `Environment` class that holds environment
            properties that are hidden from us, but that we can sample.
        model: instance of `Model` class that holds model properties
            that we learn through experience.
        priority_queue: instance of `PriorityQueue` class, an array of
            `PriorityQueueNode`s that keep track of the state index,
            action index, and priority of state-action pairs.
        q: array[float], keeps track of the estimated value of each
            state-action pair Q(s, a).
        policy: array[float], learned stochastic policy of which
            action a to take in state s.
        alpha: float, alpha > 0, learning rate.
        epsilon: float, 0 <= epsilon <= 1, exploitation-exploration trade-off,
            higher means more exploration.
        gamma: float, 0 <= gamma <= 1, amount to discount future reward.
        theta: float, small threshold for adding state-action pairs to priority
            queue.
        num_episodes: int, number of episodes to train over.
        maximum_episode_length: int, max number of timesteps for an episode.
        num_planning_steps: int, number of steps for the planning stage.
        s_idx: int, current state index.
    Returns:
        q: array[float], keeps track of the estimated value of each
            state-action pair Q(s, a).
        policy: array[float], learned stochastic policy of which
            action a to take in state s.
        model: instance of `Model` class that holds model properties
            that we learn through experience.
        priority_queue: instance of `PriorityQueue` class, an array of
            `PriorityQueueNode`s that keep track of the state index,
            action index, and priority of state-action pairs.
    """
    for episode in range(0, num_episodes):
        # Initialize episode to get initial state
        init_s_idx = initialize_epsiode(num_non_term_states)

        # Loop through episode and update the policy
        q, policy, model, priority_queue = loop_through_episode(
            num_non_term_states,
            max_num_actions,
            environment,
            model,
            priority_queue,
            q,
            policy,
            alpha,
            epsilon,
            gamma,
            theta,
            maximum_episode_length,
            num_planning_steps,
            init_s_idx)

    return q, policy, model, priority_queue

## Run algorithm

In [19]:
def run_algorithm():
    """Runs the algorithm.

    Returns:
        model: instance of `Model` class that holds model properties
            that we learn through experience.
    """
    (num_states,
     _,
     num_non_term_states,
     max_num_actions,
     _) = create_known_environment()

    environment = Environment(
        num_states, num_non_term_states, max_num_actions)
    model = Model(num_states, num_non_term_states, max_num_actions)
    priority_queue = PriorityQueue(num_non_term_states, max_num_actions)

    (num_episodes,
     maximum_episode_length,
     num_planning_steps,
     alpha,
     epsilon,
     gamma,
     theta) = set_hyperparameters()

    q = create_value_function_arrays(num_states, max_num_actions)

    policy = create_policy_arrays(num_non_term_states, max_num_actions)

    # Print initial arrays
    print("\nInitial state-action value function")
    print(q)

    print("\nInitial policy")
    print(policy)

    # Run off policy planning and learning prioritized sweeping
    (q,
     policy,
     model,
     priority_queue) = off_policy_planning_and_learning_prioritized_sweeping(
        num_non_term_states,
        max_num_actions,
        environment,
        model,
        priority_queue,
        q,
        policy,
        alpha,
        epsilon,
        gamma,
        theta,
        num_episodes,
        maximum_episode_length,
        num_planning_steps)

    # Print final results
    print("\nFinal state-action value function")
    print(q)

    print("\nFinal policy")
    print(policy)

    return model

In [20]:
model = run_algorithm()


Initial state-action value function
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

Initial policy
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]

Final state-action value function
[[-2.99999917 -1.99999943 -0.99999971 -2.99999917]
 [-3.99999891 -2.99999917 -1.99999943 -3.99999891]
 [-3.99999891 -3.99999891 -2.99999917 -2.9999992 ]
 [-2.99999917 -0.99999971 -1.99999943 -2.99999917]
 [-3.99999891 -1.99999946 -1.99999943 -3.99999891]
 [-2.9999992  -2.99999917 -2.99999917 -2.99999917]
 [-2.9999992  -3.99999891 -3.99999891 -1.9

In [21]:
# Print model seen arrays
print("model.num_seen_non_term_states")
print(model.num_seen_non_term_states)
print("model.seen_non_term_s_stack")
print(model.seen_non_term_s_stack)
print("model.seen_non_term_s_stack_rev_lu")
print(model.seen_non_term_s_stack_rev_lu)
print("model.num_seen_non_term_s_a")
print(model.num_seen_non_term_s_a)
print("model.seen_non_term_s_a_stack")
print(model.seen_non_term_s_a_stack)
print("model.seen_non_term_s_a_stack_rev_lu")
print(model.seen_non_term_s_a_stack_rev_lu)

model.num_seen_non_term_states
14
model.seen_non_term_s_stack
[12 11  7  3  8  4  5  6  9 13  0  1  2 10]
model.seen_non_term_s_stack_rev_lu
[10 11 12  3  5  6  7  2  4  8 13  1  0  9]
model.num_seen_non_term_s_a
[4 4 4 4 4 4 4 4 4 4 4 4 4 4]
model.seen_non_term_s_a_stack
[[0 1 2 3]
 [0 3 1 2]
 [3 1 0 2]
 [3 2 0 1]
 [2 0 1 3]
 [0 3 1 2]
 [0 2 3 1]
 [1 3 0 2]
 [1 0 2 3]
 [3 2 0 1]
 [3 2 0 1]
 [2 1 3 0]
 [2 1 3 0]
 [2 3 1 0]]
model.seen_non_term_s_a_stack_rev_lu
[[0 1 2 3]
 [0 2 3 1]
 [2 1 3 0]
 [2 3 1 0]
 [1 2 0 3]
 [0 2 3 1]
 [0 3 1 2]
 [2 0 3 1]
 [1 0 2 3]
 [2 3 1 0]
 [2 3 1 0]
 [3 1 0 2]
 [3 1 0 2]
 [3 2 0 1]]


In [22]:
# Print model successor arrays
print("model.num_sp")
print(model.num_sp)
print("model.sp_idx")
print(model.sp_idx)
print("model.p")
print(model.p)
print("model.r")
print(model.r)
print("model.s_a_ss_num_visits")
print(model.s_a_ss_num_visits)

model.num_sp
[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [0 0 0 0]
 [0 0 0 0]]
model.sp_idx
[[list([1]) list([0]) list([14]) list([4])]
 [list([2]) list([1]) list([0]) list([5])]
 [list([2]) list([2]) list([1]) list([6])]
 [list([4]) list([14]) list([3]) list([7])]
 [list([5]) list([0]) list([3]) list([8])]
 [list([6]) list([1]) list([4]) list([9])]
 [list([6]) list([2]) list([5]) list([10])]
 [list([8]) list([3]) list([7]) list([11])]
 [list([9]) list([4]) list([7]) list([12])]
 [list([10]) list([5]) list([8]) list([13])]
 [list([10]) list([6]) list([9]) list([15])]
 [list([12]) list([7]) list([11]) list([11])]
 [list([13]) list([8]) list([11]) list([12])]
 [list([15]) list([9]) list([12]) list([13])]
 [list([]) list([]) list([]) list([])]
 [list([]) list([]) list([]) list([])]]
model.p
[[list([1.0]) list([1.0]) list([1.0]) list([1.0])]
 [list([1.0]) list([1.0]) list([1.0]) l