In [54]:
import numpy as np

# Create environment

In [55]:
# Total number of states in the environment
number_of_states = 16
# Number of those states that are terminal
number_of_terminal_states = 2
# Remaining states; arrays that only cover non-terminal states use this as their first dimension
number_of_non_terminal_states = number_of_states - number_of_terminal_states

In [56]:
# Largest number of actions available in any state; used as the action dimension of the per-state-action arrays
max_number_of_actions = 4

In [57]:
# One entry per non-terminal state, each holding the number of actions available in that state
number_of_actions_per_non_terminal_state = np.full(shape = number_of_non_terminal_states, fill_value = max_number_of_actions)

In [58]:
# Create class to hold all environment properties in
class Environment:
    """Tabular description of the environment's dynamics.

    Attributes:
        number_of_state_action_successor_states: int64 array of shape
            (number_of_states, max_number_of_actions) filled with ones.
        state_action_successor_state_indices: int64 array of shape
            (number_of_non_terminal_states, max_number_of_actions, 1) holding the
            successor state index of every non-terminal state-action pair.
        state_action_successor_state_transition_probabilities: float array of the
            same shape, every entry 1.0.
        state_action_successor_state_rewards: float array of the same shape,
            every entry -1.0.
    """
    def __init__(self, number_of_states, number_of_non_terminal_states, max_number_of_actions):
        # Every non-terminal state-action pair has exactly one successor slot
        successor_array_shape = (number_of_non_terminal_states, max_number_of_actions, 1)

        self.number_of_state_action_successor_states = np.ones(shape = (number_of_states, max_number_of_actions), dtype = np.int64)

        # One row per non-terminal state, one column per action
        successor_state_table = [
            [1, 0, 14, 4],
            [2, 1, 0, 5],
            [2, 2, 1, 6],
            [4, 14, 3, 7],
            [5, 0, 3, 8],
            [6, 1, 4, 9],
            [6, 2, 5, 10],
            [8, 3, 7, 11],
            [9, 4, 7, 12],
            [10, 5, 8, 13],
            [10, 6, 9, 15],
            [12, 7, 11, 11],
            [13, 8, 11, 12],
            [15, 9, 12, 13],
        ]
        self.state_action_successor_state_indices = np.array(successor_state_table, dtype = np.int64).reshape(successor_array_shape)

        # Transitions are deterministic and every step yields a reward of -1
        self.state_action_successor_state_transition_probabilities = np.full(shape = successor_array_shape, fill_value = 1.0)
        self.state_action_successor_state_rewards = np.full(shape = successor_array_shape, fill_value = -1.0)
        
# Build the single environment instance used by the rest of the notebook
environment = Environment(number_of_states = number_of_states, number_of_non_terminal_states = number_of_non_terminal_states, max_number_of_actions = max_number_of_actions)

# Create model

In [59]:
# Create class to hold all model properties in
class Model:
    """Learned tabular model of the environment, built up from experience.

    All successor tables are object arrays of shape
    (number_of_states, max_number_of_actions) whose cells are independent,
    initially empty Python lists that grow as successors are observed.

    Args:
        number_of_states: total number of states (terminal ones included).
        number_of_non_terminal_states: number of states the agent can act in.
        max_number_of_actions: size of the action dimension.
    """
    def __init__(self, number_of_states, number_of_non_terminal_states, max_number_of_actions):
        # Create model state visit counters
        self.number_of_seen_non_terminal_states = 0
        self.seen_non_terminal_states_stack = np.zeros(shape = [number_of_non_terminal_states], dtype = np.int64)
        self.seen_non_terminal_states_stack_reverse_lookup = np.zeros(shape = [number_of_non_terminal_states], dtype = np.int64)

        # Create model state-action visit counters
        self.number_of_seen_non_terminal_states_actions = np.zeros(shape = [number_of_non_terminal_states], dtype = np.int64)
        self.seen_non_terminal_states_actions_stack = np.zeros(shape = [number_of_non_terminal_states, max_number_of_actions], dtype = np.int64)
        self.seen_non_terminal_states_actions_stack_reverse_lookup = np.zeros(shape = [number_of_non_terminal_states, max_number_of_actions], dtype = np.int64)
        self.state_action_time_since_last_visit = np.zeros(shape = [number_of_non_terminal_states, max_number_of_actions], dtype = np.int64)

        # Create model state-action successor state arrays
        self.number_of_state_action_successor_states = np.zeros(shape = [number_of_states, max_number_of_actions], dtype = np.int64)

        # Each table gets its own set of lists so that appending to one never affects another
        self.state_action_successor_state_indices = self._create_empty_list_table(number_of_states, max_number_of_actions)
        self.state_action_successor_state_transition_probabilities = self._create_empty_list_table(number_of_states, max_number_of_actions)
        self.state_action_successor_state_rewards = self._create_empty_list_table(number_of_states, max_number_of_actions)
        self.state_action_successor_state_number_of_visits = self._create_empty_list_table(number_of_states, max_number_of_actions)

        # Predecessor bookkeeping: for every state, the (state, action) pairs seen to lead into it
        self.number_of_state_predecessor_state_action_pairs = np.zeros(shape = [number_of_states], dtype = np.int64)
        self.state_predecessor_state_action_pairs = {state_index: [] for state_index in range(0, number_of_states)}

    @staticmethod
    def _create_empty_list_table(number_of_rows, number_of_columns):
        """Return a (number_of_rows, number_of_columns) object array whose cells are distinct empty lists."""
        # Use the builtin `object` dtype: the `np.object` alias no longer exists in current NumPy.
        # Filling cell by cell avoids NumPy collapsing nested empty lists into a zero-length third axis.
        table = np.empty(shape = (number_of_rows, number_of_columns), dtype = object)
        for row_index in range(number_of_rows):
            for column_index in range(number_of_columns):
                table[row_index, column_index] = []
        return table
        
# Build the (initially empty) model instance that the agent fills in from experience
model = Model(number_of_states = number_of_states, number_of_non_terminal_states = number_of_non_terminal_states, max_number_of_actions = max_number_of_actions)

# Create priority queue

In [60]:
# Create class to hold a single priority queue entry in
class PriorityQueueNode:
    """One slot of the priority queue: a state-action pair together with its priority."""
    def __init__(self, i):
        # Placeholder pair derived from the slot number: negated for the state, as-is for the action
        self.state_index, self.action_index = -i, i
        # Start at the most negative finite float
        self.priority = np.finfo(np.float64).min

# Pre-allocate one queue slot per (non-terminal state, action) pair and fill every slot with a placeholder node
priority_queue = np.empty(shape = [number_of_non_terminal_states * max_number_of_actions], dtype = object)
for i in range(0, number_of_non_terminal_states * max_number_of_actions):
    priority_queue[i] = PriorityQueueNode(i)
# Give slot 0 the largest finite float as its priority
priority_queue[0].priority = np.finfo(float).max

# Number of slots currently in use; the queue starts out empty
current_priority_queue_size = 0

# Set hyperparameters

In [61]:
# Number of episodes to run
number_of_episodes = 10000
# Upper bound on the number of steps taken in a single episode
maximum_episode_length = 200
# Number of planning (model-simulated) updates attempted after every real environment step
number_of_planning_steps = 1
# Learning rate alpha (step size) of the state-action value updates
alpha = 0.1
# Total probability the epsilon-greedy policy spreads over the non-greedy actions
epsilon = 0.1
# Discounting factor gamma applied to the value of the successor state
discounting_factor_gamma = 1.0
# Priority threshold: a state-action pair is queued only when its priority is strictly greater than theta
theta = 0.0

# Create value function and policy arrays

In [62]:
# Start with a value of zero for every state-action pair (rows cover all states, columns all actions)
state_action_value_function = np.zeros(shape = (number_of_states, max_number_of_actions))

In [63]:
# Start with a uniform random policy over the actions of every non-terminal state
policy = np.full(shape = (number_of_non_terminal_states, max_number_of_actions), fill_value = 1.0 / max_number_of_actions)

# Create algorithm

In [64]:
# Seed NumPy's global random number generator so that repeated runs draw the same random numbers
np.random.seed(seed = 0)

In [65]:
# This function initializes episodes
def initialize_epsiode(number_of_non_terminal_states):
    """Pick the state an episode starts in.

    Args:
        number_of_non_terminal_states: number of states an episode may start in.

    Returns:
        An int64 state index drawn uniformly from [0, number_of_non_terminal_states)
        using NumPy's global random generator.
    """
    return np.random.randint(low = 0, high = number_of_non_terminal_states, dtype = np.int64)

In [66]:
# This function makes the policy of one state epsilon-greedy with respect to the state-action-value function
def epsilon_greedy_policy_from_state_action_function(max_number_of_actions, state_action_value_function, epsilon, state_index, policy):
    """Overwrite policy[state_index, :] with epsilon-greedy action probabilities.

    Actions tied for the maximum value share (1 - epsilon) equally and the
    remaining actions share epsilon equally. When every action is tied, each
    one simply gets 1 / (number of actions).

    Args:
        max_number_of_actions: number of actions per state.
        state_action_value_function: 2-D array indexed [state, action].
        epsilon: probability mass reserved for the non-greedy actions.
        state_index: row of the policy to update.
        policy: 2-D array indexed [state, action]; modified in place.

    Returns:
        The same policy array that was passed in.
    """
    action_values = state_action_value_function[state_index, :]
    is_greedy_action = action_values == np.max(a = action_values)
    number_of_greedy_actions = np.count_nonzero(a = is_greedy_action)

    if number_of_greedy_actions == max_number_of_actions:
        # Nothing to explore away from: all actions are equally good
        greedy_probability, exploratory_probability = 1.0 / number_of_greedy_actions, 0.0
    else:
        greedy_probability = (1.0 - epsilon) / number_of_greedy_actions
        exploratory_probability = epsilon / (max_number_of_actions - number_of_greedy_actions)

    policy[state_index, :] = np.where(is_greedy_action, greedy_probability, exploratory_probability)

    return policy

In [67]:
# This function runs a single episode, learning the model and planning after every step
def loop_through_episode(number_of_non_terminal_states, max_number_of_actions, environment, model, state_action_value_function, policy, alpha, epsilon, discounting_factor_gamma, theta, maximum_episode_length, number_of_planning_steps, state_index, current_priority_queue_size, priority_queue):
    """Run one episode of at most maximum_episode_length steps starting in state_index.

    Each step selects an epsilon-greedy action, observes the environment,
    updates the model, queues the visited state-action pair if its priority
    exceeds theta, and then runs the planning phase.

    Note that current_priority_queue_size is only tracked locally: the updated
    size is not part of the return value.

    Returns:
        Tuple (state_action_value_function, policy, model).
    """
    for _ in range(maximum_episode_length):
        # Act epsilon-greedily in the current state
        action_index, policy = select_action_from_epsilon_greedy_policy(max_number_of_actions, state_action_value_function, epsilon, state_index, policy)

        # Record that the model has now seen this state-action pair
        model = update_model_seen_state_actions(state_index, action_index, model)

        # Observe the environment's reward and successor state
        reward, successor_state_transition_index = observe_reward(state_index, action_index, environment)
        next_state_index = environment.state_action_successor_state_indices[state_index, action_index, successor_state_transition_index]

        # Fold the observed transition into the model
        model = update_model_of_environment_from_experience(state_index, action_index, reward, next_state_index, model)

        # States at or beyond the non-terminal count are terminal
        episode_terminated = next_state_index >= number_of_non_terminal_states

        if episode_terminated:
            # Nothing to bootstrap from after a terminal transition
            update_target = reward
        else:
            # Bootstrap from the greedy action of the successor state
            next_action_index = select_max_state_action_value_function_action(next_state_index, max_number_of_actions, state_action_value_function)
            update_target = reward + discounting_factor_gamma * state_action_value_function[next_state_index][next_action_index]

        # Priority is the magnitude of the update the pair would receive
        priority = np.abs(update_target - state_action_value_function[state_index][action_index])

        if priority > theta:
            priority_queue, current_priority_queue_size = search_and_update_priority_queue(state_index, action_index, priority, current_priority_queue_size, priority_queue)

        # Planning phase: value updates are driven by the model and the priority queue
        state_action_value_function, priority_queue, current_priority_queue_size = model_simualate_planning(number_of_planning_steps, number_of_non_terminal_states, max_number_of_actions, model, alpha, discounting_factor_gamma, theta, state_action_value_function, current_priority_queue_size, priority_queue)

        if episode_terminated:
            break # episode is over once a terminal state has been reached

        state_index = next_state_index

    return state_action_value_function, policy, model

In [68]:
# This function selects an action in state state_index from epsilon-greedy policy
def select_action_from_epsilon_greedy_policy(max_number_of_actions, state_action_value_function, epsilon, state_index, policy):
    """Refresh the epsilon-greedy policy of state_index and sample an action from it.

    Returns:
        Tuple (action_index, policy), where policy is the array returned by
        epsilon_greedy_policy_from_state_action_function.
    """
    # Bring the policy row of this state in line with the current value estimates
    policy = epsilon_greedy_policy_from_state_action_function(max_number_of_actions, state_action_value_function, epsilon, state_index, policy)

    # Sample an action according to that row's probabilities
    action_probabilities = policy[state_index, :]
    return np.random.choice(a = max_number_of_actions, p = action_probabilities), policy

In [69]:
# This function observes the reward from the given system by taking action action_index in state state_index
def observe_reward(state_index, action_index, system):
    """Sample one successor transition of (state_index, action_index) and look up its reward.

    Args:
        state_index: state the action is taken in.
        action_index: action taken.
        system: object exposing number_of_state_action_successor_states,
            state_action_successor_state_transition_probabilities and
            state_action_successor_state_rewards, each indexable by
            [state_index, action_index].

    Returns:
        Tuple (reward, successor_state_transition_index).
    """
    number_of_successors = system.number_of_state_action_successor_states[state_index, action_index]
    transition_probabilities = system.state_action_successor_state_transition_probabilities[state_index, action_index]

    # Draw which of the known successors this transition leads to
    successor_state_transition_index = np.random.choice(a = number_of_successors, p = transition_probabilities)

    return system.state_action_successor_state_rewards[state_index, action_index][successor_state_transition_index], successor_state_transition_index

In [70]:
# This function selects an action that gives the maximum state-action value for the given state
def select_max_state_action_value_function_action(state_index, max_number_of_actions, state_action_value_function):
    """Return a greedy action for state_index, breaking ties uniformly at random.

    Args:
        state_index: row of the value function to inspect.
        max_number_of_actions: number of actions per state.
        state_action_value_function: 2-D array indexed [state, action].

    Returns:
        Index of an action whose value equals the row maximum.
    """
    action_values = state_action_value_function[state_index, :]

    # Collect the indices of every action tied for the best value
    greedy_action_indices = np.extract(condition = action_values == np.max(a = action_values), arr = np.arange(max_number_of_actions))

    return np.random.choice(a = greedy_action_indices)

In [71]:
# This function updates what state and actions the model has seen
def update_model_seen_state_actions(state_index, action_index, model):
    """Register (state_index, action_index) in the model's seen-state and seen-action stacks.

    A state counts as new when no state has been seen yet, or when its
    reverse-lookup entry is 0 and it is not the state stored at stack position 0.
    For an already seen state, the same test is applied to the action within
    that state's action stack. New entries are appended and the matching
    counters incremented; already known pairs leave the model untouched.

    Returns:
        The model that was passed in (modified in place).
    """
    state_is_new = model.number_of_seen_non_terminal_states == 0 or (model.seen_non_terminal_states_stack_reverse_lookup[state_index] == 0 and model.seen_non_terminal_states_stack[0] != state_index)

    if state_is_new:
        # Push the state and remember where it sits in the stack
        state_stack_position = model.number_of_seen_non_terminal_states
        model.seen_non_terminal_states_stack[state_stack_position] = state_index
        model.seen_non_terminal_states_stack_reverse_lookup[state_index] = state_stack_position
        model.number_of_seen_non_terminal_states += 1

        # The first action taken in a new state is always recorded
        action_is_new = True
    else:
        action_is_new = model.seen_non_terminal_states_actions_stack_reverse_lookup[state_index][action_index] == 0 and model.seen_non_terminal_states_actions_stack[state_index][0] != action_index

    if action_is_new:
        # Push the action onto this state's action stack and remember its position
        action_stack_position = model.number_of_seen_non_terminal_states_actions[state_index]
        model.seen_non_terminal_states_actions_stack[state_index][action_stack_position] = action_index
        model.seen_non_terminal_states_actions_stack_reverse_lookup[state_index][action_index] = action_stack_position
        model.number_of_seen_non_terminal_states_actions[state_index] += 1

    return model

In [72]:
# This function updates the model from environment experience
def update_model_of_environment_from_experience(state_index, action_index, reward, next_state_index, model):
    """Fold one observed transition (state, action, reward, next state) into the model.

    A successor that is already known has its visit count incremented; a new
    successor is appended together with its reward and a visit count of 1.
    The transition probabilities of the pair are then recomputed as visit
    frequencies, and the pair is registered as a predecessor of next_state_index.

    Returns:
        The model that was passed in (modified in place).
    """
    known_successors = model.state_action_successor_state_indices[state_index, action_index]
    visit_counts = model.state_action_successor_state_number_of_visits[state_index, action_index]

    if next_state_index not in known_successors:
        # First time this successor is observed for the pair
        model.number_of_state_action_successor_states[state_index, action_index] += 1
        known_successors.append(next_state_index)
        model.state_action_successor_state_rewards[state_index, action_index].append(reward)
        visit_counts.append(1)
    else:
        # Known successor: only its visit count changes
        model.successor_index = known_successors.index(next_state_index)
        visit_counts[model.successor_index] += 1

    # Re-estimate the transition probabilities from the visit counts
    model.state_action_successor_state_number_of_visits_sum = np.sum(a = np.asarray(a = visit_counts))
    number_of_successors = model.number_of_state_action_successor_states[state_index, action_index]
    model.state_action_successor_state_transition_probabilities[state_index, action_index] = [float(visit_counts[successor_index]) / model.state_action_successor_state_number_of_visits_sum for successor_index in range(0, number_of_successors)]

    # Remember that this pair leads into next_state_index
    predecessor_pair = (state_index, action_index)
    if predecessor_pair not in model.state_predecessor_state_action_pairs[next_state_index]:
        model.state_predecessor_state_action_pairs[next_state_index].append(predecessor_pair)
        model.number_of_state_predecessor_state_action_pairs[next_state_index] += 1

    return model

In [73]:
# This function runs the planning phase: value updates driven by the learned model and the priority queue
def model_simualate_planning(number_of_planning_steps, number_of_non_terminal_states, max_number_of_actions, model, alpha, discounting_factor_gamma, theta, state_action_value_function, current_priority_queue_size, priority_queue):
    """Perform up to number_of_planning_steps prioritized-sweeping updates.

    Each step pops the highest-priority state-action pair, simulates one
    transition for it from the model, applies a Q-learning style update, and
    then re-evaluates every recorded predecessor pair of the popped state,
    queueing those whose priority exceeds theta.

    Args:
        number_of_planning_steps: maximum number of pairs popped from the queue.
        number_of_non_terminal_states: states with an index at or above this are terminal.
        max_number_of_actions: number of actions per state.
        model: learned model providing successor, reward and predecessor tables.
        alpha: learning rate.
        discounting_factor_gamma: discount factor.
        theta: priority threshold for queueing a predecessor.
        state_action_value_function: 2-D array indexed [state, action]; updated in place.
        current_priority_queue_size: number of queue slots in use.
        priority_queue: array-backed max-heap of nodes.

    Returns:
        Tuple (state_action_value_function, priority_queue, current_priority_queue_size).
    """
    for _ in range(number_of_planning_steps):
        # Nothing left to plan with once the queue is empty
        if current_priority_queue_size == 0:
            break

        # Get max priority state-action pair from queue
        state_index, action_index, priority_queue, current_priority_queue_size = pop_max_node_from_priority_queue(current_priority_queue_size, priority_queue)

        # Simulate one transition of that pair from the model
        reward, successor_state_transition_index = observe_reward(state_index, action_index, model)
        next_state_index = model.state_action_successor_state_indices[state_index, action_index][successor_state_transition_index]

        if next_state_index >= number_of_non_terminal_states:
            # Terminal successor: no bootstrapping
            state_action_value_function[state_index, action_index] += alpha * (reward - state_action_value_function[state_index, action_index])
        else:
            # Bootstrap from the greedy action of the successor state
            next_action_index = select_max_state_action_value_function_action(next_state_index, max_number_of_actions, state_action_value_function)
            state_action_value_function[state_index, action_index] += alpha * (reward + discounting_factor_gamma * state_action_value_function[next_state_index, next_action_index] - state_action_value_function[state_index, action_index])

        # Loop over every (Sbar, Abar) the model has seen leading into S
        for predecessor_state_index, predecessor_action_index in model.state_predecessor_state_action_pairs[state_index]:
            predecessor_successor_states = model.state_action_successor_state_indices[predecessor_state_index, predecessor_action_index]

            # Skip pairs whose successor list does not contain S instead of reusing a stale transition index
            if state_index not in predecessor_successor_states:
                continue

            # The predicted reward belongs to the transition (Sbar, Abar) -> S, so it must be read
            # from the predecessor's reward list, not from the popped pair's
            predecessor_transition_index = predecessor_successor_states.index(state_index)
            predecessor_reward = model.state_action_successor_state_rewards[predecessor_state_index, predecessor_action_index][predecessor_transition_index]

            # Greedy action of S after the update above
            greedy_action_index = select_max_state_action_value_function_action(state_index, max_number_of_actions, state_action_value_function)

            # Priority is the magnitude of the update the predecessor pair would receive
            priority = np.abs(predecessor_reward + discounting_factor_gamma * state_action_value_function[state_index, greedy_action_index] - state_action_value_function[predecessor_state_index, predecessor_action_index])

            # Check if priority is over threshold to add to priority queue
            if priority > theta:
                priority_queue, current_priority_queue_size = search_and_update_priority_queue(predecessor_state_index, predecessor_action_index, priority, current_priority_queue_size, priority_queue)

    return state_action_value_function, priority_queue, current_priority_queue_size

In [74]:
# This function searches for and updates a node in the priority queue
def search_and_update_priority_queue(state_index, action_index, priority, current_priority_queue_size, priority_queue):
    """Make sure (state_index, action_index) is queued with at least the given priority.

    A pair that is not queued yet is inserted. A pair that is already queued
    only has its priority raised when the new priority is strictly higher.

    Returns:
        Tuple (priority_queue, current_priority_queue_size).
    """
    priority_queue_index = search_priority_queue(state_index, action_index, current_priority_queue_size, priority_queue)

    if priority_queue_index < 0:
        # Not queued yet
        priority_queue, current_priority_queue_size = insert_into_priority_queue(state_index, action_index, priority, current_priority_queue_size, priority_queue)
    elif priority_queue[priority_queue_index].priority < priority:
        # Already queued with a lower priority
        priority_queue = priority_queue_node_increase_priority(priority_queue_index, priority, priority_queue)

    return priority_queue, current_priority_queue_size

# This function searches for a node in the priority queue
def search_priority_queue(state_index, action_index, current_priority_queue_size, priority_queue):
    """Linearly scan the first current_priority_queue_size slots for a state-action pair.

    Returns:
        Position of the first node holding (state_index, action_index), or -1
        when no such node is among the slots in use.
    """
    for queue_position in range(current_priority_queue_size):
        node = priority_queue[queue_position]
        if node.state_index == state_index and node.action_index == action_index:
            return queue_position

    return -1

# This function raises the priority stored at priority_queue_index to new_priority, which is assumed to be greater than the current one
def priority_queue_node_increase_priority(priority_queue_index, new_priority, priority_queue):
    """Set a node's priority and sift it up until its parent's priority is no longer smaller.

    Returns:
        The priority queue that was passed in (nodes are modified in place).
    """
    priority_queue[priority_queue_index].priority = new_priority

    while priority_queue_index != 0:
        parent_index = parent_priority_queue_node_index(priority_queue_index)

        # Stop as soon as the parent is not smaller than the node
        if not priority_queue[parent_index].priority < priority_queue[priority_queue_index].priority:
            break

        # The swap exchanges the contents of the two nodes in place, so the slots keep their objects
        swap_priority_queue_nodes(priority_queue[priority_queue_index], priority_queue[parent_index])
        priority_queue_index = parent_index

    return priority_queue

# This function inserts a node into the priority queue
def insert_into_priority_queue(state_index, action_index, priority, current_priority_queue_size, priority_queue):
    """Write a new entry into the first unused slot and restore the max-heap order.

    Returns:
        Tuple (priority_queue, current_priority_queue_size) with the size
        increased by one.
    """
    # The first unused slot sits right behind the slots currently in use
    priority_queue_index = current_priority_queue_size
    current_priority_queue_size += 1

    new_node = priority_queue[priority_queue_index]
    new_node.state_index = state_index
    new_node.action_index = action_index

    # Storing the priority and sifting the node up is exactly what a priority increase does
    priority_queue = priority_queue_node_increase_priority(priority_queue_index, priority, priority_queue)

    return priority_queue, current_priority_queue_size

# This function pops max node off from priority queue
def pop_max_node_from_priority_queue(current_priority_queue_size, priority_queue):
    """Remove the highest-priority entry (the heap root) from the queue.

    Args:
        current_priority_queue_size: number of queue slots in use; must be positive.
        priority_queue: array-backed max-heap of nodes.

    Returns:
        Tuple (state_index, action_index, priority_queue, current_priority_queue_size)
        where the first two describe the removed entry and the size is reduced by one.

    Raises:
        IndexError: if the queue is empty. Without this guard an empty queue
            would copy the last slot into the root and report a size of -1.
    """
    if current_priority_queue_size <= 0:
        raise IndexError("cannot pop from an empty priority queue")

    # Remember the maximum before the root is overwritten
    root_node = priority_queue[0]
    state_index, action_index = root_node.state_index, root_node.action_index

    current_priority_queue_size -= 1

    if current_priority_queue_size > 0:
        # Move the last used entry into the root, then sift it down to restore the heap
        last_node = priority_queue[current_priority_queue_size]
        root_node.state_index = last_node.state_index
        root_node.action_index = last_node.action_index
        root_node.priority = last_node.priority

        priority_queue = max_heapify_priority_queue(0, current_priority_queue_size, priority_queue)

    return state_index, action_index, priority_queue, current_priority_queue_size

# This function heapifies a subtree with the root at given index, assuming that both child subtrees are already heapified
def max_heapify_priority_queue(priority_queue_index, current_priority_queue_size, priority_queue):
    """Sift the node at priority_queue_index down until neither child has a larger priority.

    Only the first current_priority_queue_size slots are treated as part of the heap.

    Returns:
        The priority queue that was passed in (nodes are modified in place).
    """
    while True:
        left_index = left_priority_queue_node_index(priority_queue_index)
        right_index = right_priority_queue_node_index(priority_queue_index)

        # Find which of the node and its (existing) children holds the largest priority
        biggest = priority_queue_index
        if left_index < current_priority_queue_size and priority_queue[left_index].priority > priority_queue[biggest].priority:
            biggest = left_index
        if right_index < current_priority_queue_size and priority_queue[right_index].priority > priority_queue[biggest].priority:
            biggest = right_index

        # Heap property holds here, so the sift-down is finished
        if biggest == priority_queue_index:
            return priority_queue

        # Exchange the contents of the node and its largest child, then continue from the child
        upper_node, lower_node = priority_queue[priority_queue_index], priority_queue[biggest]
        upper_node.state_index, lower_node.state_index = lower_node.state_index, upper_node.state_index
        upper_node.action_index, lower_node.action_index = lower_node.action_index, upper_node.action_index
        upper_node.priority, lower_node.priority = lower_node.priority, upper_node.priority

        priority_queue_index = biggest

# This function exchanges the contents of two priority queue nodes
def swap_priority_queue_nodes(x, y):
    """Swap state_index, action_index and priority between x and y in place.

    Returns:
        Tuple (x, y): the same two node objects, now holding each other's values.
    """
    x.state_index, y.state_index = y.state_index, x.state_index
    x.action_index, y.action_index = y.action_index, x.action_index
    x.priority, y.priority = y.priority, x.priority

    return x, y

# This function gets the parent index of the given priority queue node's index
def parent_priority_queue_node_index(priority_queue_index):
    """Return the heap position of the parent of the node at priority_queue_index."""
    parent_index = (priority_queue_index - 1) // 2
    return parent_index

# This function gets the left child index of the given priority queue node's index
def left_priority_queue_node_index(priority_queue_index):
    """Return the heap position of the left child of the node at priority_queue_index."""
    left_child_index = priority_queue_index * 2 + 1
    return left_child_index

# This function gets the right child index of the given priority queue node's index
def right_priority_queue_node_index(priority_queue_index):
    """Return the heap position of the right child of the node at priority_queue_index."""
    right_child_index = priority_queue_index * 2 + 2
    return right_child_index

In [75]:
# This function runs the full prioritized sweeping algorithm over many episodes
def off_policy_planning_and_learning_prioritized_sweeping(number_of_non_terminal_states, max_number_of_actions, environment, model, state_action_value_function, policy, alpha, epsilon, discounting_factor_gamma, theta, maximum_episode_length, number_of_planning_steps, current_priority_queue_size, priority_queue, episode_count = None):
    """Run episodes of prioritized sweeping and return the learned quantities.

    Every episode starts from a randomly initialized state and is run by
    loop_through_episode. The current_priority_queue_size argument is handed
    unchanged to every episode.

    Args:
        episode_count: optional number of episodes to run. When left as None
            the module-level number_of_episodes is used, which is what this
            function always did before the parameter existed.
        (all other arguments are forwarded to loop_through_episode)

    Returns:
        Tuple (state_action_value_function, policy, model).
    """
    # Fall back to the notebook-level hyperparameter unless the caller states the count explicitly
    total_number_of_episodes = number_of_episodes if episode_count is None else episode_count

    for _ in range(0, total_number_of_episodes):
        # Initialize episode to get initial state
        initial_state_index = initialize_epsiode(number_of_non_terminal_states)

        # Loop through episode and update the policy
        state_action_value_function, policy, model = loop_through_episode(number_of_non_terminal_states, max_number_of_actions, environment, model, state_action_value_function, policy, alpha, epsilon, discounting_factor_gamma, theta, maximum_episode_length, number_of_planning_steps, initial_state_index, current_priority_queue_size, priority_queue)

    return state_action_value_function, policy, model

# Run algorithm

In [76]:
# Print the value function and policy as they are before any learning has happened
print("\nInitial state-action value function")
print(state_action_value_function)

print("\nInitial policy")
print(policy)

# Run off policy planning and learning prioritized sweeping; the returned value function, policy and model replace the notebook-level ones
state_action_value_function, policy, model = off_policy_planning_and_learning_prioritized_sweeping(number_of_non_terminal_states, max_number_of_actions, environment, model, state_action_value_function, policy, alpha, epsilon, discounting_factor_gamma, theta, maximum_episode_length, number_of_planning_steps, current_priority_queue_size, priority_queue)

# Print the value function and policy returned by the run
print("\nFinal state-action value function")
print(state_action_value_function)

print("\nFinal policy")
print(policy)


Initial state-action value function
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

Initial policy
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]

Final state-action value function
[[-3. -2. -1. -3.]
 [-4. -3. -2. -4.]
 [-4. -4. -3. -3.]
 [-3. -1. -2. -3.]
 [-4. -2. -2. -4.]
 [-3. -3. -3. -3.]
 [-3. -4. -4. -2.]
 [-4. -2. -3. -4.]
 [-3. -3. -3. -3.]
 [-2. -4. -4. -2.]
 [-2. -3. -3. -1.]
 [-3. -3. -4. -4.]
 [-2. -4. -4. -3.]
 [-1. -3. -3. -2.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]

Final policy
[[0.03333333 0.03333333 0.9  

In [77]:
# Print the model's bookkeeping of which states and state-action pairs it has seen
# Each array is preceded by a line naming the attribute being printed
print("model.number_of_seen_non_terminal_states")
print(model.number_of_seen_non_terminal_states)
print("model.seen_non_terminal_states_stack")
print(model.seen_non_terminal_states_stack)
print("model.seen_non_terminal_states_stack_reverse_lookup")
print(model.seen_non_terminal_states_stack_reverse_lookup)
print("model.number_of_seen_non_terminal_states_actions")
print(model.number_of_seen_non_terminal_states_actions)
print("model.seen_non_terminal_states_actions_stack")
print(model.seen_non_terminal_states_actions_stack)
print("model.seen_non_terminal_states_actions_stack_reverse_lookup")
print(model.seen_non_terminal_states_actions_stack_reverse_lookup)

model.number_of_seen_non_terminal_states
14
model.seen_non_terminal_states_stack
[12 11  7  3 13  8  4  5  6  2  1  0  9 10]
model.seen_non_terminal_states_stack_reverse_lookup
[11 10  9  3  6  7  8  2  5 12 13  1  0  4]
model.number_of_seen_non_terminal_states_actions
[4 4 4 4 4 4 4 4 4 4 4 4 4 4]
model.seen_non_terminal_states_actions_stack
[[0 2 3 1]
 [0 2 3 1]
 [0 2 1 3]
 [3 2 1 0]
 [0 2 3 1]
 [0 1 3 2]
 [1 0 2 3]
 [1 2 3 0]
 [1 0 2 3]
 [0 2 1 3]
 [3 0 2 1]
 [2 1 3 0]
 [2 0 1 3]
 [2 1 3 0]]
model.seen_non_terminal_states_actions_stack_reverse_lookup
[[0 3 1 2]
 [0 3 1 2]
 [0 2 1 3]
 [3 2 1 0]
 [0 3 1 2]
 [0 1 3 2]
 [1 0 2 3]
 [3 0 1 2]
 [1 0 2 3]
 [0 2 1 3]
 [1 3 2 0]
 [3 1 0 2]
 [1 2 0 3]
 [3 1 0 2]]


In [78]:
# Print the model's learned successor tables: counts, successor states, transition probabilities, rewards and visit counts
# Each table is preceded by a line naming the attribute being printed
print("model.number_of_state_action_successor_states")
print(model.number_of_state_action_successor_states)
print("model.state_action_successor_state_indices")
print(model.state_action_successor_state_indices)
print("model.state_action_successor_state_transition_probabilities")
print(model.state_action_successor_state_transition_probabilities)
print("model.state_action_successor_state_rewards")
print(model.state_action_successor_state_rewards)
print("model.state_action_successor_state_number_of_visits")
print(model.state_action_successor_state_number_of_visits)

model.number_of_state_action_successor_states
[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [0 0 0 0]
 [0 0 0 0]]
model.state_action_successor_state_indices
[[list([1]) list([0]) list([14]) list([4])]
 [list([2]) list([1]) list([0]) list([5])]
 [list([2]) list([2]) list([1]) list([6])]
 [list([4]) list([14]) list([3]) list([7])]
 [list([5]) list([0]) list([3]) list([8])]
 [list([6]) list([1]) list([4]) list([9])]
 [list([6]) list([2]) list([5]) list([10])]
 [list([8]) list([3]) list([7]) list([11])]
 [list([9]) list([4]) list([7]) list([12])]
 [list([10]) list([5]) list([8]) list([13])]
 [list([10]) list([6]) list([9]) list([15])]
 [list([12]) list([7]) list([11]) list([11])]
 [list([13]) list([8]) list([11]) list([12])]
 [list([15]) list([9]) list([12]) list([13])]
 [list([]) list([]) list([]) list([])]
 [list([]) list([]) list([]) list([])]]
model.state_action_successor_state_