In [1]:
import numpy as np

# Create environment

In [2]:
number_of_states = 16
number_of_terminal_states = 2
number_of_non_terminal_states = number_of_states - number_of_terminal_states

In [3]:
max_number_of_actions = 4

In [4]:
number_of_actions_per_non_terminal_state = np.repeat(a = max_number_of_actions, repeats = number_of_non_terminal_states)

In [5]:
# Create class to hold all environment properties in
class Environment:
    def __init__(self, number_of_states, number_of_non_terminal_states, max_number_of_actions):
        # Create environment state-action successor state arrrays
        self.number_of_state_action_successor_states = np.ones(shape = [number_of_states, max_number_of_actions], dtype = np.int64)

        self.state_action_successor_state_indices = np.reshape(a= np.array([1, 0, 14, 4, 2, 1, 0, 5, 2, 2, 1, 6, 4, 14, 3, 7, 5, 0, 3, 8, 6, 1, 4, 9, 6, 2, 5, 10, 8, 3, 7, 11, 9, 4, 7, 12, 10, 5, 8, 13, 10, 6, 9, 15, 12, 7, 11, 11, 13, 8, 11, 12, 15, 9, 12, 13], dtype = np.int64), newshape = (number_of_non_terminal_states, max_number_of_actions, 1))
        self.state_action_successor_state_transition_probabilities = np.reshape(a = np.repeat(a = 1.0, repeats = number_of_non_terminal_states * max_number_of_actions * 1), newshape = (number_of_non_terminal_states, max_number_of_actions, 1))
        self.state_action_successor_state_rewards = np.reshape(a = np.repeat(a = -1.0, repeats = number_of_non_terminal_states * max_number_of_actions * 1), newshape = (number_of_non_terminal_states, max_number_of_actions, 1))
        
environment = Environment(number_of_states, number_of_non_terminal_states, max_number_of_actions)

# Create model

In [6]:
# Create class to hold all model properties in
class Model:
    def __init__(self, number_of_states, number_of_non_terminal_states, max_number_of_actions):
        # Create model state visit counters
        self.number_of_seen_non_terminal_states = 0
        self.seen_non_terminal_states_stack = np.zeros(shape = [number_of_non_terminal_states], dtype = np.int64)
        self.seen_non_terminal_states_stack_reverse_lookup = np.zeros(shape = [number_of_non_terminal_states], dtype = np.int64)
        
        # Create model state-action visit counters
        self.number_of_seen_non_terminal_states_actions = np.zeros(shape = [number_of_non_terminal_states], dtype = np.int64)
        self.seen_non_terminal_states_actions_stack = np.zeros(shape = [number_of_non_terminal_states, max_number_of_actions], dtype = np.int64)
        self.seen_non_terminal_states_actions_stack_reverse_lookup = np.zeros(shape = [number_of_non_terminal_states, max_number_of_actions], dtype = np.int64)
        self.state_action_time_since_last_visit = np.zeros(shape = [number_of_non_terminal_states, max_number_of_actions], dtype = np.int64)
        
        # Create model state-action successor state arrrays
        self.number_of_state_action_successor_states = np.zeros(shape = [number_of_states, max_number_of_actions], dtype = np.int64)

        self.state_action_successor_state_indices = np.array(object = [[[0] if state_index == 0 and action_index == 0 else [] for action_index in range(0, max_number_of_actions)] for state_index in range(0, number_of_states)], dtype = np.object)
        self.state_action_successor_state_transition_probabilities = np.array(object = [[[0.0] if state_index == 0 and action_index == 0 else [] for action_index in range(0, max_number_of_actions)] for state_index in range(0, number_of_states)], dtype = np.object)
        self.state_action_successor_state_rewards = np.array(object = [[[0.0] if state_index == 0 and action_index == 0 else [] for action_index in range(0, max_number_of_actions)] for state_index in range(0, number_of_states)], dtype = np.object)
        self.state_action_successor_state_number_of_visits = np.array(object = [[[0] if state_index == 0 and action_index == 0 else [] for action_index in range(0, max_number_of_actions)] for state_index in range(0, number_of_states)], dtype = np.object)
        del self.state_action_successor_state_indices[0, 0][0]
        del self.state_action_successor_state_transition_probabilities[0, 0][0]
        del self.state_action_successor_state_rewards[0, 0][0]
        del self.state_action_successor_state_number_of_visits[0, 0][0]
        
model = Model(number_of_states, number_of_non_terminal_states, max_number_of_actions)

# Set hyperparameters

In [7]:
# Set the number of episodes
number_of_episodes = 40000
# Set the maximum episode length
maximum_episode_length = 2000
# Set the number of steps for the planning stage
number_of_planning_steps = 5
# Set learning rate alpha
alpha = 0.001
# Set epsilon for our epsilon level of exploration
epsilon = 0.05
# Set discounting factor gamma
discounting_factor_gamma = 1.0

# Create value function and policy arrays

In [8]:
state_action_value_function = np.repeat(a = 0.0, repeats = number_of_states * max_number_of_actions)
state_action_value_function = np.reshape(a = state_action_value_function, newshape = (number_of_states, max_number_of_actions))

In [9]:
policy = np.repeat(a = 1.0 / max_number_of_actions, repeats = number_of_non_terminal_states * max_number_of_actions)
policy = np.reshape(a = policy, newshape = (number_of_non_terminal_states, max_number_of_actions))

# Create algorithm

In [10]:
# Set random seed so that everything is reproducible
np.random.seed(seed = 0)

In [11]:
# This function initializes episodes
def initialize_epsiode(number_of_non_terminal_states):
    # Initial state
    initial_state_index = np.random.randint(low = 0, high = number_of_non_terminal_states, dtype = np.int64) # randomly choose an initial state from all non-terminal states

    return initial_state_index

In [12]:
# This function selects a policy greedily from the state-action-value function
def epsilon_greedy_policy_from_state_action_function(max_number_of_actions, state_action_value_function, epsilon, state_index, policy):
    # Save max state-action value and find the number of actions that have the same max state-action value
    max_action_value = np.max(a = state_action_value_function[state_index, :])
    max_action_count = np.count_nonzero(a = state_action_value_function[state_index, :] == max_action_value)

    # Apportion policy probability across ties equally for state-action pairs that have the same value and zero otherwise
    if max_action_count == max_number_of_actions:
        max_policy_apportioned_probability_per_action = 1.0 / max_action_count
        remaining_apportioned_probability_per_action = 0.0
    else:
        max_policy_apportioned_probability_per_action = (1.0 - epsilon) / max_action_count
        remaining_apportioned_probability_per_action = epsilon / (max_number_of_actions - max_action_count)

    policy[state_index, :] = np.where(state_action_value_function[state_index, :] == max_action_value, max_policy_apportioned_probability_per_action, remaining_apportioned_probability_per_action)

    return policy

In [13]:
# This function loops through episodes and updates the policy
def loop_through_episode(number_of_non_terminal_states, max_number_of_actions, environment, model, state_action_value_function, policy, alpha, epsilon, discounting_factor_gamma, maximum_episode_length, number_of_planning_steps, state_index):
    # Loop through episode steps until termination
    for t in range(0, maximum_episode_length):
        # Choose policy for chosen state by epsilon-greedy choosing from the state-action-value function
        policy = epsilon_greedy_policy_from_state_action_function(max_number_of_actions, state_action_value_function, epsilon, state_index, policy)

        # Get epsilon-greedy action
        action_index = np.random.choice(a = max_number_of_actions, p = policy[state_index, :])
        
        # Update what state and actions the model has seen
        model = update_model_seen_state_actions(state_index, action_index, model)
        
        # Get reward
        successor_state_transition_index = np.random.choice(a = environment.number_of_state_action_successor_states[state_index, action_index], p = environment.state_action_successor_state_transition_probabilities[state_index, action_index, :])

        reward = environment.state_action_successor_state_rewards[state_index, action_index, successor_state_transition_index]

        # Get next state
        next_state_index = environment.state_action_successor_state_indices[state_index, action_index, successor_state_transition_index]
            
        # Check to see if we actioned into a terminal state
        if next_state_index >= number_of_non_terminal_states:
            # Update state-action value function
            state_action_value_function[state_index, action_index] += alpha * (reward - state_action_value_function[state_index, action_index])
            
            # Update model from environment experience
            model = update_model_of_environment_from_experience(state_index, action_index, reward, next_state_index, model)
            
            # Use updated model to simulate experience in planning phase
            state_action_value_function = model_simualate_planning(number_of_planning_steps, number_of_non_terminal_states, max_number_of_actions, model, alpha, discounting_factor_gamma, state_action_value_function)
            
            break; # episode terminated since we ended up in a terminal state
        else:
            # Get next action, max action of next state
            max_action_value = np.max(a = state_action_value_function[state_index, :])
            max_action_stack = np.extract(condition = state_action_value_function[state_index, :] == max_action_value, arr = np.arange(max_number_of_actions))
            
            next_action_index = np.random.choice(a = max_action_stack)

            # Calculate state-action-function using quintuple SARSA
            state_action_value_function[state_index, action_index] += alpha * (reward + discounting_factor_gamma * state_action_value_function[next_state_index, next_action_index] - state_action_value_function[state_index, action_index])
            
            # Update model from environment experience
            model = update_model_of_environment_from_experience(state_index, action_index, reward, next_state_index, model)
            
            # Use updated model to simulate experience in planning phase
            state_action_value_function = model_simualate_planning(number_of_planning_steps, number_of_non_terminal_states, max_number_of_actions, model, alpha, discounting_factor_gamma, state_action_value_function)

            # Update state and action to next state and action
            state_index = next_state_index
            action_index = next_action_index

    return state_action_value_function, policy, model

In [14]:
# This function updates what state and actions the model has seen
def update_model_seen_state_actions(state_index, action_index, model):
    # Check to see if state has already been visited
    if model.number_of_seen_non_terminal_states == 0 or (model.seen_non_terminal_states_stack_reverse_lookup[state_index] == 0 and model.seen_non_terminal_states_stack[0] != state_index): # if new state
        # Add to state stack
        model.seen_non_terminal_states_stack[model.number_of_seen_non_terminal_states] = state_index # 1, 3, 2, 0, 4
        model.seen_non_terminal_states_stack_reverse_lookup[state_index] = model.number_of_seen_non_terminal_states # 3, 0, 2, 1, 4

        # Add to action stack
        model.seen_non_terminal_states_actions_stack[state_index][model.number_of_seen_non_terminal_states_actions[state_index]] = action_index # 2, 0, 3, 1
        model.seen_non_terminal_states_actions_stack_reverse_lookup[state_index][action_index] = model.number_of_seen_non_terminal_states_actions[state_index] # 1, 3, 0, 2

        # Increment counters
        model.number_of_seen_non_terminal_states_actions[state_index] += 1
        model.number_of_seen_non_terminal_states += 1
    else: # if already visited state
        # Check to see if action has already been visited
        if model.seen_non_terminal_states_actions_stack_reverse_lookup[state_index][action_index] == 0 and model.seen_non_terminal_states_actions_stack[state_index][0] != action_index:
            # Add to action stack
            model.seen_non_terminal_states_actions_stack[state_index][model.number_of_seen_non_terminal_states_actions[state_index]] = action_index # 2, 0, 3, 1
            model.seen_non_terminal_states_actions_stack_reverse_lookup[state_index][action_index] = model.number_of_seen_non_terminal_states_actions[state_index] # 1, 3, 0, 2

            # Increment counters
            model.number_of_seen_non_terminal_states_actions[state_index] += 1

    return model

In [15]:
# This function updates the model from environment experience
def update_model_of_environment_from_experience(state_index, action_index, reward, next_state_index, model):
    if next_state_index in model.state_action_successor_state_indices[state_index, action_index]:
        model.successor_index = model.state_action_successor_state_indices[state_index, action_index].index(next_state_index)
        model.state_action_successor_state_number_of_visits[state_index, action_index][model.successor_index] += 1
    else:
        model.number_of_state_action_successor_states[state_index, action_index] += 1
        model.state_action_successor_state_indices[state_index, action_index].append(next_state_index)
        model.state_action_successor_state_rewards[state_index, action_index].append(reward)
        model.state_action_successor_state_number_of_visits[state_index, action_index].append(1)

    model.state_action_successor_state_number_of_visits_sum = np.sum(a = np.asarray(a = model.state_action_successor_state_number_of_visits[state_index, action_index]))
    model.state_action_successor_state_transition_probabilities[state_index, action_index] = [float(model.state_action_successor_state_number_of_visits[state_index, action_index][successor_index]) / model.state_action_successor_state_number_of_visits_sum for successor_index in range(0, model.number_of_state_action_successor_states[state_index, action_index])]
    
    return model

In [16]:
def model_simualate_planning(number_of_planning_steps, number_of_non_terminal_states, max_number_of_actions, model, alpha, discounting_factor_gamma, state_action_value_function):
    for i in range(0, number_of_planning_steps):
        # Randomly choose state indices from previously seen states
        state_index = model.seen_non_terminal_states_stack[np.random.randint(low = 0, high = model.number_of_seen_non_terminal_states, dtype = np.int64)]
        
        # Randomly choose action indices from previously seen actions in previously seen states
        action_index = model.seen_non_terminal_states_actions_stack[state_index, np.random.randint(low = 0, high = model.number_of_seen_non_terminal_states_actions[state_index], dtype = np.int64)]
        
        # Get reward
        successor_state_transition_index = np.random.choice(a = np.arange(model.number_of_state_action_successor_states[state_index, action_index]), p = np.asarray(a = model.state_action_successor_state_transition_probabilities[state_index, action_index], dtype = np.float64))
    
        # Get reward from state and action */
        reward = model.state_action_successor_state_rewards[state_index, action_index][successor_state_transition_index]

        # Get next state */
        next_state_index = model.state_action_successor_state_indices[state_index, action_index][successor_state_transition_index]

        # Check to see if we actioned into a terminal state */
        if next_state_index >= number_of_non_terminal_states:
            state_action_value_function[state_index, action_index] += alpha * (reward - state_action_value_function[state_index, action_index])
        else:
            # Get next action, max action of next state
            max_action_value = np.max(a = state_action_value_function[state_index, :])
            max_action_stack = np.extract(condition = state_action_value_function[state_index, :] == max_action_value, arr = np.arange(max_number_of_actions))
            
            next_action_index = np.random.choice(a = max_action_stack)

            # Calculate state-action-function using quintuple SARSargmax(a,Q) */
            state_action_value_function[state_index, action_index] += alpha * (reward + discounting_factor_gamma * state_action_value_function[next_state_index, next_action_index] - state_action_value_function[state_index, action_index]);
            
    return state_action_value_function

In [17]:
def off_policy_planning_and_learning_tabular_dyna_q(number_of_non_terminal_states, max_number_of_actions, environment, model, state_action_value_function, policy, alpha, epsilon, discounting_factor_gamma, maximum_episode_length, number_of_planning_steps):
    for episode in range(0, number_of_episodes):
        # Initialize episode to get initial state
        initial_state_index = initialize_epsiode(number_of_non_terminal_states)

        # Loop through episode and update the policy
        state_action_value_function, policy, model = loop_through_episode(number_of_non_terminal_states, max_number_of_actions, environment, model, state_action_value_function, policy, alpha, epsilon, discounting_factor_gamma, maximum_episode_length, number_of_planning_steps, initial_state_index)
    
    return state_action_value_function, policy, model

# Run algorithm

In [18]:
# Print initial arrays
print("\nInitial state-action value function")
print(state_action_value_function)

print("\nInitial policy")
print(policy)

# Run off policy planning and learning tabular dyna-Q
state_action_value_function, policy, model = off_policy_planning_and_learning_tabular_dyna_q(number_of_non_terminal_states, max_number_of_actions, environment, model, state_action_value_function, policy, alpha, epsilon, discounting_factor_gamma, maximum_episode_length, number_of_planning_steps)

# Print final results
print("\nFinal state-action value function")
print(state_action_value_function)

print("\nFinal policy")
print(policy)


Initial state-action value function
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

Initial policy
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]

Final state-action value function
[[-2.99735237 -1.99925958 -0.9999999  -3.57418531]
 [-8.44248666 -2.9973631  -1.99995784 -8.35601611]
 [-3.99217238 -3.9916204  -8.41330405 -2.99978016]
 [-3.57407501 -0.99999999 -1.99918769 -2.99769535]
 [-8.4001543  -2.59300282 -2.5932972  -7.70072548]
 [-3.6297768  -8.43155974 -8.19430757 -3.62943777]
 [-2.99806178 -3.99270458 -4.60211943 -1.9

In [19]:
# Print model seen arrays
print("model.number_of_seen_non_terminal_states")
print(model.number_of_seen_non_terminal_states)
print("model.seen_non_terminal_states_stack")
print(model.seen_non_terminal_states_stack)
print("model.seen_non_terminal_states_stack_reverse_lookup")
print(model.seen_non_terminal_states_stack_reverse_lookup)
print("model.number_of_seen_non_terminal_states_actions")
print(model.number_of_seen_non_terminal_states_actions)
print("model.seen_non_terminal_states_actions_stack")
print(model.seen_non_terminal_states_actions_stack)
print("model.seen_non_terminal_states_actions_stack_reverse_lookup")
print(model.seen_non_terminal_states_actions_stack_reverse_lookup)

model.number_of_seen_non_terminal_states
14
model.seen_non_terminal_states_stack
[12 11  7  8 13  9  5  4  1  2  6 10  0  3]
model.seen_non_terminal_states_stack_reverse_lookup
[12  8  9 13  7  6 10  2  3  5 11  1  0  4]
model.number_of_seen_non_terminal_states_actions
[4 4 4 4 4 4 4 4 4 4 4 4 4 4]
model.seen_non_terminal_states_actions_stack
[[1 2 3 0]
 [0 3 2 1]
 [1 2 3 0]
 [1 3 0 2]
 [3 0 1 2]
 [2 1 0 3]
 [0 1 3 2]
 [3 0 2 1]
 [2 0 3 1]
 [1 2 3 0]
 [3 0 1 2]
 [3 2 1 0]
 [2 3 1 0]
 [2 1 3 0]]
model.seen_non_terminal_states_actions_stack_reverse_lookup
[[3 0 1 2]
 [0 3 2 1]
 [3 0 1 2]
 [2 0 3 1]
 [1 2 3 0]
 [2 1 0 3]
 [0 1 3 2]
 [1 3 2 0]
 [1 3 0 2]
 [3 0 1 2]
 [1 2 3 0]
 [3 2 1 0]
 [3 2 0 1]
 [3 1 0 2]]


In [20]:
# Print model successor arrays
print("model.number_of_state_action_successor_states")
print(model.number_of_state_action_successor_states)
print("model.state_action_successor_state_indices")
print(model.state_action_successor_state_indices)
print("model.state_action_successor_state_transition_probabilities")
print(model.state_action_successor_state_transition_probabilities)
print("model.state_action_successor_state_rewards")
print(model.state_action_successor_state_rewards)
print("model.state_action_successor_state_number_of_visits")
print(model.state_action_successor_state_number_of_visits)

model.number_of_state_action_successor_states
[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [0 0 0 0]
 [0 0 0 0]]
model.state_action_successor_state_indices
[[list([1]) list([0]) list([14]) list([4])]
 [list([2]) list([1]) list([0]) list([5])]
 [list([2]) list([2]) list([1]) list([6])]
 [list([4]) list([14]) list([3]) list([7])]
 [list([5]) list([0]) list([3]) list([8])]
 [list([6]) list([1]) list([4]) list([9])]
 [list([6]) list([2]) list([5]) list([10])]
 [list([8]) list([3]) list([7]) list([11])]
 [list([9]) list([4]) list([7]) list([12])]
 [list([10]) list([5]) list([8]) list([13])]
 [list([10]) list([6]) list([9]) list([15])]
 [list([12]) list([7]) list([11]) list([11])]
 [list([13]) list([8]) list([11]) list([12])]
 [list([15]) list([9]) list([12]) list([13])]
 [list([]) list([]) list([]) list([])]
 [list([]) list([]) list([]) list([])]]
model.state_action_successor_state_