In [121]:
import numpy as np

# Create environment

In [122]:
# Size of the state space; the last number_of_terminal_states indices are the terminal states
number_of_states = 16
number_of_terminal_states = 2

# States with index below this value are non-terminal
number_of_non_terminal_states = number_of_states - number_of_terminal_states

In [123]:
# Width of the action axis of every per-state table in this notebook
max_number_of_actions = 4

In [124]:
# One entry per non-terminal state, each equal to max_number_of_actions
number_of_actions_per_non_terminal_state = np.full(shape = number_of_non_terminal_states, fill_value = max_number_of_actions)

In [125]:
# Flat vector of ones, one entry per (state, action) pair: every pair has a single successor state
number_of_state_action_successor_states = np.full(shape = number_of_states * max_number_of_actions, fill_value = 1)

In [126]:
# Arrange the successor counts as a (state, action) table.
# The array and target shape are passed positionally: the `newshape` keyword of np.reshape is
# deprecated in recent NumPy releases, while the positional form works on every version.
number_of_state_action_successor_states = np.reshape(number_of_state_action_successor_states, (number_of_states, max_number_of_actions))

In [127]:
# Successor state index for every (non-terminal state, action) pair, stored as a flat vector.
# Each row below holds the four action entries of one non-terminal state, in state order.
state_action_successor_state_indices = np.array([
    1, 0, 14, 4,
    2, 1, 0, 5,
    2, 2, 1, 6,
    4, 14, 3, 7,
    5, 0, 3, 8,
    6, 1, 4, 9,
    6, 2, 5, 10,
    8, 3, 7, 11,
    9, 4, 7, 12,
    10, 5, 8, 13,
    10, 6, 9, 15,
    12, 7, 11, 11,
    13, 8, 11, 12,
    15, 9, 12, 13,
], dtype = np.int64)

In [128]:
# Flat vector of transition probabilities: every (non-terminal state, action) pair has one successor, taken with probability 1
state_action_successor_state_transition_probabilities = np.full(shape = number_of_non_terminal_states * max_number_of_actions * 1, fill_value = 1.0)

In [129]:
# Flat vector of rewards: every transition out of a non-terminal state yields -1
state_action_successor_state_rewards = np.full(shape = number_of_non_terminal_states * max_number_of_actions * 1, fill_value = -1.0)

In [130]:
# Give the three transition tables a common (non-terminal state, action, successor) layout.
# Array and shape are passed positionally: the `newshape` keyword of np.reshape is deprecated
# in recent NumPy releases, while the positional form works on every version.
state_action_successor_state_indices = np.reshape(state_action_successor_state_indices, (number_of_non_terminal_states, max_number_of_actions, 1))
state_action_successor_state_transition_probabilities = np.reshape(state_action_successor_state_transition_probabilities, (number_of_non_terminal_states, max_number_of_actions, 1))
state_action_successor_state_rewards = np.reshape(state_action_successor_state_rewards, (number_of_non_terminal_states, max_number_of_actions, 1))

# Set hyperparameters

In [131]:
number_of_episodes = 10000 # how many episodes to run
maximum_episode_length = 200 # upper bound on the number of steps in one episode
alpha = 0.1 # learning rate
epsilon = 0.1 # epsilon level of exploration
discounting_factor_gamma = 1.0 # discounting factor

# Create value function and policy arrays

In [132]:
# Two separate (state, action) value tables, both starting at zero.
# They are built directly in their final shape, which avoids np.reshape's deprecated
# `newshape` keyword; each call allocates its own array so the tables never alias.
state_action_value_function1 = np.full(shape = (number_of_states, max_number_of_actions), fill_value = 0.0)

state_action_value_function2 = np.full(shape = (number_of_states, max_number_of_actions), fill_value = 0.0)

In [133]:
# Start from the uniform random policy: one row per non-terminal state, equal probability for every action.
# The table is built directly in its final shape, which avoids np.reshape's deprecated `newshape` keyword.
policy = np.full(shape = (number_of_non_terminal_states, max_number_of_actions), fill_value = 1.0 / max_number_of_actions)

# Create algorithm

In [134]:
# Set random seed so that everything is reproducible
np.random.seed(0)

In [135]:
# This function initializes episodes
def initialize_epsiode(number_of_non_terminal_states):
    """Pick the starting state of a new episode.

    The misspelt function name is kept as-is so existing callers keep working.

    Args:
        number_of_non_terminal_states: Exclusive upper bound of the sampled index;
            only indices in [0, number_of_non_terminal_states) can be returned.

    Returns:
        A np.int64 state index drawn with np.random.randint from the global NumPy random state.
    """
    # Randomly choose an initial state from all non-terminal states
    return np.random.randint(low = 0, high = number_of_non_terminal_states, dtype = np.int64)

In [136]:
# This function selects a policy greedily from the state-action-value function
def epsilon_greedy_policy_from_state_action_function(max_number_of_actions, state_action_value_function1, state_action_value_function2, epsilon, state_index, policy):
    """Rewrite one row of the policy as epsilon-greedy with respect to the summed value tables.

    Args:
        max_number_of_actions: Number of actions compared against the count of tied greedy actions.
        state_action_value_function1: First value table, indexed [state, action].
        state_action_value_function2: Second value table, indexed [state, action].
        epsilon: Total probability shared by the non-greedy actions.
        state_index: Row of the tables and of the policy to process.
        policy: Policy table; row state_index is overwritten in place.

    Returns:
        The same policy array that was passed in, with row state_index updated.
    """
    # Actions are ranked by the sum of both estimates for this state
    combined_action_values = state_action_value_function1[state_index, :] + state_action_value_function2[state_index, :]

    # Mark every action that attains the maximum so that ties share the greedy probability
    greedy_value = np.max(a = combined_action_values)
    is_greedy_action = combined_action_values == greedy_value
    greedy_action_count = np.count_nonzero(a = is_greedy_action)

    # When every action ties there is nothing left to explore, so the row becomes uniform
    every_action_ties = greedy_action_count == max_number_of_actions
    greedy_probability = 1.0 / greedy_action_count if every_action_ties else (1.0 - epsilon) / greedy_action_count
    exploratory_probability = 0.0 if every_action_ties else epsilon / (max_number_of_actions - greedy_action_count)

    policy[state_index, :] = np.where(is_greedy_action, greedy_probability, exploratory_probability)

    return policy

In [137]:
# This function loops through episodes and updates the policy
def loop_through_episode(number_of_non_terminal_states, max_number_of_actions, number_of_state_action_successor_states, state_action_successor_state_indices, state_action_successor_state_transition_probabilities, state_action_successor_state_rewards, state_action_value_function1, state_action_value_function2, policy, alpha, epsilon, discounting_factor_gamma, maximum_episode_length, state_index):
    """Run one episode, updating one of the two value tables at every step.

    Each step refreshes the policy row of the current state, samples an action and a
    successor transition, then updates one of the two value tables, chosen by a random
    draw between the two.

    Args:
        number_of_non_terminal_states: States with an index at or above this value end the episode.
        max_number_of_actions: Number of actions to sample from.
        number_of_state_action_successor_states: Successor counts, indexed [state, action].
        state_action_successor_state_indices: Successor states, indexed [state, action, transition].
        state_action_successor_state_transition_probabilities: Transition probabilities, same indexing.
        state_action_successor_state_rewards: Rewards, same indexing.
        state_action_value_function1: First value table.
        state_action_value_function2: Second value table.
        policy: Policy table, indexed [state, action].
        alpha: Learning rate forwarded to the update.
        epsilon: Exploration level forwarded to the policy and update helpers.
        discounting_factor_gamma: Discount factor forwarded to the update.
        maximum_episode_length: Maximum number of steps taken before the episode is cut off.
        state_index: State in which the episode starts.

    Returns:
        Tuple (state_action_value_function1, state_action_value_function2, policy).
    """
    for _ in range(maximum_episode_length):
        # Refresh the policy row for the current state before acting on it
        policy = epsilon_greedy_policy_from_state_action_function(max_number_of_actions, state_action_value_function1, state_action_value_function2, epsilon, state_index, policy)

        # Sample the action from that row
        action_index = np.random.choice(a = max_number_of_actions, p = policy[state_index, :])

        # Sample which successor transition is taken for this state-action pair
        successor_count = number_of_state_action_successor_states[state_index, action_index]
        successor_probabilities = state_action_successor_state_transition_probabilities[state_index, action_index, :]
        successor_state_transition_index = np.random.choice(a = successor_count, p = successor_probabilities)

        # Look up the reward and the state reached by that transition
        reward = state_action_successor_state_rewards[state_index, action_index, successor_state_transition_index]
        next_state_index = state_action_successor_state_indices[state_index, action_index, successor_state_transition_index]

        # Random draw deciding which table is updated; the other one supplies the expected value
        update_first_table = np.random.randint(low = 0, high = 2, dtype = np.int64) == 0
        if update_first_table:
            state_action_value_function1, state_action_value_function2, policy, state_index = update_state_action_value_function(number_of_non_terminal_states, max_number_of_actions, state_action_value_function2, policy, alpha, epsilon, discounting_factor_gamma, state_index, action_index, reward, next_state_index, state_action_value_function1)
        else:
            state_action_value_function2, state_action_value_function1, policy, state_index = update_state_action_value_function(number_of_non_terminal_states, max_number_of_actions, state_action_value_function1, policy, alpha, epsilon, discounting_factor_gamma, state_index, action_index, reward, next_state_index, state_action_value_function2)

        # The episode is over once a terminal state has been reached
        if next_state_index >= number_of_non_terminal_states:
            break

    return state_action_value_function1, state_action_value_function2, policy

In [138]:
# This function updates the state-action-value function
def update_state_action_value_function(number_of_non_terminal_states, max_number_of_actions, not_updating_state_action_value_function, policy, alpha, epsilon, discounting_factor_gamma, state_index, action_index, reward, next_state_index, updating_state_action_value_function):
    """Apply one temporal-difference update to the table selected for updating.

    Args:
        number_of_non_terminal_states: States with an index at or above this value are terminal.
        max_number_of_actions: Number of actions, forwarded to the policy helper.
        not_updating_state_action_value_function: Table that supplies the expected value of the next state; it is not written to here.
        policy: Policy table; the row of the next state is refreshed when that state is non-terminal.
        alpha: Learning rate.
        epsilon: Exploration level forwarded to the policy helper.
        discounting_factor_gamma: Discount applied to the expected value of the next state.
        state_index: State the action was taken in.
        action_index: Action that was taken.
        reward: Reward received for the transition.
        next_state_index: State reached by the transition.
        updating_state_action_value_function: Table whose [state_index, action_index] entry is modified in place.

    Returns:
        Tuple (updating table, non-updating table, policy, state_index), where state_index has
        advanced to next_state_index unless that state is terminal.
    """
    reached_terminal_state = next_state_index >= number_of_non_terminal_states

    if reached_terminal_state:
        # Nothing follows a terminal state, so the target is the reward alone
        td_target = reward
    else:
        # Refresh the policy row of the next state from both tables
        policy = epsilon_greedy_policy_from_state_action_function(max_number_of_actions, updating_state_action_value_function, not_updating_state_action_value_function, epsilon, next_state_index, policy)

        # Expected value of the next state under that policy, read from the table not being updated
        expected_next_value = np.sum(a = policy[next_state_index, :] * not_updating_state_action_value_function[next_state_index, :])
        td_target = reward + discounting_factor_gamma * expected_next_value

    # Move the current estimate a fraction alpha of the way towards the target
    current_estimate = updating_state_action_value_function[state_index, action_index]
    updating_state_action_value_function[state_index, action_index] = current_estimate + alpha * (td_target - current_estimate)

    # Advance to the next state only while the episode is still running
    if not reached_terminal_state:
        state_index = next_state_index

    return updating_state_action_value_function, not_updating_state_action_value_function, policy, state_index

In [139]:
def on_policy_temporal_difference_double_expected_sarsa(number_of_non_terminal_states, max_number_of_actions, number_of_state_action_successor_states, state_action_successor_state_indices, state_action_successor_state_transition_probabilities, state_action_successor_state_rewards, state_action_value_function1, state_action_value_function2, policy, alpha, epsilon, discounting_factor_gamma, maximum_episode_length, number_of_episodes = None):
    """Run the whole training loop over many episodes.

    Args:
        number_of_non_terminal_states: Number of non-terminal states; initial states are drawn from them.
        max_number_of_actions: Number of actions per state.
        number_of_state_action_successor_states: Successor counts, indexed [state, action].
        state_action_successor_state_indices: Successor states, indexed [state, action, transition].
        state_action_successor_state_transition_probabilities: Transition probabilities, same indexing.
        state_action_successor_state_rewards: Rewards, same indexing.
        state_action_value_function1: First value table.
        state_action_value_function2: Second value table.
        policy: Policy table, indexed [state, action].
        alpha: Learning rate.
        epsilon: Exploration level.
        discounting_factor_gamma: Discount factor.
        maximum_episode_length: Maximum number of steps per episode.
        number_of_episodes: How many episodes to run. When omitted (None), the module-level
            variable of the same name is used, which is what this function always did before
            the parameter existed.

    Returns:
        Tuple (state_action_value_function1, state_action_value_function2, policy) after the last episode.

    Raises:
        NameError: If number_of_episodes is omitted and no module-level variable of that name exists.
    """
    if number_of_episodes is None:
        # Backward compatibility: earlier callers relied on the notebook-level variable
        try:
            number_of_episodes = globals()["number_of_episodes"]
        except KeyError:
            raise NameError("name 'number_of_episodes' is not defined") from None

    for _ in range(0, number_of_episodes):
        # Initialize episode to get initial state
        initial_state_index = initialize_epsiode(number_of_non_terminal_states)

        # Loop through episode and update the policy
        state_action_value_function1, state_action_value_function2, policy = loop_through_episode(number_of_non_terminal_states, max_number_of_actions, number_of_state_action_successor_states, state_action_successor_state_indices, state_action_successor_state_transition_probabilities, state_action_successor_state_rewards, state_action_value_function1, state_action_value_function2, policy, alpha, epsilon, discounting_factor_gamma, maximum_episode_length, initial_state_index)

    return state_action_value_function1, state_action_value_function2, policy

# Run algorithm

In [140]:
# Print initial arrays
# Show the starting value tables and policy
print("\nInitial state-action value function1", state_action_value_function1, sep = "\n")

print("\nInitial state-action value function2", state_action_value_function2, sep = "\n")

print("\nInitial policy", policy, sep = "\n")

# Run on policy temporal difference double expected sarsa
state_action_value_function1, state_action_value_function2, policy = on_policy_temporal_difference_double_expected_sarsa(
    number_of_non_terminal_states = number_of_non_terminal_states,
    max_number_of_actions = max_number_of_actions,
    number_of_state_action_successor_states = number_of_state_action_successor_states,
    state_action_successor_state_indices = state_action_successor_state_indices,
    state_action_successor_state_transition_probabilities = state_action_successor_state_transition_probabilities,
    state_action_successor_state_rewards = state_action_successor_state_rewards,
    state_action_value_function1 = state_action_value_function1,
    state_action_value_function2 = state_action_value_function2,
    policy = policy,
    alpha = alpha,
    epsilon = epsilon,
    discounting_factor_gamma = discounting_factor_gamma,
    maximum_episode_length = maximum_episode_length)

# Show the learned value tables and policy
print("\nFinal state-action value function1", state_action_value_function1, sep = "\n")

print("\nFinal state-action value function2", state_action_value_function2, sep = "\n")

print("\nFinal policy", policy, sep = "\n")


Initial state-action value function1
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

Initial state-action value function2
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

Initial policy
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]

Final state-action value function1
[[-3.3074738  -2.164704   -1.         -3.28733294]
 [-4.13521782 -3