In [141]:
import numpy as np

# Create environment

In [41]:
# Total number of states in the environment, terminal ones included
num_states = 16
# How many of those states are terminal
num_terminal_states = 2
# The remaining states are non-terminal; the episode loop treats any state
# index >= num_non_terminal_states as terminal
num_non_terminal_states = num_states - num_terminal_states

In [42]:
# Largest number of actions available in any state; used as the action
# dimension of the transition, value-function and policy arrays
max_num_actions = 4

In [43]:
# Every non-terminal state offers the full set of max_num_actions actions
num_actions_per_non_terminal_state = np.full(
    shape=num_non_terminal_states, fill_value=max_num_actions)

In [44]:
# Each (state, action) pair has exactly one possible successor state; the
# counts are built as a flat vector with one entry per pair
num_state_action_successor_states = np.full(
    shape=num_states * max_num_actions, fill_value=1)

In [45]:
# Arrange the flat successor counts as a (state, action) table. The target
# shape is passed positionally because the `newshape` keyword of np.reshape
# is deprecated in recent NumPy releases.
num_state_action_successor_states = np.reshape(
    num_state_action_successor_states,
    (num_states, max_num_actions))

In [46]:
# Successor-state index reached from each (non-terminal state, action) pair.
# One literal row per non-terminal state (0..13), one column per action; the
# table is flattened to 1-D afterwards. Entries 14 and 15 lie outside the
# non-terminal index range, i.e. they lead into terminal states.
sp_idx = np.array(
    object=[[1, 0, 14, 4],
            [2, 1, 0, 5],
            [2, 2, 1, 6],
            [4, 14, 3, 7],
            [5, 0, 3, 8],
            [6, 1, 4, 9],
            [6, 2, 5, 10],
            [8, 3, 7, 11],
            [9, 4, 7, 12],
            [10, 5, 8, 13],
            [10, 6, 9, 15],
            [12, 7, 11, 11],
            [13, 8, 11, 12],
            [15, 9, 12, 13]],
    dtype=np.int64).reshape(-1)

In [47]:
# Transition probability for every (non-terminal state, action, successor)
# triple; with a single successor per pair each probability is 1.0
p = np.full(
    shape=num_non_terminal_states * max_num_actions * 1, fill_value=1.0)

In [48]:
# Reward for every (non-terminal state, action, successor) triple; each
# transition yields a reward of -1.0
r = np.full(
    shape=num_non_terminal_states * max_num_actions * 1, fill_value=-1.0)

In [49]:
# Arrange the flat environment arrays as
# (non-terminal state, action, successor) tables so they can be indexed as
# array[s_idx, a_idx, successor_idx]. The target shape is passed positionally
# because the `newshape` keyword of np.reshape is deprecated in recent NumPy
# releases.
sp_idx = np.reshape(
    sp_idx, (num_non_terminal_states, max_num_actions, 1))
p = np.reshape(
    p, (num_non_terminal_states, max_num_actions, 1))
r = np.reshape(
    r, (num_non_terminal_states, max_num_actions, 1))

# Set hyperparameters

In [151]:
# Number of episodes to run
num_episodes = 10000
# Maximum number of steps taken within a single episode
maximum_episode_length = 200
# Learning rate (step size) alpha used in the value-function updates
alpha = 0.1
# Exploration rate epsilon: total probability given to non-greedy actions
epsilon = 0.1
# Discount factor gamma applied to the expected value of the next state
gamma = 1.0

# Create value function and policy arrays

In [152]:
# Two separate state-action value tables, one row per state and one column
# per action, both starting at zero. They are allocated directly in their
# final shape, which avoids np.reshape's deprecated `newshape` keyword.
q1 = np.zeros((num_states, max_num_actions))

q2 = np.zeros((num_states, max_num_actions))

In [153]:
# Start from the uniform random policy: every action of every non-terminal
# state gets probability 1 / max_num_actions. The table is allocated directly
# in its final shape, which avoids np.reshape's deprecated `newshape` keyword.
policy = np.full(
    (num_non_terminal_states, max_num_actions), 1.0 / max_num_actions)

# Create algorithm

In [154]:
# Seed NumPy's global random number generator so that the random draws made
# by the algorithm (start states, actions, transitions and which value table
# to update) are reproducible
np.random.seed(seed=0)

In [155]:
# Draws the state in which an episode begins
def initialize_epsiode(num_non_terminal_states):
    """Sample a uniformly random non-terminal state to start an episode.

    Args:
        num_non_terminal_states: int, number of non-terminal states. Start
            states are drawn from the indices
            0 .. num_non_terminal_states - 1.

    Returns:
        Index of the sampled start state (drawn with dtype np.int64 from
        NumPy's global random number generator).
    """
    return np.random.randint(
        low=0, high=num_non_terminal_states, dtype=np.int64)

In [156]:
# Rebuilds one state's row of the epsilon-greedy policy from q1 + q2
def epsilon_greedy_policy_from_state_action_function(
        max_num_actions, q1, q2, epsilon, s_idx, policy):
    """Make the policy epsilon-greedy in state s_idx with respect to q1 + q2.

    The actions whose combined value q1 + q2 is maximal share probability
    1 - epsilon equally, and all other actions share epsilon equally. If
    every action ties for the maximum, the row becomes uniform.

    Args:
        max_num_actions: int, number of actions per state.
        q1: first state-action value table, indexed [state, action].
        q2: second state-action value table, indexed [state, action].
        epsilon: float, total probability assigned to non-greedy actions.
        s_idx: index of the state whose policy row is rewritten.
        policy: policy table, indexed [state, action]; row s_idx is
            overwritten in place.

    Returns:
        The same policy array that was passed in, with row s_idx updated.
    """
    # Act with respect to the sum of both value estimates
    combined_q = q1[s_idx, :] + q2[s_idx, :]

    # Flag every action that attains the best combined value
    is_greedy = combined_q == np.max(a=combined_q)
    num_greedy = np.count_nonzero(a=is_greedy)
    num_other = max_num_actions - num_greedy

    if num_other == 0:
        # All actions tie, so there is nothing to explore away from
        greedy_prob = 1.0 / num_greedy
        other_prob = 0.0
    else:
        # Split 1 - epsilon over the best actions and epsilon over the rest
        greedy_prob = (1.0 - epsilon) / num_greedy
        other_prob = epsilon / num_other

    policy[s_idx, :] = np.where(is_greedy, greedy_prob, other_prob)

    return policy

In [157]:
# Plays a single episode, learning after every step
def loop_through_episode(
        num_non_terminal_states,
        max_num_actions,
        num_state_action_successor_states,
        sp_idx,
        p,
        r,
        q1,
        q2,
        policy,
        alpha,
        epsilon,
        gamma,
        maximum_episode_length,
        s_idx):
    """Run one episode, updating q1, q2 and the policy along the way.

    Each step refreshes the epsilon-greedy policy row of the current state,
    samples an action and a transition, and then updates one of the two value
    tables (chosen by a fair coin flip) through update_q. The episode stops
    when a terminal state is reached or after maximum_episode_length steps.

    Args:
        num_non_terminal_states: int; state indices at or above this value
            are treated as terminal.
        max_num_actions: int, number of actions per state.
        num_state_action_successor_states: table indexed [state, action]
            giving how many successor transitions each pair has.
        sp_idx: successor state indices, indexed
            [state, action, transition].
        p: transition probabilities, indexed [state, action, transition].
        r: rewards, indexed [state, action, transition].
        q1: first state-action value table.
        q2: second state-action value table.
        policy: policy table, indexed [state, action].
        alpha: float, learning rate.
        epsilon: float, exploration rate.
        gamma: float, discount factor.
        maximum_episode_length: int, cap on the number of steps.
        s_idx: index of the state the episode starts in.

    Returns:
        Tuple (q1, q2, policy) after the episode.
    """
    for _ in range(maximum_episode_length):
        # Refresh this state's epsilon-greedy action probabilities
        policy = epsilon_greedy_policy_from_state_action_function(
            max_num_actions, q1, q2, epsilon, s_idx, policy)

        # Sample the action to take from those probabilities
        a_idx = np.random.choice(a=max_num_actions, p=policy[s_idx, :])

        # Sample which of the pair's possible transitions occurs
        transition_idx = np.random.choice(
            a=num_state_action_successor_states[s_idx, a_idx],
            p=p[s_idx, a_idx, :])

        # Look up the reward and successor state of that transition
        reward = r[s_idx, a_idx, transition_idx]
        next_s_idx = sp_idx[s_idx, a_idx, transition_idx]

        # A fair coin decides which table learns on this step; the other one
        # supplies the value estimate of the next state
        update_first = np.random.randint(low=0, high=2, dtype=np.int64) == 0
        if update_first:
            learner, evaluator = q1, q2
        else:
            learner, evaluator = q2, q1

        learner, evaluator, policy, s_idx = update_q(
            num_non_terminal_states,
            max_num_actions,
            evaluator,
            policy,
            alpha,
            epsilon,
            gamma,
            s_idx,
            a_idx,
            reward,
            next_s_idx,
            learner)

        # Hand the returned tables back to the names they came from
        if update_first:
            q1, q2 = learner, evaluator
        else:
            q2, q1 = learner, evaluator

        # Reaching a terminal state ends the episode
        if next_s_idx >= num_non_terminal_states:
            break

    return q1, q2, policy

In [158]:
# Applies one double expected SARSA update to a single state-action value
def update_q(
        num_non_terminal_states,
        max_num_actions,
        not_updating_q,
        policy,
        alpha,
        epsilon,
        gamma,
        s_idx,
        a_idx,
        reward,
        next_s_idx,
        updating_q):
    """Update updating_q[s_idx, a_idx] in place from one observed transition.

    For a transition into a terminal state the target is the reward alone.
    Otherwise the target bootstraps from the expected value of the next
    state, computed from the other table (not_updating_q) weighted by the
    policy row currently stored for next_s_idx.

    Args:
        num_non_terminal_states: int; state indices at or above this value
            are treated as terminal.
        max_num_actions: unused here.
        not_updating_q: value table that evaluates the next state; it is
            only read.
        policy: policy table, indexed [state, action]; it is only read.
        alpha: float, learning rate.
        epsilon: unused here.
        gamma: float, discount factor.
        s_idx: index of the state the action was taken in.
        a_idx: index of the action taken.
        reward: reward received for the transition.
        next_s_idx: index of the state that was reached.
        updating_q: value table that is modified in place.

    Returns:
        Tuple (updating_q, not_updating_q, policy, s_idx). The returned
        s_idx is next_s_idx after a non-terminal transition and the
        unchanged input s_idx after a terminal one.
    """
    if next_s_idx >= num_non_terminal_states:
        # Terminal transition: there is no future value to bootstrap from
        updating_q[s_idx, a_idx] += alpha * (
            reward - updating_q[s_idx, a_idx])
        return updating_q, not_updating_q, policy, s_idx

    # Expected value of the next state under the stored policy row,
    # evaluated with the table that is not being updated
    expected_next_value = np.sum(
        a=policy[next_s_idx, :] * not_updating_q[next_s_idx, :])

    # Temporal-difference error of the observed transition
    td_error = reward + (
        gamma * expected_next_value - updating_q[s_idx, a_idx])
    updating_q[s_idx, a_idx] += alpha * td_error

    # The caller continues from the state that was reached
    return updating_q, not_updating_q, policy, next_s_idx

In [159]:
def on_policy_temporal_difference_double_expected_sarsa(
        num_non_terminal_states,
        max_num_actions,
        num_state_action_successor_states,
        sp_idx,
        p,
        r,
        q1,
        q2,
        policy,
        alpha,
        epsilon,
        gamma,
        maximum_episode_length,
        num_episodes=None):
    """Run double expected SARSA control for a number of episodes.

    Each episode starts in a random non-terminal state and is played by
    loop_through_episode, which updates the value tables and the policy.

    Args:
        num_non_terminal_states: int; state indices at or above this value
            are treated as terminal.
        max_num_actions: int, number of actions per state.
        num_state_action_successor_states: table indexed [state, action]
            giving how many successor transitions each pair has.
        sp_idx: successor state indices, indexed
            [state, action, transition].
        p: transition probabilities, indexed [state, action, transition].
        r: rewards, indexed [state, action, transition].
        q1: first state-action value table.
        q2: second state-action value table.
        policy: policy table, indexed [state, action].
        alpha: float, learning rate.
        epsilon: float, exploration rate.
        gamma: float, discount factor.
        maximum_episode_length: int, cap on the steps per episode.
        num_episodes: int or None, number of episodes to run. When None, the
            module-level variable `num_episodes` is used, which is what this
            function always read before the parameter existed.

    Returns:
        Tuple (q1, q2, policy) after all episodes.

    Raises:
        NameError: if num_episodes is None and no module-level
            `num_episodes` is defined.
    """
    if num_episodes is None:
        # Backward compatibility: existing callers define num_episodes at
        # module level instead of passing it in
        if "num_episodes" not in globals():
            raise NameError("name 'num_episodes' is not defined")
        num_episodes = globals()["num_episodes"]

    for _ in range(num_episodes):
        # Pick the state this episode starts in
        init_s_idx = initialize_epsiode(num_non_terminal_states)

        # Play the episode, learning after every step
        q1, q2, policy = loop_through_episode(
            num_non_terminal_states,
            max_num_actions,
            num_state_action_successor_states,
            sp_idx,
            p,
            r,
            q1,
            q2,
            policy,
            alpha,
            epsilon,
            gamma,
            maximum_episode_length,
            init_s_idx)

    return q1, q2, policy

# Run algorithm

In [160]:
# Show the value tables and the policy before any learning has happened
for heading, table in (
        ("\nInitial state-action value function1", q1),
        ("\nInitial state-action value function2", q2),
        ("\nInitial policy", policy)):
    print(heading)
    print(table)

# Learn with on-policy temporal difference double expected SARSA
q1, q2, policy = on_policy_temporal_difference_double_expected_sarsa(
    num_non_terminal_states,
    max_num_actions,
    num_state_action_successor_states,
    sp_idx,
    p,
    r,
    q1,
    q2,
    policy,
    alpha,
    epsilon,
    gamma,
    maximum_episode_length)

# Show the learned value tables and policy
for heading, table in (
        ("\nFinal state-action value function1", q1),
        ("\nFinal state-action value function2", q2),
        ("\nFinal policy", policy)):
    print(heading)
    print(table)


Initial state-action value function1
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

Initial state-action value function2
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

Initial policy
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]

Final state-action value function1
[[-3.29893306 -2.15890896 -1.         -3.29412976]
 [-4.04716222 -3