In [363]:
import numpy as np

# Create environment

In [364]:
number_of_states = 16
number_of_terminal_states = 2
number_of_non_terminal_states = number_of_states - number_of_terminal_states

In [365]:
max_number_of_actions = 4

In [366]:
number_of_actions_per_non_terminal_state = np.repeat(a = max_number_of_actions, repeats = number_of_non_terminal_states)

In [367]:
number_of_state_action_successor_states = np.repeat(a = 1, repeats = number_of_states * max_number_of_actions)

In [368]:
number_of_state_action_successor_states = np.reshape(a = number_of_state_action_successor_states, newshape = (number_of_states, max_number_of_actions))

In [369]:
state_action_successor_state_indices = np.array([1, 0, 14, 4, 2, 1, 0, 5, 2, 2, 1, 6, 4, 14, 3, 7, 5, 0, 3, 8, 6, 1, 4, 9, 6, 2, 5, 10, 8, 3, 7, 11, 9, 4, 7, 12, 10, 5, 8, 13, 10, 6, 9, 15, 12, 7, 11, 11, 13, 8, 11, 12, 15, 9, 12, 13], dtype = np.int64)

In [370]:
state_action_successor_state_transition_probabilities = np.repeat(a = 1.0, repeats = number_of_non_terminal_states * max_number_of_actions * 1)

In [371]:
state_action_successor_state_rewards = np.repeat(a = -1.0, repeats = number_of_non_terminal_states * max_number_of_actions * 1)

In [372]:
state_action_successor_state_indices = np.reshape(a = state_action_successor_state_indices, newshape = (number_of_non_terminal_states, max_number_of_actions, 1))
state_action_successor_state_transition_probabilities = np.reshape(a = state_action_successor_state_transition_probabilities, newshape = (number_of_non_terminal_states, max_number_of_actions, 1))
state_action_successor_state_rewards = np.reshape(a = state_action_successor_state_rewards, newshape = (number_of_non_terminal_states, max_number_of_actions, 1))

# Set hyperparameters

In [373]:
#Set the n steps
n_steps = 4
# Set the number of episodes
number_of_episodes = 200000
# Set the maximum episode length
maximum_episode_length = 2000
# Set learning rate alpha
alpha = 0.05
# Set discounting factor gamma
discounting_factor_gamma = 1.0

# Create value function and policy arrays

In [374]:
# Create epsiode log
episode_log = {"state_index": np.repeat(a = -1, repeats = maximum_episode_length), 
               "action_index": np.repeat(a = -1, repeats = maximum_episode_length), 
               "reward": np.repeat(a = 0.0, repeats = maximum_episode_length)}

In [375]:
state_action_value_function = np.repeat(a = 0.0, repeats = number_of_states * max_number_of_actions)
state_action_value_function = np.reshape(a = state_action_value_function, newshape = (number_of_states, max_number_of_actions))

In [376]:
policy = np.repeat(a = 1.0 / max_number_of_actions, repeats = number_of_non_terminal_states * max_number_of_actions)
policy = np.reshape(a = policy, newshape = (number_of_non_terminal_states, max_number_of_actions))

# Create algorithm

In [377]:
# Set random seed so that everything is reproducible
np.random.seed(seed = 0)

In [378]:
# This function initializes episodes
def initialize_epsiode(number_of_non_terminal_states, max_number_of_actions, maximum_episode_length, policy, episode_log):
    # Initial state
    episode_log["state_index"][0] = np.random.randint(low = 0, high = number_of_non_terminal_states, dtype = np.int64) # randomly choose an initial state from all non-terminal states

    # Get initial action
    episode_log["action_index"][0] = np.random.choice(a = max_number_of_actions, p = policy[episode_log["state_index"][0], :])

    return maximum_episode_length, episode_log

In [379]:
# This function selects a policy greedily from the state-action-value function
def greedy_policy_from_state_action_function(state_action_value_function, state_index, policy):
    # Save max state-action value and find the number of actions that have the same max state-action value
    max_action_value = np.max(a = state_action_value_function[state_index, :])
    max_action_count = np.count_nonzero(a = state_action_value_function[state_index, :] == max_action_value)
    
    # Apportion policy probability across ties equally for state-action pairs that have the same value and zero otherwise
    max_policy_apportioned_probability_per_action = 1.0 / max_action_count
    policy[state_index, :] = np.where(state_action_value_function[state_index, :] == max_action_value, max_policy_apportioned_probability_per_action, 0.0)
    
    return policy

In [380]:
# This function loops through episodes and updates the policy
def loop_through_episode(number_of_non_terminal_states, max_number_of_actions, number_of_state_action_successor_states, state_action_successor_state_indices, state_action_successor_state_transition_probabilities, state_action_successor_state_rewards, state_action_value_function, policy, alpha, discounting_factor_gamma, maximum_episode_length, max_timestep, episode_log, n_steps):
    # Loop through episode steps until termination
    for t in range(0, maximum_episode_length):
        # Spend a little memory to save computation time
        t_mod_n_plus_1 = t % (n_steps + 1);
        t_plus_1_mod_n_plus_1 = (t + 1) % (n_steps + 1);
        
        if t < max_timestep:
            # Get reward
            successor_state_transition_index = np.random.choice(a = number_of_state_action_successor_states[episode_log["state_index"][t_mod_n_plus_1], episode_log["action_index"][t_mod_n_plus_1]], p = state_action_successor_state_transition_probabilities[episode_log["state_index"][t_mod_n_plus_1], episode_log["action_index"][t_mod_n_plus_1], :])

            episode_log["reward"][t_plus_1_mod_n_plus_1] = state_action_successor_state_rewards[episode_log["state_index"][t_mod_n_plus_1], episode_log["action_index"][t_mod_n_plus_1], successor_state_transition_index]

            # Get next state
            episode_log["state_index"][t_plus_1_mod_n_plus_1] = state_action_successor_state_indices[episode_log["state_index"][t_mod_n_plus_1], episode_log["action_index"][t_mod_n_plus_1], successor_state_transition_index]

            # Check to see if we actioned into a terminal state
            if episode_log["state_index"][t_plus_1_mod_n_plus_1] >= number_of_non_terminal_states:
                max_timestep = t + 1
            else:
                # Get next action
                episode_log["action_index"][t_plus_1_mod_n_plus_1] = np.random.randint(low = 0, high = max_number_of_actions, dtype = np.int64) # randomly choose next action from next state
                
        tau = t - n_steps + 1 # tau is the time whose estimate is being updated
        
        if tau >= 0:
            # Calculate expected return
            if t + 1 >= max_timestep:
                # Calculate expected return
                expected_return = episode_log["reward"][max_timestep % (n_steps + 1)]
            else:
                # Calculate expected state value function from policy
                state_value_function_expected_value_on_policy = np.sum(a = policy[episode_log["state_index"][t_plus_1_mod_n_plus_1], :] * state_action_value_function[episode_log["state_index"][t_plus_1_mod_n_plus_1], :])

                # Calculate expected return
                expected_return = episode_log["reward"][t_plus_1_mod_n_plus_1] + discounting_factor_gamma * state_value_function_expected_value_on_policy

            for k in range(min(t, max_timestep - 1), tau, -1):
                # Spend a little memory to save computation time
                k_mod_n_plus_1 = k % (n_steps + 1)
                
                # Calculate expected state value function from policy, however without including kth chosen action
                not_action_taken_mask = np.arange(max_number_of_actions) != episode_log["action_index"][k_mod_n_plus_1]
                not_action_taken_policy = np.extract(condition = not_action_taken_mask, arr = policy[episode_log["state_index"][k_mod_n_plus_1], :])
                not_action_taken_state_action_value = np.extract(condition = not_action_taken_mask, arr = state_action_value_function[episode_log["state_index"][k_mod_n_plus_1], :])
                
                state_value_function_expected_value_on_policy = np.sum(a = not_action_taken_policy * not_action_taken_state_action_value)

            # Spend a little memory to save computation time
            tau_mod_n_plus_1 = tau % (n_steps + 1)

            # Calculate state-action-function at tau timestep
            state_action_value_function[episode_log["state_index"][tau_mod_n_plus_1], episode_log["action_index"][tau_mod_n_plus_1]] += alpha * (expected_return - state_action_value_function[episode_log["state_index"][tau_mod_n_plus_1], episode_log["action_index"][tau_mod_n_plus_1]])

            # Choose policy for chosen state by epsilon-greedy choosing from the state-action-value function
            policy = greedy_policy_from_state_action_function(state_action_value_function, episode_log["state_index"][tau_mod_n_plus_1], policy)

        if tau == max_timestep - 1:
            break # break episode step loop, move on to next episode

    return state_action_value_function, policy

In [381]:
def off_policy_n_step_bootstrapping_tree_backup(number_of_non_terminal_states, max_number_of_actions, number_of_state_action_successor_states, state_action_successor_state_indices, state_action_successor_state_transition_probabilities, state_action_successor_state_rewards, state_action_value_function, policy, alpha, discounting_factor_gamma, maximum_episode_length, episode_log, n_steps):
    for episode in range(0, number_of_episodes):
        # Initialize episode to get initial state and action
        max_timestep, episode_log = initialize_epsiode(number_of_non_terminal_states, max_number_of_actions, maximum_episode_length, policy, episode_log)

        # Loop through episode and update the policy
        state_action_value_function, policy = loop_through_episode(number_of_non_terminal_states, max_number_of_actions, number_of_state_action_successor_states, state_action_successor_state_indices, state_action_successor_state_transition_probabilities, state_action_successor_state_rewards, state_action_value_function, policy, alpha, discounting_factor_gamma, maximum_episode_length, max_timestep, episode_log, n_steps)
    
    return state_action_value_function, policy

# Run algorithm

In [382]:
# Print initial arrays
print("\nInitial state-action value function")
print(state_action_value_function)

print("\nInitial policy")
print(policy)

# Run on policy n-step bootstrapping sarsa
state_action_value_function, policy = off_policy_n_step_bootstrapping_tree_backup(number_of_non_terminal_states, max_number_of_actions, number_of_state_action_successor_states, state_action_successor_state_indices, state_action_successor_state_transition_probabilities, state_action_successor_state_rewards, state_action_value_function, policy, alpha, discounting_factor_gamma, maximum_episode_length, episode_log, n_steps)

# Print final results
print("\nFinal state-action value function")
print(state_action_value_function)

print("\nFinal policy")
print(policy)


Initial state-action value function
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

Initial policy
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]

Final state-action value function
[[-2.99774574 -2.44179948 -1.         -2.48279976]
 [-3.13165697 -2.90680095 -2.31512945 -3.00587791]
 [-3.32875676 -3.37928397 -2.70049158 -3.15115737]
 [-2.9657913  -1.         -2.45394261 -3.18192262]
 [-3.11894451 -2.66316626 -2.32846724 -3.03710149]
 [-3.05239319 -3.07262312 -2.99328998 -2.73403838]
 [-3.06047893 -3.30568351 -2.98893872 -2.6