In [671]:
import numpy as np

# Create environment

In [672]:
# State space size; terminal states occupy the highest indices
# (episode generation treats any index >= number_of_non_terminal_states as terminal).
number_of_states, number_of_terminal_states = 16, 2
number_of_non_terminal_states = number_of_states - number_of_terminal_states

In [673]:
# Largest number of actions available in any state; used as the length of the
# action axis of every state-action array created below.
max_number_of_actions = 4

In [674]:
# Every non-terminal state offers the full set of actions
number_of_actions_per_non_terminal_state = np.full(shape = number_of_non_terminal_states, fill_value = max_number_of_actions)

In [675]:
# Each (state, action) pair has exactly one successor state; stored flat here and reshaped in the next cell
number_of_state_action_successor_states = np.full(shape = number_of_states * max_number_of_actions, fill_value = 1)

In [676]:
# Reshape the flat counts to (state, action).
# The array and the shape are passed positionally: newer NumPy releases make the
# first argument of np.reshape positional-only and deprecate the `newshape` keyword.
number_of_state_action_successor_states = np.reshape(number_of_state_action_successor_states, (number_of_states, max_number_of_actions))

In [677]:
# Flat successor-state table: one row of four entries (one per action) for each
# non-terminal state. Indices 14 and 15 are the terminal states. The array is
# reshaped to (state, action, successor) in a later cell.
state_action_successor_state_indices = np.array(
    [
        1, 0, 14, 4,     # state 0
        2, 1, 0, 5,      # state 1
        2, 2, 1, 6,      # state 2
        4, 14, 3, 7,     # state 3
        5, 0, 3, 8,      # state 4
        6, 1, 4, 9,      # state 5
        6, 2, 5, 10,     # state 6
        8, 3, 7, 11,     # state 7
        9, 4, 7, 12,     # state 8
        10, 5, 8, 13,    # state 9
        10, 6, 9, 15,    # state 10
        12, 7, 11, 11,   # state 11
        13, 8, 11, 12,   # state 12
        15, 9, 12, 13,   # state 13
    ],
    dtype = np.int64,
)

In [678]:
# Deterministic environment: the single successor of every (state, action) pair has probability 1
state_action_successor_state_transition_probabilities = np.full(shape = number_of_non_terminal_states * max_number_of_actions * 1, fill_value = 1.0)

In [679]:
# Every transition yields a reward of -1
state_action_successor_state_rewards = np.full(shape = number_of_non_terminal_states * max_number_of_actions * 1, fill_value = -1.0)

In [680]:
# Give the three flat environment tables the common layout (state, action, successor).
# Arguments are positional because newer NumPy releases make the first argument of
# np.reshape positional-only and deprecate the `newshape` keyword.
environment_table_shape = (number_of_non_terminal_states, max_number_of_actions, 1)
state_action_successor_state_indices = np.reshape(state_action_successor_state_indices, environment_table_shape)
state_action_successor_state_transition_probabilities = np.reshape(state_action_successor_state_transition_probabilities, environment_table_shape)
state_action_successor_state_rewards = np.reshape(state_action_successor_state_rewards, environment_table_shape)

# Set hyperparameters

In [681]:
number_of_episodes = 10_000      # how many episodes the control loop generates
maximum_episode_length = 200     # hard cap on steps per episode (also the episode log capacity)
discounting_factor_gamma = 1.0   # multiplier applied to the running return at each backward step

# Create value function and policy arrays

In [682]:
# Create episode log: fixed-size buffers that are overwritten by every generated episode.
# -1 marks unused state/action slots; rewards default to 0.0.
episode_log = {
    "state_index": np.full(shape = maximum_episode_length, fill_value = -1),
    "action_index": np.full(shape = maximum_episode_length, fill_value = -1),
    "reward": np.zeros(shape = maximum_episode_length),
}

In [683]:
# Monte Carlo control can be sensitive to how the value function is initialized,
# so derive the initial value from the smallest reward in the environment:
# twice that reward when it is negative, zero otherwise.
minimum_reward = np.min(state_action_successor_state_rewards)

state_action_value_function_initializer = 2.0 * minimum_reward if minimum_reward < 0 else 0.0

In [684]:
# Q(s, a) table with shape (state, action), every entry starting at the initializer.
# Built directly in its final shape instead of repeat + np.reshape(a = ..., newshape = ...),
# which newer NumPy releases reject (positional-only array argument, deprecated `newshape`).
state_action_value_function = np.full(shape = (number_of_states, max_number_of_actions), fill_value = state_action_value_function_initializer, dtype = np.float64)

In [685]:
# Running sum of importance-sampling weights per (state, action) pair; it is
# incremented in loop_through_episode_in_reverse and used there as the denominator
# of the incremental weighted average.
weights_cumulative_sum = np.zeros(shape = (number_of_states, max_number_of_actions), dtype = np.float64)

In [686]:
# Target policy starts uniform over the actions of every non-terminal state; shape (state, action).
# Built directly in its final shape instead of repeat + np.reshape(a = ..., newshape = ...),
# which newer NumPy releases reject (positional-only array argument, deprecated `newshape`).
target_policy = np.full(shape = (number_of_non_terminal_states, max_number_of_actions), fill_value = 1.0 / max_number_of_actions, dtype = np.float64)

In [687]:
# Behavior policy is uniform over the actions of every non-terminal state; shape (state, action).
# Built directly in its final shape instead of repeat + np.reshape(a = ..., newshape = ...),
# which newer NumPy releases reject (positional-only array argument, deprecated `newshape`).
behavior_policy = np.full(shape = (number_of_non_terminal_states, max_number_of_actions), fill_value = 1.0 / max_number_of_actions, dtype = np.float64)

# Create algorithm

In [688]:
# Set random seed so that everything is reproducible.
# This seeds NumPy's global generator, which is the one generate_epsiode draws from
# through np.random.randint and np.random.choice.
np.random.seed(seed = 0)

In [689]:
# Sample one episode by following the behavior policy through the environment
def generate_epsiode(number_of_non_terminal_states, max_number_of_actions, number_of_state_action_successor_states, state_action_successor_state_indices, state_action_successor_state_transition_probabilities, state_action_successor_state_rewards, maximum_episode_length, behavior_policy, episode_log):
    """Roll out a single episode and record it in ``episode_log``.

    The start state is drawn uniformly from the non-terminal states. At each
    step an action is sampled from ``behavior_policy`` for the current state,
    a successor transition is sampled from the transition probabilities, and
    the visited state, chosen action and received reward are written into slot
    ``t`` of the log. The rollout stops when a terminal state (index
    ``>= number_of_non_terminal_states``) is entered or after
    ``maximum_episode_length`` steps.

    Returns:
        (steps_taken, episode_log): the number of log slots filled and the same
        log dictionary that was passed in (it is modified in place).
    """
    # Start from a uniformly drawn non-terminal state
    current_state = np.random.randint(low = 0, high = number_of_non_terminal_states, dtype = np.int64)

    steps_taken = 0
    reached_terminal_state = False

    while not reached_terminal_state and steps_taken < maximum_episode_length:
        # Record where we are
        episode_log["state_index"][steps_taken] = current_state

        # Sample the action from the behavior policy
        chosen_action = np.random.choice(a = max_number_of_actions, p = behavior_policy[current_state, :])
        episode_log["action_index"][steps_taken] = chosen_action

        # Sample which successor transition occurs for this (state, action) pair
        successor_count = number_of_state_action_successor_states[current_state, chosen_action]
        successor_probabilities = state_action_successor_state_transition_probabilities[current_state, chosen_action, :]
        transition = np.random.choice(a = successor_count, p = successor_probabilities)

        # Record the reward, then move to the successor state
        episode_log["reward"][steps_taken] = state_action_successor_state_rewards[current_state, chosen_action, transition]
        current_state = state_action_successor_state_indices[current_state, chosen_action, transition]

        steps_taken += 1

        # Entering a terminal state ends the episode
        reached_terminal_state = current_state >= number_of_non_terminal_states

    return steps_taken, episode_log

In [690]:
# Make the policy greedy with respect to the state-action-value function for one state
def greedy_policy_from_state_action_function(state_action_value_function, state_index, policy):
    """Overwrite ``policy[state_index]`` with the greedy distribution for that state.

    Actions whose value equals the row maximum share the probability mass
    equally; every other action gets probability zero.

    Returns:
        (probability_per_greedy_action, policy): the probability given to each
        maximizing action and the same policy array that was passed in
        (modified in place).
    """
    action_values = state_action_value_function[state_index, :]

    # Boolean mask of the actions tied for the best value
    is_greedy_action = action_values == np.max(a = action_values)

    # Split the probability evenly across the tied actions
    probability_per_greedy_action = 1.0 / np.count_nonzero(a = is_greedy_action)
    policy[state_index, :] = np.where(is_greedy_action, probability_per_greedy_action, 0.0)

    return probability_per_greedy_action, policy

In [691]:
# This function loops through an episode in reverse order and updates the target policy
def loop_through_episode_in_reverse(number_of_non_terminal_states, max_number_of_actions, state_action_value_function, weights_cumulative_sum, target_policy, behavior_policy, discounting_factor_gamma, episode_log, episode_length):
    """Apply the off-policy Monte Carlo control update for one recorded episode.

    Walks the first ``episode_length`` entries of ``episode_log`` backwards,
    accumulating the discounted return and, for each visited (state, action)
    pair, performing a weighted-importance-sampling incremental update of
    ``state_action_value_function``. After each update the target policy for
    that state is made greedy. The walk stops as soon as the logged action is
    not one of the greedy actions, because earlier steps would then carry an
    importance weight of zero.

    ``number_of_non_terminal_states`` and ``max_number_of_actions`` are accepted
    for signature compatibility but are not used here.

    Returns:
        (state_action_value_function, target_policy, weights_cumulative_sum):
        the same arrays that were passed in, modified in place.
    """
    expected_return = 0.0
    weight = 1.0

    # Loop through episode steps in reverse order
    for t in range(episode_length - 1, -1, -1):
        state_index = episode_log["state_index"][t]
        action_index = episode_log["action_index"][t]

        # Return from step t onwards: G <- gamma * G + R
        expected_return = discounting_factor_gamma * expected_return + episode_log["reward"][t]

        # Keep track of the cumulative weight so the weighted average can be updated incrementally
        weights_cumulative_sum[state_index, action_index] += weight

        # Update state-action value function: Q <- Q + (W / C) * (G - Q)
        state_action_value_function[state_index, action_index] += weight / weights_cumulative_sum[state_index, action_index] * (expected_return - state_action_value_function[state_index, action_index])

        # Choose policy for this state by greedily choosing from the state-action-value function
        max_policy_apportioned_probability_per_action, target_policy = greedy_policy_from_state_action_function(state_action_value_function, state_index, target_policy)

        # Check to see if the behavior action from the episode is one of the target (greedy) actions
        if target_policy[state_index, action_index] != max_policy_apportioned_probability_per_action:
            break # the target policy would never take this action, so earlier steps get zero weight

        # Importance-sampling ratio pi(a|s) / b(a|s). The target policy splits its
        # probability across tied greedy actions, so pi(a|s) is not always 1 and
        # has to appear in the numerator.
        weight *= target_policy[state_index, action_index] / behavior_policy[state_index, action_index]

    return state_action_value_function, target_policy, weights_cumulative_sum

In [692]:
def off_policy_monte_carlo_control(number_of_non_terminal_states, max_number_of_actions, state_action_value_function, weights_cumulative_sum, target_policy, behavior_policy, discounting_factor_gamma, episode_log, episode_length):
    """Run off-policy Monte Carlo control for ``number_of_episodes`` episodes.

    Each iteration generates one episode with the behavior policy and then
    replays it backwards to update the state-action values and the target policy.

    Besides its parameters, this function reads the module-level names
    ``number_of_episodes``, ``maximum_episode_length`` and the environment tables
    (``number_of_state_action_successor_states``, ``state_action_successor_state_indices``,
    ``state_action_successor_state_transition_probabilities``,
    ``state_action_successor_state_rewards``). The incoming ``episode_length``
    value is overwritten by each generated episode's length.

    Returns:
        (state_action_value_function, target_policy) after the last episode.
    """
    for _ in range(number_of_episodes):
        # Sample a fresh episode; its length tells the update how much of the log is valid
        episode_length, episode_log = generate_epsiode(
            number_of_non_terminal_states,
            max_number_of_actions,
            number_of_state_action_successor_states,
            state_action_successor_state_indices,
            state_action_successor_state_transition_probabilities,
            state_action_successor_state_rewards,
            maximum_episode_length,
            behavior_policy,
            episode_log,
        )

        # Replay the episode backwards, improving the value estimates and the target policy
        state_action_value_function, target_policy, weights_cumulative_sum = loop_through_episode_in_reverse(
            number_of_non_terminal_states,
            max_number_of_actions,
            state_action_value_function,
            weights_cumulative_sum,
            target_policy,
            behavior_policy,
            discounting_factor_gamma,
            episode_log,
            episode_length,
        )

    return state_action_value_function, target_policy

# Run algorithm

In [693]:
# Print initial arrays
print("\nInitial state-action value function")
print(state_action_value_function)

print("\nInitial policy")
print(target_policy)

# Run off policy monte carlo control.
# The final argument fills the function's `episode_length` parameter, which the
# function overwrites with the length of every generated episode. A literal
# placeholder is passed because no module-level `episode_length` is defined, so
# referencing one would raise NameError on a fresh kernel.
state_action_value_function, target_policy = off_policy_monte_carlo_control(number_of_non_terminal_states, max_number_of_actions, state_action_value_function, weights_cumulative_sum, target_policy, behavior_policy, discounting_factor_gamma, episode_log, 0)

# Print final results
print("\nFinal state-action value function")
print(state_action_value_function)
print("\nFinal policy")
print(target_policy)


Initial state-action value function
[[-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]]

Initial policy
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]

Final state-action value function
[[-3. -2. -1. -3.]
 [-2. -3. -2. -4.]
 [-2. -2. -3. -3.]
 [-3. -1. -2. -3.]
 [-4. -2. -2. -4.]
 [-3. -3. -3. -3.]
 [-3. -2. -4. -2.]
 [-4. -2. -3. -2.]
 [-3. -3. -3. -3.]
 [-2. -4. -4. -2.]
 [-2. -3. -3. -1.]
 [-3. -3. -2. -2.]
 [-2. -4. -2. -3.]
 [-1. -3. -3. -2.]
 [-2. -2. -2. -2.]