In [671]:
import numpy as np

# Create environment

In [41]:
num_states = 16
num_terminal_states = 2
num_non_terminal_states = num_states - num_terminal_states

In [42]:
max_num_actions = 4

In [43]:
num_actions_per_non_terminal_state = np.repeat(
    a=max_num_actions, repeats=num_non_terminal_states)

In [44]:
num_state_action_successor_states = np.repeat(
    a=1, repeats=num_states * max_num_actions)

In [45]:
num_state_action_successor_states = np.reshape(
    a=num_state_action_successor_states,
    newshape=(num_states, max_num_actions))

In [46]:
sp_idx = np.array(
    object=[1, 0, 14, 4,
            2, 1, 0, 5,
            2, 2, 1, 6,
            4, 14, 3, 7,
            5, 0, 3, 8,
            6, 1, 4, 9,
            6, 2, 5, 10,
            8, 3, 7, 11,
            9, 4, 7, 12,
            10, 5, 8, 13,
            10, 6, 9, 15,
            12, 7, 11, 11,
            13, 8, 11, 12,
            15, 9, 12, 13],
    dtype=np.int64)

In [47]:
p = np.repeat(a=1.0, repeats=num_non_terminal_states * max_num_actions * 1)

In [48]:
r = np.repeat(a=-1.0, repeats=num_non_terminal_states * max_num_actions * 1)

In [49]:
sp_idx = np.reshape(
    a=sp_idx,
    newshape=(num_non_terminal_states, max_num_actions, 1))
p = np.reshape(
    a=p,
    newshape=(num_non_terminal_states, max_num_actions, 1))
r = np.reshape(
    a=r,
    newshape=(num_non_terminal_states, max_num_actions, 1))

# Set hyperparameters

In [681]:
num_episodes = 10000
maximum_episode_length = 200
gamma = 1.0

# Create value function and policy arrays

In [682]:
# Create epsiode log
episode_log = {
    "s_idx": np.repeat(a=-1, repeats=maximum_episode_length),
    "a_idx": np.repeat(a=-1, repeats=maximum_episode_length),
    "reward": np.repeat(a=0.0, repeats=maximum_episode_length)
}

In [683]:
# Get minimum reward since GLIE MC can have some problems based on
# value function intialization
minimum_reward = np.min(r)

if minimum_reward < 0:
    q_initializer = 2.0 * minimum_reward
else:
    q_initializer = 0.0

In [684]:
q = np.repeat(a=q_initializer, repeats=num_states * max_num_actions)
q = np.reshape(a=q, newshape=(num_states, max_num_actions))

In [685]:
weights_cum_sum = np.zeros(
    shape=(num_states, max_num_actions), dtype=np.float64)

In [686]:
target_policy = np.repeat(
    a=1.0 / max_num_actions, repeats=num_non_terminal_states * max_num_actions)
target_policy = np.reshape(
    a=target_policy, newshape=(num_non_terminal_states, max_num_actions))

In [687]:
behavior_policy = np.repeat(
    a=1.0 / max_num_actions, repeats=num_non_terminal_states * max_num_actions)
behavior_policy = np.reshape(
    a=behavior_policy, newshape=(num_non_terminal_states, max_num_actions))

# Create algorithm

In [688]:
# Set random seed so that everything is reproducible
np.random.seed(seed=0)

In [689]:
# This function generates episodes
def generate_epsiode(
        num_non_terminal_states,
        max_num_actions,
        num_state_action_successor_states,
        sp_idx,
        p,
        r,
        maximum_episode_length,
        behavior_policy,
        episode_log):
    step_count = 0

    # Initial state
    s_idx = np.random.randint(
        low=0, high=num_non_terminal_states, dtype=np.int64)

    # Now repeat
    while step_count < maximum_episode_length:
        # Get state
        episode_log["s_idx"][step_count] = s_idx

        # Get action
        a_idx = np.random.choice(
            a=max_num_actions, p=behavior_policy[s_idx, :])
        episode_log["a_idx"][step_count] = a_idx

        # Get reward
        successor_state_transition_idx = np.random.choice(
            a=num_state_action_successor_states[s_idx, a_idx],
            p=p[s_idx, a_idx, :])

        episode_log["reward"][step_count] = r[s_idx,
                                              a_idx,
                                              successor_state_transition_idx]

        # Get next state
        s_idx = sp_idx[s_idx,
                           a_idx,
                           successor_state_transition_idx]

        # Increment step count
        step_count += 1

        # Check to see if we actioned into a terminal state
        if s_idx >= num_non_terminal_states:
            break  # episode terminated since we ended up in a terminal state

    return step_count, episode_log

In [690]:
# This function selects a policy greedily from the state-action-value function
def greedy_policy_from_state_action_function(q, s_idx, policy):
    # Save max state-action value and find the number of actions that have the
    # same max state-action value
    max_action_value = np.max(a=q[s_idx, :])
    max_action_count = np.count_nonzero(
        a=q[s_idx, :] == max_action_value)

    # Apportion policy probability across ties equally for state-action pairs
    # that have the same value and zero otherwise
    max_policy_prob_per_action = 1.0 / max_action_count
    policy[s_idx, :] = np.where(
        q[s_idx, :] == max_action_value,
        max_policy_prob_per_action,
        0.0)

    return max_policy_prob_per_action, policy

In [691]:
# This function loops through episodes in reverse order and updates the target
# policy
def loop_through_episode_in_reverse(
        num_non_terminal_states,
        max_num_actions,
        q,
        weights_cum_sum,
        target_policy,
        behavior_policy,
        gamma,
        episode_log,
        episode_length):
    expected_return = 0.0
    weight = 1.0

    # Loop through episode steps in reverse order
    for t in range(episode_length - 1, -1, -1):
        s_idx = episode_log["s_idx"][t]
        a_idx = episode_log["a_idx"][t]

        # Calculate expected return
        expected_return = gamma * expected_return + episode_log["reward"][t]

        # Keep track of weight so that we can incrementally calculate average
        weights_cum_sum[s_idx, a_idx] += weight

        # Update state-action value function
        delta = expected_return - q[s_idx, a_idx]
        weight_ratio = weight / weights_cum_sum[s_idx, a_idx]
        q[s_idx, a_idx] += weight_ratio * delta

        # Choose policy for chosen state by greedily choosing from the
        # state-action-value function
        (max_policy_prob_per_action,
         target_policy) = greedy_policy_from_state_action_function(
            q, s_idx, target_policy)

        # Check to see if behavior action from episode is the same as
        # target action
        if target_policy[s_idx, a_idx] != max_policy_prob_per_action:
            break  # break episode step loop, move on to next episode

        # Update weight based on behavior policy
        weight /= behavior_policy[s_idx, a_idx]

    return q, target_policy, weights_cum_sum

In [692]:
def off_policy_monte_carlo_control(
        num_non_terminal_states,
        max_num_actions,
        q,
        weights_cum_sum,
        target_policy,
        behavior_policy,
        gamma,
        episode_log,
        episode_length):
    for episode in range(0, num_episodes):
        # Generate episode and get the length
        episode_length, episode_log = generate_epsiode(
            num_non_terminal_states,
            max_num_actions,
            num_state_action_successor_states,
            sp_idx,
            p,
            r,
            maximum_episode_length,
            behavior_policy,
            episode_log)

        # Loop through episode in reverse order and update the target policy
        q, target_policy, weights_cum_sum = loop_through_episode_in_reverse(
            num_non_terminal_states,
            max_num_actions,
            q,
            weights_cum_sum,
            target_policy,
            behavior_policy,
            gamma,
            episode_log,
            episode_length)

    return q, target_policy

# Run algorithm

In [693]:
# Print initial arrays
print("\nInitial state-action value function")
print(q)

print("\nInitial policy")
print(target_policy)

# Run off policy monte carlo control
q, target_policy = off_policy_monte_carlo_control(
    num_non_terminal_states,
    max_num_actions,
    q,
    weights_cum_sum,
    target_policy,
    behavior_policy,
    gamma,
    episode_log,
    episode_length)

# Print final results
print("\nFinal state-action value function")
print(q)
print("\nFinal policy")
print(target_policy)


Initial state-action value function
[[-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -2.]]

Initial policy
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]

Final state-action value function
[[-3. -2. -1. -3.]
 [-2. -3. -2. -4.]
 [-2. -2. -3. -3.]
 [-3. -1. -2. -3.]
 [-4. -2. -2. -4.]
 [-3. -3. -3. -3.]
 [-3. -2. -4. -2.]
 [-4. -2. -3. -2.]
 [-3. -3. -3. -3.]
 [-2. -4. -4. -2.]
 [-2. -3. -3. -1.]
 [-3. -3. -2. -2.]
 [-2. -4. -2. -3.]
 [-1. -3. -3. -2.]
 [-2. -2. -2. -2.]