In [514]:
import numpy as np

# Create environment

In [41]:
# Total number of states in the environment, terminal states included
number_of_states = 16
# Number of terminal states; these get no rows in the transition, reward,
# or policy arrays built below
number_of_terminal_states = 2
# Number of states that have actions, rewards, and a policy row
number_of_non_terminal_states = number_of_states - number_of_terminal_states

In [42]:
# Size of the action axis shared by the transition, reward, state-action
# value, and policy arrays
max_number_of_actions = 4

In [43]:
# One entry per non-terminal state; every state offers the full action set
number_of_actions_per_non_terminal_state = np.full(
    number_of_non_terminal_states, max_number_of_actions)

In [44]:
# Flat array with one entry per (state, action) pair, each equal to 1:
# every state-action pair has a single successor state
number_of_state_action_successor_states = np.full(
    number_of_states * max_number_of_actions, 1)

In [45]:
# Arrange the successor counts as a (state, action) table.
# The target shape is passed positionally because the `newshape=` keyword of
# np.reshape is deprecated in recent NumPy releases.
number_of_state_action_successor_states = np.reshape(
    number_of_state_action_successor_states,
    (number_of_states, max_number_of_actions))

In [46]:
# Successor state index for every (non-terminal state, action) pair.
# Each inner row lists the four successors of one state; the table is then
# flattened to 1-D (it is reshaped to its final layout in a later cell).
state_action_successor_state_indices = np.array(
    [
        [1, 0, 14, 4],      # state 0
        [2, 1, 0, 5],       # state 1
        [2, 2, 1, 6],       # state 2
        [4, 14, 3, 7],      # state 3
        [5, 0, 3, 8],       # state 4
        [6, 1, 4, 9],       # state 5
        [6, 2, 5, 10],      # state 6
        [8, 3, 7, 11],      # state 7
        [9, 4, 7, 12],      # state 8
        [10, 5, 8, 13],     # state 9
        [10, 6, 9, 15],     # state 10
        [12, 7, 11, 11],    # state 11
        [13, 8, 11, 12],    # state 12
        [15, 9, 12, 13],    # state 13
    ],
    dtype=np.int64,
).flatten()

In [47]:
# Deterministic dynamics: the single successor of every (state, action) pair
# is reached with probability 1.0 (the trailing `* 1` is the successor count)
state_action_successor_state_transition_probabilities = np.full(
    number_of_non_terminal_states * max_number_of_actions * 1, 1.0)

In [48]:
# Every transition yields a reward of -1.0 (the trailing `* 1` is the
# successor count)
state_action_successor_state_rewards = np.full(
    number_of_non_terminal_states * max_number_of_actions * 1, -1.0)

In [49]:
# Give the three flat environment arrays their final layout, indexed as
# [non-terminal state, action, successor].
# The target shape is passed positionally because the `newshape=` keyword of
# np.reshape is deprecated in recent NumPy releases.
state_action_successor_state_indices = np.reshape(
    state_action_successor_state_indices,
    (number_of_non_terminal_states, max_number_of_actions, 1))
state_action_successor_state_transition_probabilities = np.reshape(
    state_action_successor_state_transition_probabilities,
    (number_of_non_terminal_states, max_number_of_actions, 1))
state_action_successor_state_rewards = np.reshape(
    state_action_successor_state_rewards,
    (number_of_non_terminal_states, max_number_of_actions, 1))

# Create value function and policy arrays

In [50]:
# Q(s, a): one row per non-terminal state, one column per action
state_action_value_function = np.zeros(
    (number_of_non_terminal_states, max_number_of_actions), dtype=np.float64)
# V(s): one entry per state, terminal states included
state_value_function = np.zeros(number_of_states, dtype=np.float64)

In [51]:
# Start from the uniform random policy: every action equally likely.
# Built flat here; a later cell reshapes it to (state, action).
policy = np.full(
    number_of_non_terminal_states * max_number_of_actions,
    1.0 / max_number_of_actions)

In [52]:
# Arrange the policy as a (non-terminal state, action) table.
# The target shape is passed positionally because the `newshape=` keyword of
# np.reshape is deprecated in recent NumPy releases.
policy = np.reshape(
    policy,
    (number_of_non_terminal_states, max_number_of_actions))

# Set hyperparameters

In [527]:
# Discount applied to the value of successor states (1.0 = undiscounted)
discounting_factor_gamma = 1.0
# Threshold compared against the largest state-value change seen during
# policy evaluation
convergence_threshold = 0.001
# Upper bound on outer evaluation/improvement rounds in policy_iteration
maximum_number_of_sweeps = 30
# Upper bound on passes over the states within one policy_evaluation call
maximum_number_of_policy_evaluations = 20

# Create algorithm

In [528]:
# This function evaluates the value functions given the current policy
def policy_evaluation(
    number_of_non_terminal_states,
    state_action_successor_state_indices,
    state_action_successor_state_transition_probabilities,
    state_action_successor_state_rewards,
    policy,
    convergence_threshold,
    discounting_factor_gamma,
    maximum_number_of_policy_evaluations,
    state_value_function,
    state_action_value_function):
    """Evaluates current policy with in-place iterative sweeps.

    Each sweep visits every non-terminal state once, recomputes Q(s, a) as
    the expectation over successor states of reward plus discounted
    successor value, and then sets V(s) to the policy-weighted sum of
    Q(s, a). Updated values are used immediately by later states in the
    same sweep. Sweeping stops as soon as the largest change in V(s) within
    a sweep drops below the convergence threshold, or when the maximum
    number of sweeps has been performed.

    Both value function arrays are modified in place and also returned.

    Args:
        number_of_non_terminal_states: int, number of non terminal states.
        state_action_successor_state_indices: array[int], state indices of new
            state s' of taking action a from state s, indexed as
            [state, action, successor].
        state_action_successor_state_transition_probabilities: array[float],
            transition probability to go from state s to s' by taking action
            a, indexed as [state, action, successor].
        state_action_successor_state_rewards: array[float], reward from new
            state s' from state s by taking action a, indexed as
            [state, action, successor].
        policy: array[float], stochastic policy of which action a to take in
            state s, indexed as [state, action].
        convergence_threshold: float, sweeping stops once the largest
            absolute change of V(s) within one sweep is below this value.
        discounting_factor_gamma: float, 0 <= gamma <= 1, amount to discount
            future rewards.
        maximum_number_of_policy_evaluations: int, max number of sweeps.
        state_value_function: array[float], keeps track of the estimated
            value of each state V(s).
        state_action_value_function: array[float], keeps track of the estimated
            value of each state-action pair Q(s, a).
    Returns:
        state_value_function: array, estimate of state value function V(s).
        state_action_value_function: array, estimate of state-action value
            function Q(s, a).
    """
    number_of_policy_evaluations = 0

    while number_of_policy_evaluations < maximum_number_of_policy_evaluations:
        # Largest change seen in THIS sweep. It must be reset every sweep;
        # a running maximum across sweeps could never fall below the
        # threshold, so convergence would never be detected.
        delta = 0.0

        for i in range(number_of_non_terminal_states):
            # Cache state-value function for state i
            previous_state_value = state_value_function[i]

            # Current value estimates of every successor of state i,
            # one row per action and one column per successor
            successor_state_values = state_value_function[
                state_action_successor_state_indices[i, :, :]]

            # Update state-action value function: expectation over the
            # successor axis of reward plus discounted successor value
            state_action_value_function[i, :] = np.sum(
                state_action_successor_state_transition_probabilities[i, :, :]
                * (state_action_successor_state_rewards[i, :, :]
                   + discounting_factor_gamma * successor_state_values),
                axis=1)

            # Update state value function based on current policy
            state_value_function[i] = np.sum(
                policy[i, :] * state_action_value_function[i, :])

            # Track the largest change for the convergence criterion
            delta = max(
                delta,
                abs(previous_state_value - state_value_function[i]))

        number_of_policy_evaluations += 1

        # Converged: no state value moved by the threshold or more
        if delta < convergence_threshold:
            break

    return state_value_function, state_action_value_function

In [529]:
# This function greedily updates the policy based on the current value function
def policy_improvement(
    number_of_non_terminal_states,
    state_action_successor_state_indices,
    state_action_successor_state_transition_probabilities,
    state_action_successor_state_rewards,
    policy,
    old_policy,
    discounting_factor_gamma,
    state_value_function):
    """Improves policy greedily based on the current state value estimates.

    For every non-terminal state the one-step lookahead value of each action
    is computed from the state value function, and the policy row is
    replaced by a distribution that splits probability equally among the
    actions with the maximal value and assigns zero to all others. The
    policy array is modified in place and also returned.

    Args:
        number_of_non_terminal_states: int, number of non terminal states.
        state_action_successor_state_indices: array[int], state indices of new
            state s' of taking action a from state s, indexed as
            [state, action, successor].
        state_action_successor_state_transition_probabilities: array[float],
            transition probability to go from state s to s' by taking action
            a, indexed as [state, action, successor].
        state_action_successor_state_rewards: array[float], reward from new
            state s' from state s by taking action a, indexed as
            [state, action, successor].
        policy: array[float], learned stochastic policy of which action a to
            take in state s, indexed as [state, action].
        old_policy: unused; kept so existing callers do not break. Each
            state's previous policy row is copied internally for comparison.
        discounting_factor_gamma: float, 0 <= gamma <= 1, amount to discount
            future rewards.
        state_value_function: array[float], keeps track of the estimated
            value of each state V(s).
    Returns:
        policy_stable: bool, True only if no state's policy row changed
            (also True when there are no non-terminal states).
        policy: array, learned stochastic policy of which action a to take in
            state s.
    """
    # Stable until some state's policy row is found to have changed
    policy_stable = True

    for i in range(number_of_non_terminal_states):
        # Cache this state's policy row for comparison later
        previous_state_policy = np.copy(policy[i, :])

        # One-step lookahead value of every action: expectation over the
        # successor axis of reward plus discounted successor value
        action_values = np.sum(
            state_action_successor_state_transition_probabilities[i, :, :]
            * (state_action_successor_state_rewards[i, :, :]
               + discounting_factor_gamma
               * state_value_function[
                   state_action_successor_state_indices[i, :, :]]),
            axis=1)

        # Find the actions tied for the best value
        is_greedy_action = action_values == np.max(action_values)
        number_of_greedy_actions = np.count_nonzero(is_greedy_action)

        # Apportion policy probability across ties equally for state-action
        # pairs that have the same value and zero otherwise
        policy[i, :] = np.where(
            is_greedy_action,
            1.0 / number_of_greedy_actions,
            0.0)

        # A change in ANY state makes the policy unstable; the flag must
        # accumulate across states rather than reflect only the last one
        if not np.array_equal(policy[i, :], previous_state_policy):
            policy_stable = False

    return policy_stable, policy

In [530]:
def policy_iteration(
    number_of_non_terminal_states,
    state_action_successor_state_indices,
    state_action_successor_state_transition_probabilities,
    state_action_successor_state_rewards,
    policy,
    old_policy,
    convergence_threshold,
    discounting_factor_gamma,
    maximum_number_of_policy_evaluations,
    state_value_function,
    state_action_value_function,
    maximum_number_of_sweeps):
    """Alternates policy evaluation and policy improvement.

    Every sweep prints the value functions and the policy, evaluates the
    current policy, then improves it greedily. Sweeping ends once the
    improvement step reports a stable policy or the sweep budget is used up.

    Args:
        number_of_non_terminal_states: int, number of non terminal states.
        state_action_successor_state_indices: array[int], state indices of new
            state s' of taking action a from state s.
        state_action_successor_state_transition_probabilities: array[float],
            transition probability to go from state s to s' by taking action a.
        state_action_successor_state_rewards: array[float], reward from new
            state s' from state s by taking action a.
        policy: array[float], learned stochastic policy of which action a to
            take in state s.
        old_policy: array[float], previously learned stochastic policy of which
            action a to take in state s; forwarded to policy_improvement.
        convergence_threshold: float, forwarded to policy_evaluation.
        discounting_factor_gamma: float, 0 <= gamma <= 1, amount to discount
            future rewards.
        maximum_number_of_policy_evaluations: int, forwarded to
            policy_evaluation.
        state_value_function: array[float], keeps track of the estimated
            value of each state V(s).
        state_action_value_function: array[float], keeps track of the estimated
            value of each state-action pair Q(s, a).
        maximum_number_of_sweeps: int, max number of outer loop sweeps.
    Returns:
        state_value_function: array, estimate of state value function V(s).
        state_action_value_function: array, estimate of state-action value
            function Q(s, a).
        policy: array, learned stochastic policy of which action a to take in
            state s.
    """
    number_of_sweeps = 0

    while number_of_sweeps < maximum_number_of_sweeps:
        # Report the state going into this sweep
        snapshots = (
            ("State value function before sweep {}", state_value_function),
            ("State-action value function before sweep {}",
             state_action_value_function),
            ("Policy before sweep {}", policy),
        )
        for heading, values in snapshots:
            print(heading.format(number_of_sweeps))
            print(values)
            print("\n")

        # Policy evaluation
        state_value_function, state_action_value_function = policy_evaluation(
            number_of_non_terminal_states,
            state_action_successor_state_indices,
            state_action_successor_state_transition_probabilities,
            state_action_successor_state_rewards,
            policy,
            convergence_threshold,
            discounting_factor_gamma,
            maximum_number_of_policy_evaluations,
            state_value_function,
            state_action_value_function)

        # Policy improvement
        policy_stable, policy = policy_improvement(
            number_of_non_terminal_states,
            state_action_successor_state_indices,
            state_action_successor_state_transition_probabilities,
            state_action_successor_state_rewards,
            policy,
            old_policy,
            discounting_factor_gamma,
            state_value_function)

        print("policy_stable = {} at sweep {}\n".format(
            policy_stable, number_of_sweeps))

        number_of_sweeps += 1

        # Nothing left to improve once the policy stops changing
        if policy_stable:
            break

    return state_value_function, state_action_value_function, policy

# Run algorithm

In [531]:
# Run policy iteration

# `old_policy` is part of the policy_iteration signature but is not created
# by any earlier cell, so define it here; otherwise this cell raises a
# NameError on a fresh kernel (Restart & Run All).
old_policy = np.copy(policy)

state_value_function, state_action_value_function, policy = policy_iteration(
    number_of_non_terminal_states,
    state_action_successor_state_indices,
    state_action_successor_state_transition_probabilities,
    state_action_successor_state_rewards,
    policy,
    old_policy,
    convergence_threshold,
    discounting_factor_gamma,
    maximum_number_of_policy_evaluations,
    state_value_function,
    state_action_value_function,
    maximum_number_of_sweeps)

# Print final results
print("\nFinal state value function")
print(state_value_function)
print("\nFinal state-action value function")
print(state_action_value_function)
print("\nFinal policy")
print(policy)

State value function before sweep 0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


State-action value function before sweep 0
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


Policy before sweep 0
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]


policy_stable = False at sweep 0

State value function before sweep 1
[-11.42591538 -16.29940807 -17.92681232 -11.42591538 -14.84125831
 -16.57033147 -16.60954761 -16.29940807 -16.57033147 -15.10598674
 -11.83929409 -17.92681232 -16.60954761 -11.83929409   0.
   0.        ]


State-action value function befo