# Dynamic Programming: Policy Iteration, Stochastic

In [1]:
import numpy as np

## Create environment

In [2]:
def create_environment_states():
    """Defines the sizes of the environment's state space.

    Returns:
        num_states: int, number of states.
        num_terminal_states: int, number of terminal states.
        num_non_terminal_states: int, number of non terminal states.
    """
    total_states, terminal_states = 16, 2

    # Whatever is not a terminal state is a non terminal state.
    return total_states, terminal_states, total_states - terminal_states

In [3]:
def create_environment_actions(num_non_terminal_states):
    """Defines how many actions are available in the environment.

    Args:
        num_non_terminal_states: int, number of non terminal states.

    Returns:
        max_num_actions: int, max number of actions possible.
        num_actions_per_non_terminal_state: array[int], number of actions per
            non terminal state.
    """
    action_count = 4

    # Every non terminal state offers the full set of actions.
    actions_per_state = np.full(num_non_terminal_states, action_count)

    return action_count, actions_per_state

In [4]:
def create_environment_successor_counts(num_states, max_num_actions):
    """Creates environment successor counts.

    Every state-action pair is given exactly one successor state.

    Args:
        num_states: int, number of states.
        max_num_actions: int, max number of actions possible.
    Returns:
        num_state_action_successor_states: array[int], number of successor
            states s' that can be reached from state s by taking action a,
            with shape (num_states, max_num_actions).
    """
    # Build the array directly in its final 2-D shape. This avoids calling
    # np.reshape with the `newshape` keyword, which newer NumPy releases
    # deprecate in favour of `shape`.
    num_state_action_successor_states = np.full(
        (num_states, max_num_actions), 1)

    return num_state_action_successor_states

In [5]:
def create_environment_successor_arrays(
        num_non_terminal_states, max_num_actions):
    """Creates environment successor arrays.

    The successor table below is hard-coded with 14 rows of 4 entries, so the
    arguments must multiply to 56 or the reshape raises a ValueError.

    Args:
        num_non_terminal_states: int, number of non terminal states.
        max_num_actions: int, max number of actions possible.
    Returns:
        sp_idx: array[int], state indices of new state s' of taking action a
            from state s, shape
            (num_non_terminal_states, max_num_actions, 1).
        p: array[float], transition probability to go from state s to s' by
            taking action a, same shape as sp_idx, all 1.0.
        r: array[float], reward from new state s' from state s by taking
            action a, same shape as sp_idx, all -1.0.
    """
    # Trailing axis of length 1: each state-action pair has one successor.
    successor_shape = (num_non_terminal_states, max_num_actions, 1)

    # One row per non terminal state, one column per action.
    # NOTE(review): the columns look like right, up, left, down moves on a
    # 4x4 grid with terminal states 14 and 15 - confirm.
    successor_table = np.array(
        object=[1, 0, 14, 4,
                2, 1, 0, 5,
                2, 2, 1, 6,
                4, 14, 3, 7,
                5, 0, 3, 8,
                6, 1, 4, 9,
                6, 2, 5, 10,
                8, 3, 7, 11,
                9, 4, 7, 12,
                10, 5, 8, 13,
                10, 6, 9, 15,
                12, 7, 11, 11,
                13, 8, 11, 12,
                15, 9, 12, 13],
        dtype=np.int64)

    # Positional arguments keep this call valid on NumPy versions that
    # deprecate the `newshape` keyword.
    sp_idx = np.reshape(successor_table, successor_shape)

    # Every transition is certain and costs one unit of reward.
    p = np.full(successor_shape, 1.0)
    r = np.full(successor_shape, -1.0)

    return sp_idx, p, r

In [6]:
def create_environment():
    """Assembles every piece of the environment description.

    Returns:
        num_states: int, number of states.
        num_terminal_states: int, number of terminal states.
        num_non_terminal_states: int, number of non terminal states.
        max_num_actions: int, max number of actions possible.
        num_actions_per_non_terminal_state: array[int], number of actions per
            non terminal state.
        num_state_action_successor_states: array[int], number of successor
            states s' that can be reached from state s by taking action a.
        sp_idx: array[int], state indices of new state s' of taking action a
            from state s.
        p: array[float], transition probability to go from state s to s' by
            taking action a.
        r: array[float], reward from new state s' from state s by taking
            action a.
    """
    # State and action space sizes.
    num_states, num_terminal_states, num_non_terminal_states = (
        create_environment_states())
    max_num_actions, num_actions_per_non_terminal_state = (
        create_environment_actions(num_non_terminal_states))

    # Successor bookkeeping for every state-action pair.
    successor_counts = create_environment_successor_counts(
        num_states, max_num_actions)
    sp_idx, p, r = create_environment_successor_arrays(
        num_non_terminal_states, max_num_actions)

    state_info = (num_states, num_terminal_states, num_non_terminal_states)
    action_info = (max_num_actions, num_actions_per_non_terminal_state)

    return state_info + action_info + (successor_counts, sp_idx, p, r)

## Set hyperparameters

In [7]:
def set_hyperparameters():
    """Provides the fixed hyperparameters used by the algorithm.

    Returns:
        gamma: float, 0 <= gamma <= 1, amount to discount future reward.
        convergence_threshold: float, minimum maximum change across all value
            function updates.
        maximum_num_sweeps: int, max number of outer loop sweeps.
        maximum_num_policy_evaluations: int, max number of iterations.
    """
    return (
        1.0,    # gamma
        0.001,  # convergence_threshold
        30,     # maximum_num_sweeps
        20,     # maximum_num_policy_evaluations
    )

## Create value function and policy arrays

In [8]:
def create_value_function_arrays(num_states, num_non_terminal_states, max_num_actions):
    """Allocates zero-initialized value function arrays.

    Args:
        num_states: int, number of states.
        num_non_terminal_states: int, number of non terminal states.
        max_num_actions: int, max number of actions possible.
    Returns:
        v: array[float], estimate of state value function V(s), one entry
            per state.
        q: array[float], keeps track of the estimated value of each
            state-action pair Q(s, a), one row per non terminal state.
    """
    q_shape = (num_non_terminal_states, max_num_actions)

    state_values = np.zeros(num_states, dtype=np.float64)
    action_values = np.zeros(q_shape, dtype=np.float64)

    return state_values, action_values

In [9]:
def create_policy_arrays(num_non_terminal_states, max_num_actions):
    """Creates policy arrays.

    The policy starts out uniform: every action in every non terminal state
    has probability 1 / max_num_actions.

    Args:
        num_non_terminal_states: int, number of non terminal states.
        max_num_actions: int, max number of actions possible.
    Returns:
        policy: array[float], learned stochastic policy of which
            action a to take in state s, shape
            (num_non_terminal_states, max_num_actions).
        old_policy: array[float], independent copy of policy to be used for
            comparison with learned policy, tracking changes over time.
    """
    # Build the array directly in its final 2-D shape. This avoids calling
    # np.reshape with the `newshape` keyword, which newer NumPy releases
    # deprecate in favour of `shape`.
    policy = np.full(
        (num_non_terminal_states, max_num_actions),
        1.0 / max_num_actions)

    # A real copy, so later in-place policy updates do not leak into it.
    old_policy = np.copy(policy)

    return policy, old_policy

## Create algorithm

In [10]:
def policy_evaluation(
        num_non_terminal_states,
        sp_idx,
        p,
        r,
        policy,
        convergence_threshold,
        gamma,
        maximum_num_policy_evaluations,
        v,
        q):
    """Evaluates current policy.

    Sweeps over the non terminal states, updating v and q in place, until
    the largest change to v within one sweep drops below
    convergence_threshold or maximum_num_policy_evaluations sweeps have run.

    Args:
        num_non_terminal_states: int, number of non terminal states.
        sp_idx: array[int], state indices of new state s' of taking action a
            from state s, shape (states, actions, successors).
        p: array[float], transition probability to go from state s to s' by
            taking action a, same shape as sp_idx.
        r: array[float], reward from new state s' from state s by taking
            action a, same shape as sp_idx.
        policy: array[float], learned stochastic policy of which action a to
            take in state s.
        convergence_threshold: float, minimum maximum change across all value
            function updates.
        gamma: float, 0 <= gamma <= 1, amount to discount future reward.
        maximum_num_policy_evaluations: int, max number of iterations.
        v: array[float], keeps track of the estimated value of each state V(s).
            Modified in place.
        q: array[float], keeps track of the estimated value of each
            state-action pair Q(s, a). Modified in place.
    Returns:
        v: array, estimate of state value function V(s).
        q: array, estimate of state-action value function Q(s, a).
    """
    # Start above any threshold so the first sweep is always attempted.
    delta = np.inf
    num_policy_evaluations = 0

    while (delta >= convergence_threshold and
           num_policy_evaluations < maximum_num_policy_evaluations):
        # Delta measures the largest change within THIS sweep. Without the
        # reset it could never shrink and the convergence check would never
        # end the loop.
        delta = 0.0

        for i in range(num_non_terminal_states):
            # Cache state-value function for state i
            previous_v = v[i]

            # Expected return of each action: sum over successor states of
            # p * (r + gamma * V(s')). A self-loop reads v[i] before it is
            # overwritten below, so it needs no special handling.
            q[i, :] = np.sum(
                p[i, :, :] * (r[i, :, :] + gamma * v[sp_idx[i, :, :]]),
                axis=-1)

            # Update state value function based on current policy
            v[i] = np.sum(policy[i, :] * q[i, :])

            # Track the largest change seen in this sweep
            delta = max(delta, abs(previous_v - v[i]))

        num_policy_evaluations += 1

    return v, q

In [11]:
def policy_improvement(
        num_non_terminal_states,
        sp_idx,
        p,
        r,
        policy,
        old_policy,
        gamma,
        v):
    """Improves policy greedily based on new value function estimates.

    For every non terminal state the action values are recomputed from v and
    the policy row is replaced, in place, by a distribution that spreads
    probability equally over the highest-valued actions.

    Args:
        num_non_terminal_states: int, number of non terminal states.
        sp_idx: array[int], state indices of new state s' of taking action a
            from state s, shape (states, actions, successors).
        p: array[float], transition probability to go from state s to s' by
            taking action a, same shape as sp_idx.
        r: array[float], reward from new state s' from state s by taking
            action a, same shape as sp_idx.
        policy: array[float], learned stochastic policy of which action a to
            take in state s. Modified in place.
        old_policy: array[float], previously learned stochastic policy of which
            action a to take in state s. Kept for interface compatibility;
            it is neither read nor modified here.
        gamma: float, 0 <= gamma <= 1, amount to discount future reward.
        v: array[float], keeps track of the estimated value of each state V(s).
    Returns:
        policy_stable: bool, True only if no state's policy row changed.
        policy: array, learned stochastic policy of which action a to take in
            state s.
    """
    # Stable until some state proves otherwise. This also gives a defined
    # result when there are no states to visit.
    policy_stable = True

    for i in range(num_non_terminal_states):
        # Cache this state's row for comparison later
        previous_row = np.copy(policy[i, :])

        # Expected return of each action: sum over successor states of
        # p * (r + gamma * V(s')).
        action_values = np.sum(
            p[i, :, :] * (r[i, :, :] + gamma * v[sp_idx[i, :, :]]),
            axis=-1)

        # Find the actions tied for the best value
        is_greedy = action_values == np.max(action_values)

        # Apportion policy probability across ties equally for state-action
        # pairs that have the same value and zero otherwise
        policy[i, :] = np.where(
            is_greedy,
            1.0 / np.count_nonzero(is_greedy),
            0.0)

        # A change in ANY state makes the policy unstable, so the flag must
        # accumulate instead of being overwritten by the last state.
        if not np.array_equal(policy[i, :], previous_row):
            policy_stable = False

    return policy_stable, policy

In [12]:
def policy_iteration(
        num_non_terminal_states,
        sp_idx,
        p,
        r,
        policy,
        old_policy,
        convergence_threshold,
        gamma,
        maximum_num_policy_evaluations,
        v,
        q,
        maximum_num_sweeps):
    """Alternates policy evaluation and policy improvement.

    Before each sweep the current value functions and policy are printed.
    Sweeps continue until the improvement step reports a stable policy or
    maximum_num_sweeps sweeps have been performed.

    Args:
        num_non_terminal_states: int, number of non terminal states.
        sp_idx: array[int], state indices of new state s' of taking action a
            from state s.
        p: array[float], transition probability to go from state s to s' by
            taking action a.
        r: array[float], reward from new state s' from state s by taking
            action a.
        policy: array[float], learned stochastic policy of which action a to
            take in state s.
        old_policy: array[float], previously learned stochastic policy of which
            action a to take in state s.
        convergence_threshold: float, minimum maximum change across all value
            function updates.
        gamma: float, 0 <= gamma <= 1, amount to discount future reward.
        maximum_num_policy_evaluations: int, max number of iterations.
        v: array[float], keeps track of the estimated value of each state V(s).
        q: array[float], keeps track of the estimated value of each
            state-action pair Q(s, a).
        maximum_num_sweeps: int, max number of outer loop sweeps.
    Returns:
        v: array, estimate of state value function V(s).
        q: array, estimate of state-action value function Q(s, a).
        policy: array, learned stochastic policy of which action a to take in
            state s.
    """
    def _report(heading, values):
        # Heading, then the array, then a blank-line separator.
        print(heading)
        print(values)
        print("\n")

    num_sweeps = 0

    while num_sweeps < maximum_num_sweeps:
        _report(
            "State value function before sweep {}".format(num_sweeps), v)
        _report(
            "State-action value function before sweep {}".format(num_sweeps),
            q)
        _report(
            "Policy before sweep {}".format(num_sweeps), policy)

        # Evaluation stage: estimate v and q under the current policy.
        v, q = policy_evaluation(
            num_non_terminal_states,
            sp_idx,
            p,
            r,
            policy,
            convergence_threshold,
            gamma,
            maximum_num_policy_evaluations,
            v,
            q)

        # Improvement stage: make the policy greedy with respect to v.
        policy_stable, policy = policy_improvement(
            num_non_terminal_states,
            sp_idx,
            p,
            r,
            policy,
            old_policy,
            gamma,
            v)

        print("policy_stable = {} at sweep {}\n".format(
            policy_stable, num_sweeps))

        num_sweeps += 1

        # Nothing changed during improvement, so iterating further is moot.
        if policy_stable:
            break

    return v, q, policy

## Run algorithm

In [13]:
def run_algorithm():
    """Builds the environment, runs policy iteration, and prints the result."""
    # The environment tuple is ordered: state counts (3), action info (2),
    # successor counts (1), then the successor arrays (3).
    environment = create_environment()
    num_states, _, num_non_terminal_states, max_num_actions = environment[:4]
    sp_idx, p, r = environment[6:]

    hyperparameters = set_hyperparameters()
    gamma, convergence_threshold = hyperparameters[:2]
    maximum_num_sweeps, maximum_num_policy_evaluations = hyperparameters[2:]

    v, q = create_value_function_arrays(
        num_states, num_non_terminal_states, max_num_actions)
    policy, old_policy = create_policy_arrays(
        num_non_terminal_states, max_num_actions)

    # Run policy iteration
    v, q, policy = policy_iteration(
        num_non_terminal_states,
        sp_idx,
        p,
        r,
        policy,
        old_policy,
        convergence_threshold,
        gamma,
        maximum_num_policy_evaluations,
        v,
        q,
        maximum_num_sweeps)

    # Print final results
    final_results = (
        ("\nFinal state value function", v),
        ("\nFinal state-action value function", q),
        ("\nFinal policy", policy),
    )
    for heading, values in final_results:
        print(heading)
        print(values)

In [14]:
# Create the environment, run policy iteration, and print the per-sweep and
# final value functions and policy.
run_algorithm()

State value function before sweep 0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


State-action value function before sweep 0
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


Policy before sweep 0
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]


policy_stable = False at sweep 0

State value function before sweep 1
[-11.42591538 -16.29940807 -17.92681232 -11.42591538 -14.84125831
 -16.57033147 -16.60954761 -16.29940807 -16.57033147 -15.10598674
 -11.83929409 -17.92681232 -16.60954761 -11.83929409   0.
   0.        ]


State-action value function befo