In [168]:
import numpy as np
import tensorflow as tf

# Create environment

In [41]:
# State-space sizes: the total number of states and how many of them are
# terminal; whatever is left over is non-terminal
number_of_states, number_of_terminal_states = 16, 2
number_of_non_terminal_states = number_of_states - number_of_terminal_states

In [42]:
# Size of the action axis used by every per-state-action array in this notebook
max_number_of_actions = 4

In [43]:
# One entry per non-terminal state, each holding max_number_of_actions
number_of_actions_per_non_terminal_state = np.full(
    shape=number_of_non_terminal_states,
    fill_value=max_number_of_actions)

In [44]:
# Flat vector of ones, one entry per (state, action) pair: every pair has a
# single successor state.
# NOTE(review): this is sized with number_of_states, whereas the other
# environment arrays use number_of_non_terminal_states -- confirm intended.
number_of_state_action_successor_states = np.full(
    shape=number_of_states * max_number_of_actions,
    fill_value=1)

In [45]:
# Fold the flat vector into a (state, action) grid.
# NOTE(review): the row count is number_of_states, whereas the other
# environment arrays use number_of_non_terminal_states -- confirm intended.
number_of_state_action_successor_states = (
    number_of_state_action_successor_states.reshape(
        (number_of_states, max_number_of_actions)))

In [46]:
# Successor state index for every (non-terminal state, action) pair, written
# one row per state and one column per action, then flattened to a 1-D vector
# (it is reshaped into its final 3-D layout in a later cell).
state_action_successor_state_indices = np.array(
    object=[
        [1, 0, 14, 4],
        [2, 1, 0, 5],
        [2, 2, 1, 6],
        [4, 14, 3, 7],
        [5, 0, 3, 8],
        [6, 1, 4, 9],
        [6, 2, 5, 10],
        [8, 3, 7, 11],
        [9, 4, 7, 12],
        [10, 5, 8, 13],
        [10, 6, 9, 15],
        [12, 7, 11, 11],
        [13, 8, 11, 12],
        [15, 9, 12, 13],
    ],
    dtype=np.int64).reshape(-1)

In [47]:
# Flat vector of transition probabilities, all 1.0: one entry per
# (non-terminal state, action) pair, each with a single successor state
state_action_successor_state_transition_probabilities = np.full(
    shape=number_of_non_terminal_states * max_number_of_actions,
    fill_value=1.0)

In [48]:
# Flat vector of rewards, all -1.0: one entry per (non-terminal state, action)
# pair, each with a single successor state
state_action_successor_state_rewards = np.full(
    shape=number_of_non_terminal_states * max_number_of_actions,
    fill_value=-1.0)

In [49]:
# Give all three environment arrays the same 3-D layout:
# (non-terminal state, action, successor state), with a successor axis of 1
state_action_successor_state_indices = (
    state_action_successor_state_indices.reshape(
        (number_of_non_terminal_states, max_number_of_actions, 1)))
state_action_successor_state_transition_probabilities = (
    state_action_successor_state_transition_probabilities.reshape(
        (number_of_non_terminal_states, max_number_of_actions, 1)))
state_action_successor_state_rewards = (
    state_action_successor_state_rewards.reshape(
        (number_of_non_terminal_states, max_number_of_actions, 1)))

# Set hyperparameters

In [179]:
# Upper bound on the number of value-estimation sweeps
maximum_number_of_value_estimations = 20
# Sweeps stop once the value change falls below this threshold
convergence_threshold = 1e-3
# Discount applied to successor state values (1.0 means no discounting)
discounting_factor_gamma = 1.0

# Create algorithm

In [180]:
# This function estimates the value functions
def value_estimation(
    number_of_non_terminal_states,
    state_action_successor_state_indices_tensor,
    state_action_successor_state_transition_probabilities,
    state_action_successor_state_rewards_tensor,
    convergence_threshold,
    discounting_factor_gamma,
    maximum_number_of_value_estimations,
    state_value_function_tensor,
    state_action_value_function_tensor):
    """Build the graph ops that repeatedly sweep the value functions.

    Each sweep recomputes, for every non-terminal state, the value of every
    action from the current state values of its successor states and then sets
    the state value to the maximum action value. All states in a sweep read the
    state values produced by the previous sweep.

    Sweeping stops after `maximum_number_of_value_estimations` sweeps, or
    earlier once the largest absolute change of any non-terminal state value
    within a sweep is below `convergence_threshold`.

    Args:
        number_of_non_terminal_states: Number of states to sweep over.
        state_action_successor_state_indices_tensor: Successor state indices,
            indexed by (state, action, successor state).
        state_action_successor_state_transition_probabilities: Transition
            probabilities with the same layout as the indices.
        state_action_successor_state_rewards_tensor: Rewards with the same
            layout as the indices.
        convergence_threshold: Sweeping continues while the largest state
            value change is greater than or equal to this value.
        discounting_factor_gamma: Factor applied to successor state values.
        maximum_number_of_value_estimations: Upper bound on the sweep count.
        state_value_function_tensor: Initial state values, non-terminal states
            first and terminal states last.
        state_action_value_function_tensor: Initial state-action values.

    Returns:
        Tuple of (state value tensor, state-action value tensor) after the
        final sweep.

    Note:
        The successor-state axis must have size 1 because it is removed with
        `tf.squeeze`. The number of terminal state values appended after each
        sweep is read from the module-level `number_of_terminal_states`.
    """
    # Use the argument itself; the body must not depend on a like-named
    # module-level tensor happening to exist.
    transition_probabilities_tensor = (
        state_action_successor_state_transition_probabilities)

    # Start at the largest float so the first sweep always runs
    delta = np.finfo(np.float64).max
    number_of_value_estimations = 0

    def while_loop_condition(
        delta,
        number_of_value_estimations,
        state_value_function_tensor,
        state_action_value_function_tensor):
        return tf.logical_and(
            x=tf.greater_equal(x=delta, y=convergence_threshold),
            y=tf.less(
                x=number_of_value_estimations,
                y=maximum_number_of_value_estimations))

    def while_loop_body(
        delta,
        number_of_value_estimations,
        state_value_function_tensor,
        state_action_value_function_tensor):
        def value_non_terminal_state_for_loop(state_index):
            # Cache state-value function for state state_index
            previous_state_value = tf.gather(
                params=state_value_function_tensor, indices=state_index)

            # Gather state action successor state slices
            successor_state_indices = tf.gather(
                params=state_action_successor_state_indices_tensor,
                indices=state_index)
            successor_state_probabilities = tf.gather(
                params=transition_probabilities_tensor,
                indices=state_index)
            successor_state_rewards = tf.gather(
                params=state_action_successor_state_rewards_tensor,
                indices=state_index)

            # Update state-action value function based on successor states,
            # transition probabilities, and rewards. A successor equal to the
            # current state needs no special case: gathering its value yields
            # the same cached value.
            successor_state_values = tf.gather(
                params=state_value_function_tensor,
                indices=successor_state_indices)
            state_action_values = tf.squeeze(
                input=successor_state_probabilities * (
                    successor_state_rewards
                    + discounting_factor_gamma * successor_state_values),
                axis=1)

            # Update state-value function
            updated_state_value = tf.reduce_max(
                input_tensor=state_action_values)

            # Change of this state's value, used for the convergence criteria
            state_value_change = tf.abs(
                x=previous_state_value - updated_state_value)

            return (updated_state_value,
                    state_action_values,
                    state_value_change)

        # Replace non-terminal state for loop with map function
        (non_terminal_state_values,
         updated_state_action_values,
         state_value_changes) = tf.map_fn(
            fn=value_non_terminal_state_for_loop,
            elems=tf.range(number_of_non_terminal_states),
            dtype=(tf.float64, tf.float64, tf.float64))

        # Largest change in this sweep; carried by the loop so that the
        # convergence threshold can actually end it
        delta = tf.reduce_max(input_tensor=state_value_changes)

        # Concat terminal state values back onto state value function
        updated_state_values = tf.concat(
            values=[non_terminal_state_values,
                    tf.constant(
                        value=0.0,
                        shape=[number_of_terminal_states],
                        dtype=tf.float64)],
            axis=0)

        return (delta,
                number_of_value_estimations + 1,
                updated_state_values,
                updated_state_action_values)

    (delta,
     number_of_value_estimations,
     state_value_function_tensor,
     state_action_value_function_tensor) = tf.while_loop(
        cond=while_loop_condition,
        body=while_loop_body,
        loop_vars=[delta,
                   number_of_value_estimations,
                   state_value_function_tensor,
                   state_action_value_function_tensor])

    return state_value_function_tensor, state_action_value_function_tensor

In [181]:
# This function greedily selects the policy based on the current value function
def greedy_policy_selection(
    state_action_successor_state_indices_tensor,
    state_action_successor_state_transition_probabilities_tensor,
    state_action_successor_state_rewards_tensor,
    policy_tensor,
    discounting_factor_gamma,
    state_value_function_tensor):
    """Build the graph ops for a policy that is greedy in the state values.

    For each non-terminal state the value of every action is computed from the
    successor state values, and the probability mass is split equally among
    the actions that share the maximum value; all other actions get zero.

    Args:
        state_action_successor_state_indices_tensor: Successor state indices,
            indexed by (state, action, successor state).
        state_action_successor_state_transition_probabilities_tensor:
            Transition probabilities with the same layout as the indices.
        state_action_successor_state_rewards_tensor: Rewards with the same
            layout as the indices.
        policy_tensor: Accepted for interface compatibility; the returned
            policy is computed without reading it.
        discounting_factor_gamma: Factor applied to successor state values.
        state_value_function_tensor: State values used to score the actions.

    Returns:
        Tensor with one row of action probabilities per non-terminal state.

    Note:
        The row count and row width are read from the module-level
        `number_of_non_terminal_states` and `max_number_of_actions`.
    """
    def greedy_action_probabilities(state_index):
        # Per-state slices of the environment description
        successor_indices = tf.gather(
            params=state_action_successor_state_indices_tensor,
            indices=state_index)
        probabilities = tf.gather(
            params=state_action_successor_state_transition_probabilities_tensor,
            indices=state_index)
        rewards = tf.gather(
            params=state_action_successor_state_rewards_tensor,
            indices=state_index)

        # Score every action from the values of its successor states
        successor_values = tf.gather(
            params=state_value_function_tensor, indices=successor_indices)
        action_values = tf.squeeze(
            input=probabilities * (
                rewards + discounting_factor_gamma * successor_values),
            axis=1)

        # Find which actions reach the best value and how many of them tie
        best_action_value = tf.reduce_max(input_tensor=action_values)
        is_best_action = tf.equal(x=action_values, y=best_action_value)
        number_of_best_actions = tf.count_nonzero(input_tensor=is_best_action)

        # Equal share for every tied best action, zero for the rest
        tied_share = tf.fill(
            dims=[max_number_of_actions],
            value=1.0 / tf.cast(number_of_best_actions, dtype=tf.float64))
        zero_share = tf.cast(
            tf.fill(dims=[max_number_of_actions], value=0.0),
            dtype=tf.float64)

        return tf.where(condition=is_best_action, x=tied_share, y=zero_share)

    # One row of action probabilities per non-terminal state
    return tf.map_fn(
        fn=greedy_action_probabilities,
        elems=tf.range(number_of_non_terminal_states),
        dtype=tf.float64)

In [182]:
def value_iteration(
    number_of_non_terminal_states,
    state_action_successor_state_indices_tensor,
    state_action_successor_state_transition_probabilities_tensor,
    state_action_successor_state_rewards_tensor,
    policy_tensor,
    convergence_threshold,
    discounting_factor_gamma,
    maximum_number_of_value_iterations,
    state_value_function_tensor,
    state_action_value_function_tensor):
    """Build the value-iteration graph: value estimation, then a greedy policy.

    Args:
        number_of_non_terminal_states: Number of non-terminal states.
        state_action_successor_state_indices_tensor: Successor state indices
            for every state-action pair.
        state_action_successor_state_transition_probabilities_tensor:
            Transition probabilities for every state-action pair.
        state_action_successor_state_rewards_tensor: Rewards for every
            state-action pair.
        policy_tensor: Initial policy, forwarded to greedy_policy_selection.
        convergence_threshold: Forwarded to value_estimation.
        discounting_factor_gamma: Factor applied to successor state values.
        maximum_number_of_value_iterations: Forwarded to value_estimation as
            its maximum number of value estimations.
        state_value_function_tensor: Initial state values.
        state_action_value_function_tensor: Initial state-action values.

    Returns:
        Tuple of (state value tensor, state-action value tensor, policy
        tensor).
    """
    # Value estimation. The rewards passed on are this function's own
    # argument, not a module-level array of a similar name.
    state_value_function_tensor, state_action_value_function_tensor = value_estimation(
        number_of_non_terminal_states,
        state_action_successor_state_indices_tensor,
        state_action_successor_state_transition_probabilities_tensor,
        state_action_successor_state_rewards_tensor,
        convergence_threshold,
        discounting_factor_gamma,
        maximum_number_of_value_iterations,
        state_value_function_tensor,
        state_action_value_function_tensor)

    # Greedy policy selection
    policy_tensor = greedy_policy_selection(
        state_action_successor_state_indices_tensor,
        state_action_successor_state_transition_probabilities_tensor,
        state_action_successor_state_rewards_tensor,
        policy_tensor,
        discounting_factor_gamma,
        state_value_function_tensor)

    return (state_value_function_tensor,
            state_action_value_function_tensor,
            policy_tensor)

# Run algorithm

In [183]:
# Size of the successor-state axis, taken from the environment array so that
# the placeholder shapes below always match the data that is fed in
max_number_of_state_action_successor_states = (
    state_action_successor_state_indices.shape[-1])

with tf.Session() as sess:
    # Read in environment
    state_action_successor_state_indices_tensor = tf.placeholder(
        dtype=tf.int64,
        shape=[number_of_non_terminal_states,
               max_number_of_actions,
               max_number_of_state_action_successor_states])
    state_action_successor_state_transition_probabilities_tensor = tf.placeholder(
        dtype=tf.float64,
        shape=[number_of_non_terminal_states,
               max_number_of_actions,
               max_number_of_state_action_successor_states])
    state_action_successor_state_rewards_tensor = tf.placeholder(
        dtype=tf.float64,
        shape=[number_of_non_terminal_states,
               max_number_of_actions,
               max_number_of_state_action_successor_states])

    # Create value functions, all starting at zero
    state_value_function_tensor = tf.zeros(
        shape=number_of_states, dtype=tf.float64)
    state_action_value_function_tensor = tf.zeros(
        shape=[number_of_non_terminal_states, max_number_of_actions],
        dtype=tf.float64)

    # Create policy: equal probability for every action in every state
    policy_tensor = tf.tile(
        input=[tf.constant(
            value=1.0 / max_number_of_actions, dtype=tf.float64)],
        multiples=[number_of_non_terminal_states * max_number_of_actions])
    policy_tensor = tf.reshape(
        tensor=policy_tensor,
        shape=[number_of_non_terminal_states, max_number_of_actions])

    # Create algorithm
    algorithm = value_iteration(
        number_of_non_terminal_states,
        state_action_successor_state_indices_tensor,
        state_action_successor_state_transition_probabilities_tensor,
        state_action_successor_state_rewards_tensor,
        policy_tensor,
        convergence_threshold,
        discounting_factor_gamma,
        maximum_number_of_value_estimations,
        state_value_function_tensor,
        state_action_value_function_tensor)

    # Run graph, feeding the NumPy environment arrays into the placeholders
    (state_value_function,
     state_action_value_function,
     policy) = sess.run(
        fetches=algorithm,
        feed_dict={
            state_action_successor_state_indices_tensor: state_action_successor_state_indices,
            state_action_successor_state_transition_probabilities_tensor: state_action_successor_state_transition_probabilities,
            state_action_successor_state_rewards_tensor: state_action_successor_state_rewards
        }
    )

print("\nFinal state value function")
print(state_value_function)
print("\nFinal state-action value function")
print(state_action_value_function)
print("\nFinal policy")
print(policy)


Final state value function
[-1. -2. -3. -1. -2. -3. -2. -2. -3. -2. -1. -3. -2. -1.  0.  0.]

Final state-action value function
[[-3. -2. -1. -3.]
 [-4. -3. -2. -4.]
 [-4. -4. -3. -3.]
 [-3. -1. -2. -3.]
 [-4. -2. -2. -4.]
 [-3. -3. -3. -3.]
 [-3. -4. -4. -2.]
 [-4. -2. -3. -4.]
 [-3. -3. -3. -3.]
 [-2. -4. -4. -2.]
 [-2. -3. -3. -1.]
 [-3. -3. -4. -4.]
 [-2. -4. -4. -3.]
 [-1. -3. -3. -2.]]

Final policy
[[0.   0.   1.   0.  ]
 [0.   0.   1.   0.  ]
 [0.   0.   0.5  0.5 ]
 [0.   1.   0.   0.  ]
 [0.   0.5  0.5  0.  ]
 [0.25 0.25 0.25 0.25]
 [0.   0.   0.   1.  ]
 [0.   1.   0.   0.  ]
 [0.25 0.25 0.25 0.25]
 [0.5  0.   0.   0.5 ]
 [0.   0.   0.   1.  ]
 [0.5  0.5  0.   0.  ]
 [1.   0.   0.   0.  ]
 [1.   0.   0.   0.  ]]
