In [None]:
from IPython.core.debugger import set_trace
import numpy as np
import pprint
import sys
if "../" not in sys.path:
    sys.path.append("../")
from lib.envs.gridworld import GridworldEnv

In [None]:
# Gridworld environment built with its default constructor arguments.
env = GridworldEnv()
# Pretty-printer configured with a two-space indent.
pp = pprint.PrettyPrinter(indent=2)

In [None]:
def Value_Iteration(env, theta=0.0001, discount_factor=1.0):
    """
    Run value iteration, then read off a greedy deterministic policy.

    Each sweep visits every state in index order and overwrites its value
    with the best expected one-step return over all actions. Updates are
    applied in place, so later states in a sweep already see the new values
    of earlier states. Sweeping stops after the first sweep in which the
    largest per-state change is below theta.

    Args:
        env: Environment description with the following attributes:
            env.nS: number of states.
            env.nA: number of actions.
            env.P[s][a]: list of transition tuples
                (prob, next_state, reward, done) for action a in state s.
        theta: Convergence threshold on the largest absolute value change
            observed during a sweep.
        discount_factor: Weight applied to the value of the next state in
            every one-step backup.

    Returns:
        A tuple (policy, V). policy is an array of shape [env.nS, env.nA]
        holding 1.0 at the greedy action of each state and 0.0 elsewhere;
        V is a vector of length env.nS with the final value estimates.
    """

    def action_values(state, estimates):
        """
        Compute the expected one-step return of every action in a state.

        Args:
            state: Index of the state being evaluated (int).
            estimates: Current value estimates, vector of length env.nS.

        Returns:
            A vector of length env.nA with one expected return per action.
        """
        q = np.zeros(env.nA)
        for action in range(env.nA):
            # The `done` flag of a transition plays no role in the backup.
            for prob, successor, reward, _ in env.P[state][action]:
                q[action] += prob * (reward + discount_factor * estimates[successor])
        return q

    V = np.zeros(env.nS)
    converged = False
    while not converged:
        largest_change = 0
        for state in range(env.nS):
            best_value = action_values(state, V).max()
            change = np.abs(best_value - V[state])
            if change > largest_change:
                largest_change = change
            # In-place update: the remaining states of this sweep use it.
            V[state] = best_value
        converged = largest_change < theta

    # One-hot encode the greedy action of every state (first index on ties).
    greedy = np.array(
        [np.argmax(action_values(state, V)) for state in range(env.nS)],
        dtype=int,
    )
    policy = np.zeros([env.nS, env.nA])
    policy[np.arange(env.nS), greedy] = 1.0

    return policy, V

