In [1]:
from rlgridworld.standard_grid import create_standard_grid
from rlgridworld.algorithms import iterative_policy_evaluation

In [2]:
# Build the grid world object that every later cell in this notebook operates on
gw = create_standard_grid()

In [3]:
# Hand-written starting policy: each key is a (row, col) state and each value is
# the name of the action taken there. The empty string is used for states that
# are given no action.
# In the rendered output of gw.print_policy, the highest row index is drawn on
# the top line, so row 2 below appears first in the printed grid.
policy = {
    (0, 0): 'up',    (0, 1): 'right', (0, 2): 'right', (0, 3): 'up',
    (1, 0): 'up',    (1, 1): '',      (1, 2): 'right', (1, 3): '',
    (2, 0): 'right', (2, 1): 'right', (2, 2): 'right', (2, 3): '',
}
print("Input Policy")
gw.print_policy(policy)

Input Policy
-------------------------------------
|  Right |  Right |  Right |        |
-------------------------------------
|     Up |        |  Right |        |
-------------------------------------
|     Up |  Right |  Right |     Up |
-------------------------------------


Perform iterative policy evaluation to compute values for the policy

In [4]:
# Evaluate `policy` on the grid with discount factor 0.9. The return value is not
# captured; the next cell reads the result through gw.print_values(), so the
# values are presumably written onto `gw` in place -- confirm in rlgridworld.
iterative_policy_evaluation(gw, policy, gamma = 0.9)

In [5]:
# Display the state values currently held by the grid object
print("Values for the input policy")
gw.print_values()

Values for the input policy
-------------------------------------
|   0.81 |   0.90 |   1.00 |   0.00 |
-------------------------------------
|   0.73 |   0.00 |  -1.00 |   0.00 |
-------------------------------------
|   0.66 |  -0.81 |  -0.90 |  -1.00 |
-------------------------------------


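The function below takes the values stored in the grid and finds the corresponding greedy policy: in each state it picks the action that maximizes reward + gamma * (value of the destination state).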

In [6]:
def compute_policy_from_values(gw, gamma=0.9):
    """Derive the greedy policy from the values currently stored in the grid.

    For every state that is neither a barrier nor terminal, each valid action
    is scored as ``reward + gamma * value_at_destination`` and the action with
    the highest score is selected. Ties go to the action encountered first,
    because a later action only replaces the current best when it is strictly
    better.

    Parameters
    ----------
    gw : grid world object
        Must provide ``M``, ``N``, ``is_barrier(state)``, ``is_terminal(state)``,
        ``valid_decisions_and_rewards(state)``,
        ``get_reward_for_action(state, action)`` and
        ``get_value_at_destination(state, action)``.
    gamma : float, optional
        Discount factor applied to the value of the destination state.

    Returns
    -------
    dict
        Maps every state ``(i, j)`` with ``i in range(gw.M)`` and
        ``j in range(gw.N)`` to the chosen action. Barrier states, terminal
        states, and states for which no action could be selected map to the
        empty string ``''``.
    """
    policy = {}
    # loop over all states
    for i in range(gw.M):
        for j in range(gw.N):
            state = (i, j)
            # barrier and terminal states have no actions, so they get the
            # 'no action' marker and are not scored
            if gw.is_barrier(state) or gw.is_terminal(state):
                policy[state] = ''
                continue
            # Start from the 'no action' marker rather than None so that every
            # entry in the returned policy is a string, even if the state turns
            # out to have no valid decisions.
            best_action = ''
            best_value = float('-inf')
            # The mapping returned here is only used for its keys (the valid
            # actions); the reward is looked up per action below.
            dr = gw.valid_decisions_and_rewards(state)
            for action in dr:
                # reward for taking this action from the current state
                reward = gw.get_reward_for_action(state, action)
                # value of the state this action leads to
                value_at_dest = gw.get_value_at_destination(state, action)
                # candidate value of taking this action
                value = reward + gamma * value_at_dest
                # strict comparison: the first action seen wins ties
                if value > best_value:
                    best_value = value
                    best_action = action
            # record the best action found for this state
            policy[state] = best_action
    return policy

Find the new policy given the updated values.

In [7]:
# Derive a policy from the values now held by the grid; gamma is left at its
# default of 0.9, the same discount passed to iterative_policy_evaluation earlier
new_policy = compute_policy_from_values(gw)

In [8]:
# Print the hand-written policy and the derived policy one after the other so
# the states whose action changed can be compared visually
print("Original Policy")
gw.print_policy(policy)
print("")
print("New Policy")
gw.print_policy(new_policy)

Original Policy
-------------------------------------
|  Right |  Right |  Right |        |
-------------------------------------
|     Up |        |  Right |        |
-------------------------------------
|     Up |  Right |  Right |     Up |
-------------------------------------

New Policy
-------------------------------------
|  Right |  Right |  Right |        |
-------------------------------------
|     Up |        |     Up |        |
-------------------------------------
|     Up |   Left |   Left |   Left |
-------------------------------------
