# Policy Evaluation

This is a full implementation of the policy-evaluation algorithm. All we need is the policy we’re trying to evaluate, the MDP, the discount factor gamma (which defaults to 1), and theta (a small threshold that we use to check for convergence).

In [1]:
def policy_evaluation(pi, P, gamma=1.0, theta=1e-10):
    """Iteratively estimate the state-value function of policy `pi`.

    Args:
        pi: callable taking a state index and returning the action to take.
        P: transition table indexed as P[state][action]; each entry is an
            iterable of (probability, next_state, reward, done) tuples.
            States are visited as 0 .. len(P) - 1.
        gamma: discount factor applied to the value of the next state.
        theta: convergence threshold; sweeping stops once the largest
            absolute change between two consecutive sweeps is below it.

    Returns:
        A NumPy array of length len(P) holding the estimates produced by
        the last sweep.
    """
    n_states = len(P)

    # estimates from the previous sweep; the very first sweep starts at zero
    old_values = np.zeros(n_states)

    while True:
        # estimates being built during the current sweep
        new_values = np.zeros(n_states)

        for state in range(n_states):
            # only the transitions of the action chosen by the policy matter
            chosen_transitions = P[state][pi(state)]
            for p, s_next, r, terminal in chosen_transitions:
                # `not terminal` zeroes the bootstrapped value whenever the
                # transition lands on a terminal state
                new_values[state] += p * (
                    r + gamma * old_values[s_next] * (not terminal))

        # a full sweep that barely moved the estimates means convergence
        largest_change = np.max(np.abs(old_values - new_values))
        if largest_change < theta:
            return new_values

        # keep this sweep's estimates as the baseline for the next sweep
        old_values = new_values.copy()

We can use the implemented algorithm to iteratively evaluate the state-value function for the "go-get-it" and "careful" policies in the frozen-lake environment.

In [2]:
import gym

# Create the environment registered under the id 'FrozenLake-v0'.
env = gym.make('FrozenLake-v0')
# Transition table taken from the object wrapped by `env`. The rest of this
# notebook indexes it as P[state][action] and unpacks every entry as
# (probability, next_state, reward, done).
P = env.env.P
# Reset the environment and keep the state it returns.
init_state = env.reset()
# State index that probability_success counts as a success.
# NOTE(review): presumably the bottom-right cell of the 4x4 grid - confirm.
goal_state = 15

In [3]:
# Action indices, in the order left, down, right, up.
LEFT, DOWN, RIGHT, UP = range(4)

# State -> action tables. They are built once here instead of being rebuilt
# on every policy call, which is what the previous lambda versions did.
_GO_GET_ACTIONS = {
    0: RIGHT, 1: RIGHT, 2: DOWN, 3: LEFT,
    4: DOWN, 5: LEFT, 6: DOWN, 7: LEFT,
    8: RIGHT, 9: RIGHT, 10: DOWN, 11: LEFT,
    12: LEFT, 13: RIGHT, 14: RIGHT, 15: LEFT,
}

_CAREFUL_ACTIONS = {
    0: LEFT, 1: UP, 2: UP, 3: UP,
    4: LEFT, 5: LEFT, 6: UP, 7: LEFT,
    8: UP, 9: DOWN, 10: LEFT, 11: LEFT,
    12: LEFT, 13: RIGHT, 14: RIGHT, 15: LEFT,
}


def go_get_pi(s):
    """Return the action the "go-get-it" policy takes in state `s`.

    Raises:
        KeyError: if `s` is not one of the states 0-15.
    """
    return _GO_GET_ACTIONS[s]


def careful_pi(s):
    """Return the action the "careful" policy takes in state `s`.

    Raises:
        KeyError: if `s` is not one of the states 0-15.
    """
    return _CAREFUL_ACTIONS[s]

We can prepare some functions to show a policy and to estimate its probability of success and its mean return using simulation.

In [4]:
import numpy as np

def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='Policy:'):
    """Print the action chosen by `pi` in every state, laid out as a grid.

    Args:
        pi: callable taking a state index and returning an action index.
            It is called for every state, including the ones printed blank.
        P: transition table indexed as P[state][action]; each entry is an
            iterable of (probability, next_state, reward, done) tuples.
        action_symbols: symbol printed for each action index.
        n_cols: number of cells printed before the row is closed with "|".
        title: line printed before the grid.

    A state whose transitions all carry a truthy done flag, for every
    action, is printed as an empty cell.
    """
    print(title)
    symbol_of = dict(enumerate(action_symbols))
    for state in range(len(P)):
        action = pi(state)
        print("| ", end="")
        done_flags = [
            flag
            for transitions in P[state].values()
            for _, _, _, flag in transitions
        ]
        if np.all(done_flags):
            # nothing to show for a state that can only lead to termination
            print("".rjust(9), end=" ")
        else:
            state_label = str(state).zfill(2)
            print(state_label, symbol_of[action].rjust(6), end=" ")
        closes_row = (state + 1) % n_cols == 0
        if closes_row:
            print("|")

In [5]:
import random

def probability_success(env, pi, goal_state, n_episodes=100, max_steps=200):
    """Estimate by simulation how often following `pi` ends in `goal_state`.

    Args:
        env: environment offering seed(), reset() and step(action); step
            must return a 4-tuple whose first item is the next state and
            whose third item is the done flag.
        pi: callable taking a state and returning the action to take.
        goal_state: state compared against the last state of each episode.
        n_episodes: number of episodes to simulate.
        max_steps: maximum number of steps taken in one episode.

    Returns:
        The fraction of episodes whose last state equals `goal_state`.
    """
    # fixed seeds so repeated calls simulate the same episodes
    random.seed(123)
    np.random.seed(123)
    env.seed(123)

    reached_goal = []
    for _ in range(n_episodes):
        state = env.reset()
        finished = False
        n_steps = 0
        while not finished and n_steps < max_steps:
            state, _, finished, _info = env.step(pi(state))
            n_steps += 1
        reached_goal.append(state == goal_state)
    return np.sum(reached_goal) / len(reached_goal)

In [6]:
def mean_return(env, pi, n_episodes=100, max_steps=200):
    """Estimate by simulation the average undiscounted return of `pi`.

    Args:
        env: environment offering seed(), reset() and step(action); step
            must return a 4-tuple of (state, reward, done, extra).
        pi: callable taking a state and returning the action to take.
        n_episodes: number of episodes to simulate.
        max_steps: maximum number of steps taken in one episode.

    Returns:
        The mean, over all episodes, of the sum of rewards collected.
    """
    # fixed seeds so repeated calls simulate the same episodes
    random.seed(123)
    np.random.seed(123)
    env.seed(123)

    episode_returns = []
    for _ in range(n_episodes):
        state, finished, n_steps = env.reset(), False, 0
        collected = 0.0
        while not finished and n_steps < max_steps:
            state, reward, finished, _ = env.step(pi(state))
            collected += reward
            n_steps += 1
        episode_returns.append(collected)
    return np.mean(episode_returns)

First, we can show the results for the "go-get-it" policy:

In [7]:
# Show the action the "go-get-it" policy picks in each state.
print_policy(pi=go_get_pi, P=P, title='go-get-it policy')

# Simulation-based estimates; the success rate is scaled to a percentage.
ps = probability_success(env, go_get_pi, goal_state=goal_state)*100
mr = mean_return(env, go_get_pi)

print('Reaches goal {:.2f}%. Obtains an average undiscounted return of {:.4f}.'.format(ps,mr))

go-get-it policy
| 00      > | 01      > | 02      v | 03      < |
| 04      v |           | 06      v |           |
| 08      > | 09      > | 10      v |           |
|           | 13      > | 14      > |           |
Reaches goal 5.00%. Obtains an average undiscounted return of 0.0500.


Now, we can show the "careful" policy:

In [8]:
# Show the action the "careful" policy picks in each state.
print_policy(pi=careful_pi, P=P, title='careful policy')

# Simulation-based estimates; the success rate is scaled to a percentage.
ps = probability_success(env, careful_pi, goal_state=goal_state)*100
mr = mean_return(env, careful_pi)

print('Reaches goal {:.2f}%. Obtains an average undiscounted return of {:.4f}.'.format(ps,mr))

careful policy
| 00      < | 01      ^ | 02      ^ | 03      ^ |
| 04      < |           | 06      ^ |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      > |           |
Reaches goal 52.00%. Obtains an average undiscounted return of 0.5200.


Let’s now run policy evaluation for the two policies and prepare a function to show the result.

In [9]:
def print_state_value_function(V, P, n_cols=4, prec=3, title='State-value function:'):
    """Print the value of every state, laid out as a grid.

    Args:
        V: sequence of state values indexed by state.
        P: transition table indexed as P[state][action]; each entry is an
            iterable of (probability, next_state, reward, done) tuples.
        n_cols: number of cells printed before the row is closed with "|".
        prec: number of decimals the values are rounded to.
        title: line printed before the grid.

    A state whose transitions all carry a truthy done flag, for every
    action, is printed as an empty cell.
    """
    print(title)
    for state in range(len(P)):
        value = V[state]
        print("| ", end="")
        done_flags = [
            flag
            for transitions in P[state].values()
            for _, _, _, flag in transitions
        ]
        if np.all(done_flags):
            # nothing to show for a state that can only lead to termination
            print("".rjust(9), end=" ")
        else:
            rounded = np.round(value, prec)
            print(str(state).zfill(2), '{}'.format(rounded).rjust(6), end=" ")
        closes_row = (state + 1) % n_cols == 0
        if closes_row:
            print("|")

In [10]:
# Evaluate the "go-get-it" policy with a 0.99 discount and print the
# estimates rounded to 4 decimals.
V = policy_evaluation(go_get_pi, P, gamma=0.99)
print_state_value_function(V, P, prec=4)

State-value function:
| 00 0.0342 | 01 0.0231 | 02 0.0468 | 03 0.0231 |
| 04 0.0463 |           | 06 0.0957 |           |
| 08  0.094 | 09 0.2386 | 10 0.2901 |           |
|           | 13 0.4329 | 14 0.6404 |           |


In [11]:
# Evaluate the "careful" policy with a 0.99 discount and print the
# estimates rounded to 4 decimals.
V = policy_evaluation(careful_pi, P, gamma=0.99)
print_state_value_function(V, P, prec=4)

State-value function:
| 00 0.4079 | 01 0.3754 | 02 0.3543 | 03 0.3438 |
| 04 0.4203 |           | 06 0.1169 |           |
| 08 0.4454 | 09  0.484 | 10 0.4328 |           |
|           | 13 0.5884 | 14 0.7107 |           |


It seems being a go-get-it policy doesn’t pay well in the frozen-lake environment! Fascinating results, but a question arises: are there any better policies for this environment?