In [7]:
#Pseudocode:
#Input : policy pi to be evaluated
#Parameter: a small threshold theta > 0 determining accuracy of estimation
#Initialize V(state) for all states, arbitrarily except that V(terminal) = 0
#Loop:
#    delta = 0
#    Loop for each state:
#        v=V(state)
#        V(state) = sum over actions of p(action|state) * sum over (new_state, reward) of p(new_state,reward|state,action) * (reward + gamma * V(new_state))
#        delta = max(delta,| v- V(state)|)
#    until delta < theta 

Exercise reference: https://github.com/dennybritz/reinforcement-learning/blob/master/DP/Policy%20Evaluation.ipynb

In [8]:
import gym
import numpy as np
import sys
# Put the parent directory on the import path (once) before the project-local
# import below; presumably that is where the `lib` package lives -- confirm
# against the repository layout.
if "../" not in sys.path:
    sys.path.append("../") 

from lib.envs.gridworld import GridworldEnv

# Environment instance shared by the cells that follow.
env = GridworldEnv()

In [14]:
def policy_evaluation(policy, env, gamma=0.9, theta=0.00001):
    """Estimate the state-value function of ``policy`` by iterative policy evaluation.

    Sweeps over all states, replacing each state's value with its expected
    one-step return under ``policy``, until the largest change in a sweep is
    smaller than ``theta``. Updates are made in place, so later states in a
    sweep already see the refreshed values of earlier states.

    Args:
        policy: Indexable by state; ``policy[state]`` yields one probability
            per action, in action-index order.
        env: Object exposing ``env.nS`` (number of states) and
            ``env.P[state][action]``, an iterable of
            ``(prob, next_state, reward, done)`` transition tuples.
        gamma: Discount factor applied to the value of the successor state.
        theta: Convergence threshold on the largest per-sweep value change.

    Returns:
        A numpy array of length ``env.nS`` holding the estimated value of
        each state.
    """
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for state in range(env.nS):
            # Expected one-step return from `state` under the policy.
            v = 0
            for action, action_prob in enumerate(policy[state]):
                for prob, next_state, reward, _done in env.P[state][action]:
                    v += action_prob * prob * (reward + gamma * V[next_state])
            delta = max(delta, np.abs(v - V[state]))
            # Store the backup; without this write V never changes and the
            # sweep can neither make progress nor converge.
            V[state] = v
        if delta < theta:
            break
    return np.array(V)


In [None]:
# Uniform random policy: every action is equally likely in every state.
random_policy = np.ones([env.nS, env.nA]) / env.nA
# Evaluate without discounting: the expected values asserted below are the
# undiscounted ones, which the function's default gamma=0.9 does not reproduce.
v = policy_evaluation(random_policy, env, gamma=1.0)

In [None]:
# Test: the evaluated value function should match the known state values,
# listed here one grid row per line.
expected_v = np.array([
    0, -14, -20, -22,
    -14, -18, -20, -20,
    -20, -20, -18, -14,
    -22, -20, -14, 0,
])
np.testing.assert_array_almost_equal(v, expected_v, decimal=2)