#### Iterative policy evaluation - dynamic programming on a Markov decision process (MDP)

In [2]:
import gym
import lib
import numpy as np

In [3]:
# Discount factor: multiplies the value of the successor state in the update.
GAMMA = 1.0
# Convergence threshold: policy evaluation stops once the largest value change
# seen in a full sweep over the states is smaller than this.
THETA = 1e-5

In [34]:
def policy_evaluation(policy, env, gamma=None, theta=None):
    """Compute the state values of a policy by iterative policy evaluation.

    All state values start at zero. The states are then swept repeatedly and
    each value is replaced, in place, by its expected one-step backup:

        V(s) = sum_a pi(a|s) * sum_{s', r} p(s', r | s, a) * (r + gamma * V(s'))

    Values updated earlier in a sweep are reused later in the same sweep.
    Sweeping stops once the largest absolute change in a sweep is below
    ``theta``. There is no cap on the number of sweeps.

    Args:
        policy: Indexable by state; ``policy[state]`` is iterated as the
            probabilities of taking each action, in action-index order.
        env: Environment exposing ``nS`` (number of states) and ``P``, where
            ``P[state][action]`` is an iterable of 4-tuples
            ``(probability, next_state, reward, flag)``. The fourth element
            is ignored here (presumably gym's ``done`` flag).
        gamma: Discount factor. ``None`` (default) uses the module-level
            ``GAMMA``.
        theta: Convergence threshold. ``None`` (default) uses the
            module-level ``THETA``.

    Returns:
        1-D numpy array of length ``env.nS`` holding the value of each state.
    """
    # Resolve the defaults at call time so existing two-argument callers keep
    # using the notebook-wide constants.
    if gamma is None:
        gamma = GAMMA
    if theta is None:
        theta = THETA

    num_states = env.nS
    transitions = env.P

    V = np.zeros(num_states)  # state values, initialised to zero

    while True:
        delta = 0
        for state in range(num_states):
            # Expectation over actions (weighted by the policy) and over
            # the (next state, reward) outcomes of each action.
            new_value = sum(
                p_action * probability * (reward + gamma * V[next_state])
                for action, p_action in enumerate(policy[state])
                for probability, next_state, reward, _ in transitions[state][action]
            )
            delta = max(delta, np.abs(new_value - V[state]))
            V[state] = new_value
        if delta < theta:
            break
    return V
            

In [14]:
# Build the 'FrozenLake-v0' environment through gym's factory function.
env = gym.make('FrozenLake-v0')

In [18]:
# Rebind `env` to its inner `.env` attribute - presumably this unwraps gym's
# wrapper so that nS, nA and P are directly accessible; TODO confirm.
env = env.env

In [23]:
# Uniform random policy: one row per state, one column per action, with every
# action equally likely in every state.
random_policy = np.full((env.nS, env.nA), 1.0 / env.nA)

In [24]:
# Show the policy matrix as the cell output.
random_policy

array([[0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25]])

In [35]:
# Evaluate the uniform random policy on the environment; v_k receives the
# array of state values returned by policy_evaluation.
v_k = policy_evaluation(random_policy, env)

In [39]:
# Check the shape of the result: a 1-D array with one entry per state.
v_k.shape

(16,)

In [37]:
# Print the estimated value of every state under the random policy.
print(v_k)

[0.013911   0.01161424 0.02094062 0.01046758 0.01623478 0.
 0.04074774 0.         0.03479961 0.08816698 0.14205099 0.
 0.         0.17581855 0.4392897  0.        ]
