#### policy improvement
* loop all states
* on each state
    * save old policy
    * create new policy maximizing current reward + next state reward
* until no policy step changes

In [1]:
import gym
import lib
import numpy as np

In [5]:
GAMMA  = 1
THETA =1e-5

#### policy iteration

In [21]:
def policy_iteration(policy, env):
    num_states    = env.nS
    num_actions   = env.nA
    transitions   = env.P
    policy_stable = True
    
    while policy_stable:
        V = policy_evaluation(policy, env) # create state value matrix for the given policy
        for state in range(num_states):
            old_action = np.argmax(policy[state])
            new_action_values = np.zeros(num_actions)
            for action in range(num_actions):
                for probability, nextstate, reward, _ in transitions[state][action]:
                    new_action_values[action] += probability*(reward+GAMMA*V[nextstate])
            new_action = np.argmax(new_action_values)
            policy_stable = (new_action == old_action)
            # replace policy[state] with a new vector
            policy[state] = np.eye(num_actions)[new_action]
        if policy_stable:
            return policy, V
        

#### policy evaluation

In [65]:
def policy_evaluation(policy, env):
    
    num_states   = env.nS # number of states of the env
    num_action   = env.nA # number of actions of the env
    transitions = env.P # transition to the next state
    
    V = np.array(np.zeros(num_states)) # initialize state Value matrix 
    
    for a in range(1000):
        delta = 0
        for state in range(num_states):
            new_value = 0
            # update rule : V(s) = sum(pi(a|s)) * sum(p(s,a))* [r+gamma*V(s')]
            # sum over actions
            for action, p_action in enumerate(policy[state]):
                # sum over s', r
                for probability, nextstate, reward, _ in transitions[state][action]:
                    new_value += p_action*probability*(reward+GAMMA*V[nextstate])
            delta = max(delta,np.abs(new_value-V[state]))
            V[state] = new_value
        if delta<THETA:
            break
    return V # return value evaluation for the given policy

In [69]:
env = gym.make('FrozenLake-v0')
env = env.env

In [70]:
random_policy = np.ones([env.nS,env.nA])/env.nA

In [71]:
v_k = policy_evaluation(random_policy, env)
print(v_k)

[0.013911   0.01161424 0.02094062 0.01046758 0.01623478 0.
 0.04074774 0.         0.03479961 0.08816698 0.14205099 0.
 0.         0.17581855 0.4392897  0.        ]


In [72]:
optimal_policy, optimal_value = policy_iteration(random_policy, env)
print(optimal_policy)

[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]


In [73]:
print(optimal_value)

[0.013911   0.01161424 0.02094062 0.01046758 0.01623478 0.
 0.04074774 0.         0.03479961 0.08816698 0.14205099 0.
 0.         0.17581855 0.4392897  0.        ]
