## Policy Improvement

Policy improvement is the process of building a new policy that improves on an original policy by acting greedily with respect to the original policy's state-value function.

In [7]:
import numpy as np

def policy_improvement(V, P, gamma=1.0):
    """Return the policy that is greedy with respect to the state-value function V.

    For every state s and action a the action-value is computed as

        Q[s][a] = sum over transitions of prob * (reward + gamma * V[next_state] * (not done))

    and the new policy picks argmax_a Q[s][a] (the lowest action index wins ties).

    Args:
        V: state values, indexable by state index.
        P: transition model; P[s][a] is an iterable of
            (prob, next_state, reward, done) tuples. Every state is assumed
            to expose the same number of actions as state 0.
        gamma: discount factor.

    Returns:
        A callable pi(s) giving the greedy action for state s. It raises
        KeyError for a state index outside range(len(P)).
    """
    n_states = len(P)

    # Q is accumulated with "+=" below, so it has to start at zero.
    Q = np.zeros((n_states, len(P[0])))

    # loop through the states, actions, and transitions
    # and calculate the action-value function
    for s in range(n_states):
        for a in range(len(P[s])):
            for prob, next_state, reward, done in P[s][a]:
                # No value is bootstrapped from a transition that ends the
                # episode; this matches the backup used by policy_evaluation.
                Q[s][a] += prob * (reward + gamma * V[next_state] * (not done))

    # Build the state -> greedy-action table once instead of on every call.
    greedy_actions = dict(enumerate(np.argmax(Q, axis=1)))

    return lambda s: greedy_actions[s]

We can consider the "careful" policy on the frozen-lake environment and its state value function:

In [8]:
import gymnasium as gym

# Build the FrozenLake-v1 environment through Gymnasium's registry.
frozen_lake = gym.make('FrozenLake-v1')
# Transition table of the unwrapped environment; the rest of the notebook reads it as
# P[state][action] -> iterable of (prob, next_state, reward, done) tuples.
P = frozen_lake.env.unwrapped.P
# State index that evaluate() counts as a success when an episode ends there.
goal_state = 15

# Integer action codes, in the same order as the ('<', 'v', '>', '^') symbols of print_policy.
LEFT, DOWN, RIGHT, UP = range(4)

In [9]:
def careful_pi(s):
    """Hand-written deterministic policy: return the action chosen in state s.

    States are numbered 0..15; any other state raises KeyError.
    """
    action_by_state = dict(enumerate((
        LEFT, UP, UP, UP,
        LEFT, LEFT, UP, LEFT,
        UP, DOWN, LEFT, LEFT,
        LEFT, RIGHT, RIGHT, LEFT,
    )))
    return action_by_state[s]

In [10]:
def evaluate(env, pi, goal_state, n_episodes=100, max_steps=200, seed=None):
    """Estimate by simulation how well policy pi performs in env.

    Args:
        env: environment whose reset() returns (state, info) and whose
            step(action) returns (state, reward, terminated, truncated, info).
        pi: callable mapping a state to an action.
        goal_state: an episode counts as a success when it ends in this state.
        n_episodes: number of episodes to simulate.
        max_steps: hard cap on the number of steps per episode.
        seed: optional seed passed to the first env.reset() call so that a
            run can be repeated; None keeps the previous unseeded behavior.

    Returns:
        A tuple (success percentage, mean undiscounted return per episode).
    """
    success = 0
    results = []
    for episode in range(n_episodes):
        # Seed only the first reset; later resets continue the same random stream.
        if seed is not None and episode == 0:
            state, _ = env.reset(seed=seed)
        else:
            state, _ = env.reset()
        done = False
        steps = 0
        episode_return = 0.0
        while not done and steps < max_steps:
            state, reward, terminated, truncated, _ = env.step(pi(state))
            # An episode is over when it terminates OR when the environment
            # truncates it (time limit); stepping further would be invalid.
            done = terminated or truncated
            episode_return += reward
            steps += 1
        results.append(episode_return)
        if state == goal_state:
            success += 1
    return (success / n_episodes) * 100, np.mean(results)

In [11]:
# Simulate the careful policy and report how it performs.
probability_success, mean_return = evaluate(
    frozen_lake,
    careful_pi,
    goal_state=goal_state,
)

print("Reaches goal ", probability_success, "%")
print("Obtains an average undiscounted return of ", mean_return)

Reaches goal  57.99999999999999 %
Obtains an average undiscounted return of  0.58


In [13]:
def policy_evaluation(pi, P, gamma=1.0, theta=1e-10):
    """Iteratively compute the state-value function of policy pi.

    Each sweep rebuilds V from the previous estimate using the expected
    one-step backup of the action pi(s); transitions flagged as done do not
    bootstrap from the next state. Sweeps stop once the largest absolute
    change between two consecutive estimates drops below theta.

    Args:
        pi: callable mapping a state index to an action.
        P: transition model; P[s][a] is an iterable of
            (prob, next_state, reward, done) tuples.
        gamma: discount factor.
        theta: convergence threshold on the max absolute change.

    Returns:
        A NumPy array with one value per state.
    """
    n_states = len(P)
    prev_V = np.zeros(n_states)
    converged = False
    while not converged:
        V = np.zeros(n_states)
        for state in range(n_states):
            transitions = P[state][pi(state)]
            for p, s_next, r, terminal in transitions:
                V[state] += p * (r + gamma * prev_V[s_next] * (not terminal))
        converged = np.max(np.abs(prev_V - V)) < theta
        prev_V = V.copy()
    return V

In [14]:
def print_state_value_function(V, P, n_cols=4, prec=3, title='State-value function:'):
    """Print the state values as a grid with n_cols cells per row.

    A state whose every transition is flagged done is shown as a blank cell;
    any other state is shown as its zero-padded index followed by its value.

    Args:
        V: state values, indexable by state index.
        P: transition model; P[s] is a dict mapping each action to an iterable
            of (prob, next_state, reward, done) tuples.
        n_cols: number of cells printed per row.
        prec: number of decimals the values are rounded to.
        title: heading printed before the grid.
    """
    print(title)
    # Always print exactly `prec` decimals so every cell has the same width;
    # str() of a rounded float drops trailing zeros (0.4840 -> 0.484).
    decimals = max(prec, 0)
    for s in range(len(P)):
        v = V[s]
        print("| ", end="")
        if np.all([done for action in P[s].values() for _, _, _, done in action]):
            print("".rjust(9), end=" ")
        else:
            value_text = f"{np.round(v, prec):.{decimals}f}"
            print(str(s).zfill(2), value_text.rjust(6), end=" ")
        if (s + 1) % n_cols == 0:
            print("|")

In [15]:
# State values of the careful policy, discounted with gamma = 0.99.
V = policy_evaluation(careful_pi, P, gamma=0.99)

In [16]:
# Show the values as a grid, rounded to four decimals.
print_state_value_function(V, P, prec=4)

State-value function:
| 00 0.4079 | 01 0.3754 | 02 0.3543 | 03 0.3438 |
| 04 0.4203 |           | 06 0.1169 |           |
| 08 0.4454 | 09  0.484 | 10 0.4328 |           |
|           | 13 0.5884 | 14 0.7107 |           |


Now we can try to improve the policy:

In [17]:
# Greedy policy with respect to V, the value function of the careful policy.
careful_plus_pi = policy_improvement(V, P, gamma=0.99)

We can show the improved policy, its probability of success and its mean return using simulation:

In [18]:
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='Policy:'):
    """Print the action chosen by pi in every state as a grid.

    A state whose every transition is flagged done is shown as a blank cell;
    any other state is shown as its zero-padded index followed by the symbol
    of the action pi selects there.

    Args:
        pi: callable mapping a state index to an action index.
        P: transition model; P[s] is a dict mapping each action to an iterable
            of (prob, next_state, reward, done) tuples.
        action_symbols: symbol for each action index, in order.
        n_cols: number of cells printed per row.
        title: heading printed before the grid.
    """
    print(title)
    symbol_of = dict(enumerate(action_symbols))
    for state in range(len(P)):
        action = pi(state)
        print("| ", end="")
        terminal_flags = [
            terminal
            for transitions in P[state].values()
            for _prob, _next_state, _reward, terminal in transitions
        ]
        if not np.all(terminal_flags):
            cell = (str(state).zfill(2), symbol_of[action].rjust(6))
        else:
            cell = ("".rjust(9),)
        print(*cell, end=" ")
        if (state + 1) % n_cols == 0:
            print("|")

In [19]:
# Show the action the improved policy takes in each state.
print_policy(careful_plus_pi, P)

Policy:
| 00      < | 01      ^ | 02      ^ | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |


In [20]:
# Simulate the improved (careful-plus) policy and report how it performs.
probability_success, mean_return = evaluate(
    frozen_lake,
    careful_plus_pi,
    goal_state=goal_state,
)

print("Reaches goal ", probability_success, "%")
print("Obtains an average undiscounted return of ", mean_return)

Reaches goal  78.0 %
Obtains an average undiscounted return of  0.78


The new policy is better than the original policy. This is great! 

Is there a better policy than this one? We can try to improve the careful-plus policy:

In [21]:
# One more evaluation/improvement round, starting from the careful-plus policy.
V = policy_evaluation(careful_plus_pi, P, gamma=0.99)
careful_plus_plus_pi = policy_improvement(V, P, gamma=0.99)

In [22]:
# Show the result of the second improvement round.
print_policy(careful_plus_plus_pi, P)

Policy:
| 00      < | 01      ^ | 02      ^ | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |


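There's no improvement this time: the result is the same policy. A policy that is already greedy with respect to its own value function satisfies the Bellman optimality equation, so the careful-plus policy is an optimal policy of the frozen-lake environment (for the discount factor used here).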

Even if we start with an **adversarial policy** (designed to perform poorly), alternating policy evaluation and improvement would still end up with an optimal policy:

In [24]:
def adversarial_pi(s):
    """Hand-written deterministic policy: return the action chosen in state s.

    States are numbered 0..15; any other state raises KeyError.
    """
    action_by_state = dict(enumerate((
        UP, UP, UP, UP,
        UP, LEFT, UP, LEFT,
        LEFT, LEFT, LEFT, LEFT,
        LEFT, LEFT, LEFT, LEFT,
    )))
    return action_by_state[s]

In [25]:
# Show the action the adversarial policy takes in each state.
print_policy(adversarial_pi, P)

Policy:
| 00      ^ | 01      ^ | 02      ^ | 03      ^ |
| 04      ^ |           | 06      ^ |           |
| 08      < | 09      < | 10      < |           |
|           | 13      < | 14      < |           |


In [26]:
# Simulate the adversarial policy and report how it performs.
probability_success, mean_return = evaluate(
    frozen_lake,
    adversarial_pi,
    goal_state=goal_state,
)

print("Reaches goal ", probability_success, "%")
print("Obtains an average undiscounted return of ", mean_return)

Reaches goal  0.0 %
Obtains an average undiscounted return of  0.0


In [27]:
# Four rounds of policy evaluation followed by greedy improvement, starting
# from the adversarial policy. Each round evaluates the policy produced by the
# previous round, so the statements must run in this order.
V = policy_evaluation(adversarial_pi, P, gamma=0.99)
adversarial_pi_2 = policy_improvement(V, P, gamma=0.99)

V = policy_evaluation(adversarial_pi_2, P, gamma=0.99)
adversarial_pi_3 = policy_improvement(V, P, gamma=0.99)

V = policy_evaluation(adversarial_pi_3, P, gamma=0.99)
adversarial_pi_4 = policy_improvement(V, P, gamma=0.99)

# After this round V holds the values of adversarial_pi_4, not of adversarial_pi_5.
V = policy_evaluation(adversarial_pi_4, P, gamma=0.99)
adversarial_pi_5 = policy_improvement(V, P, gamma=0.99)

In [28]:
# Show the policy obtained after the four improvement rounds.
print_policy(adversarial_pi_5, P)

Policy:
| 00      < | 01      ^ | 02      > | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |


In [29]:
# Simulate the policy reached from the adversarial start and report how it performs.
probability_success, mean_return = evaluate(
    frozen_lake,
    adversarial_pi_5,
    goal_state=goal_state,
)

print("Reaches goal ", probability_success, "%")
print("Obtains an average undiscounted return of ", mean_return)

Reaches goal  77.0 %
Obtains an average undiscounted return of  0.77
