# TX00DQ05-3001 Exercises 2

Note that you don't have to use the functions / other code in the cells below. They are there just in case you need inspiration to get started.

In [1]:
import numpy as np
print(np.__version__)
import numpy.linalg as LA

1.16.2


## Exercise 1: Iterative policy evaluation. 

Calculate state-value function V for the gridworld of Sutton & Barto example 4.1. Policy is assumed to be random, ie. each of the four directions are equally likely. Movement that would result in leaving the grid (for example moving up in top row) will leave state unchanged (but action has been taken). Gamma (discount factor) is assumed to be = 1, ie. no discounting.

When norm of the difference between new V and the old one is less than eps, stop iteration.

Compare needed number of iterations between synchronous (sweep over all states, and update value function after the sweep) and asynchronous (use always the latest values) update of state-value function.

Note that numpy tensor assignment does not create a copy. You might want to use .copy() method to avoid sharing a reference to the same array.

In [2]:
UP   = np.array((-1, 0))
DOWN = np.array((1, 0))
LEFT = np.array((0, -1))
RIGHT= np.array((0, 1))

DIRECTIONS = (UP, DOWN, LEFT, RIGHT)
DIRECTIONS_LABELS = ('↑', '↓', '←', '→')

In [3]:
# fully synchronous
rows_count = 4
columns_count = 4
V = np.zeros((rows_count, columns_count))
V_new = np.zeros((rows_count, columns_count))
terminating = [(0,0), (rows_count-1, columns_count-1)]
stepcost = -1

maxiters = 1000
eps = 0.0000001

def take_action(row, column, direction):
    return tuple(np.clip((row, column) + direction, 0, 3))

for i in range(maxiters):
    V = V_new.copy()
    for row in range(rows_count):
        for col in range(columns_count):
            if (row, col) in terminating:
                continue
            V_new[row, col] = sum((-1 + V[take_action(row, col, action)] for action in DIRECTIONS)) / 4
    
    if LA.norm(V - V_new) < eps:
        break

print("iteration", i)
with np.printoptions(precision=3):
    print(V)

iteration 319
[[  0. -14. -20. -22.]
 [-14. -18. -20. -20.]
 [-20. -20. -18. -14.]
 [-22. -20. -14.   0.]]


In [4]:
# on way between synchronous and asynchronous
# (V_new assignment and summing)
rows_count = 4
columns_count = 4
V = np.zeros((rows_count, columns_count))
V_new = np.zeros((rows_count, columns_count))
terminating = [(0,0), (rows_count-1, columns_count-1)]
stepcost = -1

maxiters = 1000
eps = 0.0000001

def take_action(row, column, direction):
    return tuple(np.clip((row, column) + direction, 0, 3))

for i in range(maxiters):
    V = V_new.copy()
    for row in range(rows_count):
        for col in range(columns_count):
            if (row, col) in terminating:
                continue
            V_new[row, col] = sum((-1 + V_new[take_action(row, col, action)] for action in DIRECTIONS)) / 4
    
    if LA.norm(V - V_new) < eps:
        break

print("iteration", i)
with np.printoptions(precision=3):
    print(V)

iteration 205
[[  0. -14. -20. -22.]
 [-14. -18. -20. -20.]
 [-20. -20. -18. -14.]
 [-22. -20. -14.   0.]]


In [5]:
# asynchronous
rows_count = 4
columns_count = 4
V = np.zeros((rows_count, columns_count))
V_new = np.zeros((rows_count, columns_count))
terminating = [(0,0), (rows_count-1, columns_count-1)]
stepcost = -1

maxiters = 1000
eps = 0.0000001

def take_action(row, column, direction):
    return tuple(np.clip((row, column) + direction, 0, 3))

for i in range(maxiters):
    for row in range(rows_count):
        for col in range(columns_count):
            if (row, col) in terminating:
                continue
            V = V_new.copy()
            V_new[row, col] = sum((-1 + V[take_action(row, col, action)] for action in DIRECTIONS)) / 4
    
    if LA.norm(V - V_new) < eps:
        break

print("iteration", i)
with np.printoptions(precision=3):
    print(V)

iteration 185
[[  0. -14. -20. -22.]
 [-14. -18. -20. -20.]
 [-20. -20. -18. -14.]
 [-22. -20. -14.   0.]]


In [6]:
# asynchronous
rows_count = 4
columns_count = 4
V = np.zeros((rows_count, columns_count))
V_new = np.zeros((rows_count, columns_count))
terminating = [(0,0), (rows_count-1, columns_count-1)]
stepcost = -1

maxiters = 1000
eps = 0.0000001

def take_action(row, column, direction):
    return tuple(np.clip((row, column) + direction, 0, 3))

for i in range(maxiters):
    for row in range(rows_count):
        for col in range(columns_count):
            if (row, col) in terminating:
                continue
            V = V_new.copy()
            V_new[row, col] = sum((-1 + V[take_action(row, col, action)] for action in DIRECTIONS)) / 4
    
    if LA.norm(V - V_new) < eps:
        break

print("iteration", i)
with np.printoptions(precision=3):
    print(V)

iteration 185
[[  0. -14. -20. -22.]
 [-14. -18. -20. -20.]
 [-20. -20. -18. -14.]
 [-22. -20. -14.   0.]]


## Exercise 2: Greedy policy. 

Based on the state-value function computed in exercise 1, print out deterministic greedy policy function. Is the policy generated also optimal one?

In [7]:
# greedy policy
policy = np.full_like(V, 'X', 'str')

for row in range(rows_count):
    for col in range(columns_count):
        if (row, col) in terminating:
            continue
        # if rewards are the same they do not need to be taken into account while greedily choosing a policy, as:
        # argmax([r + v1, r + v2, ..., r + vk]) = argmax([v1, v2, ..., vk])
        direction_index = np.argmax([V[take_action(row, col, action)] for action in DIRECTIONS])
        policy[row, col] = DIRECTIONS_LABELS[direction_index]

print(policy)

[['X' '←' '←' '←']
 ['↑' '↑' '←' '↓']
 ['↑' '↑' '↓' '↓']
 ['↑' '→' '→' 'X']]


Looks optimal to me :)

## Exercise 3: Value function and policy in modified gridworld.

Change the definition of the exercise 1 gridworld by assigning a cost of -8 to movement in "up" direction. Compute the value function and greedy policy based on the value function. Is the greedy policy optimal?

In [8]:
rows_count = 4
columns_count = 4
V = np.zeros((rows_count, columns_count))
V_new = np.zeros((rows_count, columns_count))
terminating = [(0,0), (rows_count-1, columns_count-1)]
stepcost = -1

maxiters = 1000
eps = 0.0000001

def take_action(row, column, direction):
    return tuple(np.clip((row, column) + direction, 0, 3))

def get_cost(action):
    if action is UP:
        return -8
    return -1

for i in range(maxiters):
    for row in range(rows_count):
        for col in range(columns_count):
            if (row, col) in terminating:
                continue
            V = V_new.copy()
            V_new[row, col] = sum((get_cost(action) + V[take_action(row, col, action)] for action in DIRECTIONS)) / 4
    
    if LA.norm(V - V_new) < eps:
        break

print("iteration", i)
with np.printoptions(precision=3):
    print(V)

iteration 197
[[  0.  -38.5 -55.  -60.5]
 [-38.5 -49.5 -55.  -55. ]
 [-55.  -55.  -49.5 -38.5]
 [-60.5 -55.  -38.5   0. ]]


In [9]:
# greedy policy
policy = np.full_like(V, 'X', 'str')

for row in range(rows_count):
    for col in range(columns_count):
        if (row, col) in terminating:
            continue
        direction_index = np.argmax([get_cost(action) + V[take_action(row, col, action)] for action in DIRECTIONS])
        policy[row, col] = DIRECTIONS_LABELS[direction_index]

print(policy)

[['X' '←' '←' '←']
 ['↑' '←' '←' '↓']
 ['↑' '→' '↓' '↓']
 ['→' '→' '→' 'X']]


That policy is not optimal. Proof:
For state `(2, 3)` (one above lower left corner) the state value under the current policy (taking action `UP`) is *-16*.
A better policy would be to take the action `DOWN` in the state, with actions for the remaining states kept the same,
now the state value is *-4*.

## Extra exercise: Policy iteration

Implement policy iteration, ie. create a policy with the help of the value function from previous policy and iterate until policy is stable.

In [10]:
rows_count = 4
columns_count = 4
V = np.zeros((rows_count, columns_count))
V_new = np.zeros((rows_count, columns_count))
terminating = [(0,0), (rows_count-1, columns_count-1)]

policy = np.empty_like(V, dtype=np.object)
for i in range(len(policy.flat)):
    # np.random.choice was reluctant to work
    policy.flat[i] = DIRECTIONS[np.random.randint(len(DIRECTIONS))]
for r, c in terminating:
    policy[r,c] = None

maxiters = 1000
eps = 0.0000001

def take_action(row, column, direction):
    return tuple(np.clip((row, column) + direction, 0, 3))

def get_cost(action):
    if action is UP:
        return -8
    return -1

for j in range(10):
    # policy evaluation
    for i in range(maxiters):
        for row in range(rows_count):
            for col in range(columns_count):
                if (row, col) in terminating:
                    continue
                V = V_new.copy()
                action = policy[row, col]
                V_new[row, col] = get_cost(action) + V[take_action(row, col, action)]

        if LA.norm(V - V_new) < eps:
            break

    # policy improvement
    policy_stable = True
    for row in range(rows_count):
        for col in range(columns_count):
            if (row, col) in terminating:
                continue
            old_action = policy[row, col]
            direction_index = np.argmax([get_cost(action) + V[take_action(row, col, action)] for action in DIRECTIONS])
            policy[row, col] = DIRECTIONS[direction_index]
            if old_action is not policy[row, col]:
                policy_stable = False
            
    if policy_stable:
        break

with np.printoptions(precision=3):
    print(V)

[[ 0. -1. -2. -3.]
 [-5. -4. -3. -2.]
 [-4. -3. -2. -1.]
 [-3. -2. -1.  0.]]


In [11]:
# policy to arrows
pretty_policy = np.full_like(policy, 'X', 'str')

for row in range(rows_count):
    for col in range(columns_count):
        if (row, col) in terminating:
            continue
        # direction_index = DIRECTIONS.index(policy[row, col]) # this has problems with numpy
        direction_index = np.argmax([policy[row, col] is d for d in DIRECTIONS])
        pretty_policy[row, col] = DIRECTIONS_LABELS[direction_index]

print(pretty_policy)

[['X' '←' '←' '↓']
 ['↓' '↓' '↓' '↓']
 ['↓' '↓' '↓' '↓']
 ['→' '→' '→' 'X']]


That is indeed optimal

## Exercise 4: Value iteration

Solve the exercise 1 gridworld with value iteration algorithm. Solve also modified gridworld (cost of "up" movement = -4).

In [12]:
# value iteration, reward = -1
theta = 0.0001
maxiters = 10
V = np.zeros((rows_count,columns_count))

def take_action(row, column, direction):
    return tuple(np.clip((row, column) + direction, 0, 3))

for i in range(maxiters):
    delta = 0
    for row in range(rows_count):
        for col in range(columns_count):
            if (row, col) in terminating:
                continue
            # the asynchronous would not be working the same way it did in the previous exercises
            # synchronous would work the same, but I was mood for something fresh from the book
            v = V[row, col]
            V[row, col] = max((-1 + V[take_action(row, col, action)] for action in DIRECTIONS))
            delta = max(delta, abs(v - V[row, col]))
    
    if delta < theta:
        break

with np.printoptions(precision=3):
    print(V)

[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]


In [13]:
# greedy policy
policy = np.full_like(V, 'X', 'str')

for row in range(rows_count):
    for col in range(columns_count):
        if (row, col) in terminating:
            continue
        direction_index = np.argmax([-1 + V[take_action(row, col, action)] for action in DIRECTIONS])
        policy[row, col] = DIRECTIONS_LABELS[direction_index]

print(policy)

[['X' '←' '←' '↓']
 ['↑' '↑' '↑' '↓']
 ['↑' '↑' '↓' '↓']
 ['↑' '→' '→' 'X']]


Again an optimal policy

In [14]:
# value iteration with modified cost of going up (-4)
theta = 0.0001
maxiters = 10
V = np.zeros((rows_count,columns_count))

def take_action(row, column, direction):
    return tuple(np.clip((row, column) + direction, 0, 3))

def get_cost(action):
    if action is UP:
        return -4
    return -1

for i in range(maxiters):
    delta = 0
    for row in range(rows_count):
        for col in range(columns_count):
            if (row, col) in terminating:
                continue
            v = V[row, col]
            V[row, col] = max((get_cost(action) + V[take_action(row, col, action)] for action in DIRECTIONS))
            delta = max(delta, abs(v - V[row, col]))
    
    if delta < theta:
        break

with np.printoptions(precision=3):
    print(V)

[[ 0. -1. -2. -3.]
 [-4. -4. -3. -2.]
 [-4. -3. -2. -1.]
 [-3. -2. -1.  0.]]


In [15]:
# greedy policy
policy = np.full_like(V, 'X', 'str')

for row in range(rows_count):
    for col in range(columns_count):
        if (row, col) in terminating:
            continue
        direction_index = np.argmax([get_cost(action) + V[take_action(row, col, action)] for action in DIRECTIONS])
        policy[row, col] = DIRECTIONS_LABELS[direction_index]

print(policy)

[['X' '←' '←' '↓']
 ['↑' '↓' '↓' '↓']
 ['↓' '↓' '↓' '↓']
 ['→' '→' '→' 'X']]


Optimal (by eyeballing...). The value iteration is really neat and beautiful for this problem!