# TX00DQ05-3001 Exercises 2

Note that you don't have to use the functions / other code in the cells below. They are there just in case you need inspiration to get started.

In [1]:
import numpy as np
print(np.__version__)
import numpy.linalg as LA

1.15.4


## Exercise 1: Iterative policy evaluation. 

Calculate state-value function V for the gridworld of Sutton & Barto example 4.1. Policy is assumed to be random, ie. each of the four directions are equally likely. Movement that would result in leaving the grid (for example moving up in top row) will leave state unchanged (but action has been taken). Gamma (discount factor) is assumed to be = 1, ie. no discounting.

When norm of the difference between new V and the old one is less than eps, stop iteration.

Compare needed number of iterations between synchronous (sweep over all states, and update value function after the sweep) and asynchronous (use always the latest values) update of state-value function.

Note that numpy tensor assignment does not create a copy. You might want to use .copy() method to avoid sharing a reference to the same array.

In [2]:
rows_count = 4
columns_count = 4

terminating = [(0,0), (rows_count-1, columns_count-1)]

stepcost = -1
prob_up = 0.25
prob_left = 0.25
prob_right = 0.25
prob_down = 0.25

maxiters = 1000
eps = 0.0000001

step_up = -1
step_down = -1
step_left = -1
step_right = -1

def valueFor(row, column, direction):
    # YOUR CODE HERE
    if direction == '↑':
        row -= 1
        if row < 0:
            row = 0
    elif direction == '←':
        column -= 1
        if column < 0:
            column = 0
    elif direction == '→':
        column += 1
        if column >= columns_count:
            column = columns_count-1
    elif direction == '↓':
        row += 1
        if row >= rows_count:
            row = rows_count-1
    return V[row, column]

def calculateValue(row, column):
    up = valueFor(row,column,'↑')
    left = valueFor(row,column,'←')
    right = valueFor(row,column,'→')
    down = valueFor(row,column,'↓')
    val = prob_up*(step_up+up) + prob_left*(step_left+left) + prob_right*(step_right+right) + prob_down*(step_down+down)
    return val

def resetMatrix():
    return np.zeros((rows_count, columns_count))

In [3]:
def forExercise1(sync):
    global V, V_new
    for i in range(maxiters):
    # YOUR CODE HERE
        if sync:
            V = V_new.copy()
        else:
            V = V_new
            #V is assigned to the same pointer as V_new, technically the same variables pointing to the same data
        norm = LA.norm(V)
        for r in range(rows_count):
            for c in range(columns_count):
                if not (r,c) in terminating:
                    V_new[r,c] = calculateValue(r,c)    
        new_norm = LA.norm(V_new)
        if np.absolute(new_norm - norm) <= eps:
            break
    if sync:
        print("Synchronous Iteration:", i)
    else:
        print("Asynchronous Iteration:", i)
    with np.printoptions(precision=3):
        print(V)
    return

V = resetMatrix()
V_new = resetMatrix()
forExercise1(sync=True)

V = resetMatrix()
V_new = resetMatrix()
forExercise1(sync=False)

Synchronous Iteration: 319
[[  0. -14. -20. -22.]
 [-14. -18. -20. -20.]
 [-20. -20. -18. -14.]
 [-22. -20. -14.   0.]]
Asynchronous Iteration: 205
[[  0. -14. -20. -22.]
 [-14. -18. -20. -20.]
 [-20. -20. -18. -14.]
 [-22. -20. -14.   0.]]


## Exercise 2: Greedy policy. 

Based on the state-value function computed in exercise 1, print out deterministic greedy policy function. Is the policy generated also optimal one?

In [4]:
# YOUR CODE HERE
# ←↑↓→
def greedy(row, column):
    up = valueFor(row,column,'↑')
    left = valueFor(row,column,'←')
    right = valueFor(row,column,'→')
    down = valueFor(row,column,'↓')
    num_list = [up + step_up, left + step_left, right + step_right, down + step_down]
    max_val = max(num_list)
    directions = ''
    
    if left+step_left == max_val:
        directions += '←'
    if up+step_up == max_val:
        directions += '↑'
    if down+step_down == max_val:
        directions += '↓'
    if right+step_right == max_val:
        directions += '→'
    
    return directions

def calculatePolicy():
    global policy
    policy = []
    for r in range(rows_count):
        policy.append([])
        for c in range(columns_count):
            if not (r,c) in terminating:
                policy[r].append(greedy(r,c))
            else:
                policy[r].append('T')
    return

#Initiate policy
policy = []
for i in range(rows_count):
    policy.append([])
    
calculatePolicy()

with np.printoptions(precision=3):
    print(V)

for item in policy:
    print(item)
    
print("The policy is optimal. It goes through the highest-reward path.")

[[  0. -14. -20. -22.]
 [-14. -18. -20. -20.]
 [-20. -20. -18. -14.]
 [-22. -20. -14.   0.]]
['T', '←', '←', '←']
['↑', '←', '←', '↓']
['↑', '↑', '↓→', '↓']
['↑', '→', '→', 'T']
The policy is optimal. It goes through the highest-reward path.


In [5]:
#Check why is there only one arrow
print(V[1][0])
print(V[0][1])
print(policy[1][1])

-13.99999978120095
-13.999999781200952
←


## Exercise 3: Value function and policy in modified gridworld.

Change the definition of the exercise 1 gridworld by assigning a cost of -8 to movement in "up" direction. Compute the value function and greedy policy based on the value function. Is the greedy policy optimal?

In [6]:
# YOUR CODE HERE
V = resetMatrix()
V_new = resetMatrix()

step_up = -8

for i in range(maxiters):
# YOUR CODE HERE
    V = V_new
    norm = LA.norm(V)
    for r in range(rows_count):
        for c in range(columns_count):
            if not (r,c) in terminating:
                V_new[r,c] = calculateValue(r,c)
    new_norm = LA.norm(V_new)
    if np.absolute(new_norm - norm) <= eps:
        break

calculatePolicy()

print("Iteration (Asynchronous):", i)
with np.printoptions(precision=3):
    print(V)
for item in policy:
    print(item)
    
print("The policy is not optimal since [0,2] should go right and/or down instead (because of -8 cost for going up).")

Iteration (Asynchronous): 216
[[  0.  -38.5 -55.  -60.5]
 [-38.5 -49.5 -55.  -55. ]
 [-55.  -55.  -49.5 -38.5]
 [-60.5 -55.  -38.5   0. ]]
['T', '←', '←', '←']
['↑', '←', '←', '↓']
['↑', '→', '↓→', '↓']
['→', '→', '→', 'T']
The policy is not optimal since [0,2] should go right and/or down instead (because of -8 cost for going up).


## Extra exercise: Policy iteration

Implement policy iteration, ie. create a policy with the help of the value function from previous policy and iterate until policy is stable.

In [7]:
# YOUR CODE HERE
# ←↑↓→
import copy

prob_up = 0
prob_left = 0
prob_right = 0
prob_down = 0
theta = 0.0000001

def calculateProb(direction):
    prob = 1/len(direction)
    global prob_left, prob_right, prob_up, prob_down
    if '←' in direction:
        prob_left = prob
    if '↑' in direction:
        prob_up = prob
    if '↓' in direction:
        prob_down = prob
    if '→' in direction:
        prob_right = prob
    return

def clearProb():
    global prob_left, prob_right, prob_up, prob_down
    prob_left = 0
    prob_right = 0
    prob_up = 0
    prob_down = 0
    return

for loop in range(maxiters):
    for i in range(maxiters):
        delta = 0
        for r in range(rows_count):
            for c in range(columns_count):
                if not (r,c) in terminating:
                    calculateProb(policy[r][c])
                    val = calculateValue(r,c)
                    clearProb()
                    if delta < np.absolute(V[r,c] - val):
                        delta = np.absolute(V[r,c] - val)
                    V_new[r,c] = val
        V = V_new
        if delta < theta:
            break
    
    #find new policy
    old_policy = copy.deepcopy(policy)
    calculatePolicy()
    if policy == old_policy:
        break
        
print('Iteration:',loop)
print(V)
for i in policy:
    print(i)

Iteration: 2
[[ 0. -1. -2. -3.]
 [-5. -4. -3. -2.]
 [-4. -3. -2. -1.]
 [-3. -2. -1.  0.]]
['T', '←', '←', '←↓']
['↓→', '↓→', '↓→', '↓']
['↓→', '↓→', '↓→', '↓']
['→', '→', '→', 'T']


## Exercise 4: Value iteration

Solve the exercise 1 gridworld with value iteration algorithm. Solve also modified gridworld (cost of "up" movement = -4).

In [8]:
# value iteration
# ←↑↓→
theta = 0.0001
maxiters = 10

# YOUR CODE HERE
def forExercise4(step_up_value):
    global V, step_up
    step_up = step_up_value
    for i in range(maxiters):
        delta = 0
        for r in range(rows_count):
            for c in range(columns_count):
                if not (r, c) in terminating:
                    direction = greedy(r,c)
                    calculateProb(direction[0])
                    val = calculateValue(r,c)
                    clearProb()
                    if delta < np.absolute(V[r,c] - val):
                        delta = np.absolute(V[r,c] - val)
                    V[r,c] = val
        if delta < theta:
            break
    
    print('Iterations:',i)
    with np.printoptions(precision=3):
        print(V)

    return

V = resetMatrix()
forExercise4(step_up_value = -1)

V = resetMatrix()
forExercise4(step_up_value = -4)

Iterations: 3
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]
Iterations: 4
[[ 0. -1. -2. -3.]
 [-4. -4. -3. -2.]
 [-4. -3. -2. -1.]
 [-3. -2. -1.  0.]]
