4x4 gridworld from example 4.1 

In [248]:
## Import packages and set initial variables   
import numpy as np
np.random.seed = 123
nrow = 4
ncol = 4

# This is our value function
v = np.zeros((nrow,ncol))

## Create the grid 
# The grid will be made up of empty strings except for the terminal states, 
# which will have 'T' 
grid = np.zeros((nrow,ncol), dtype='str')
grid[0,0] = 'T'
grid[nrow-1,ncol-1] = 'T'

# Set up the coordinate changes of moving up, down, left, right 
# Note: this is the oppposite of the xy-plane. Rows is the x-axis, 
# columns is the y-axis
actions = [(-1,0), (1,0), (0,-1), (0,1)]
# The cutoffs represent an equiprobable random policy 
# We select a random number later and the cutoffs are the ranges for 
# each action. 
cutoffs = np.array([0.25, 0.5, 0.75, 1])

In [256]:
## Calculating the value function by simulation 
n = 2000  # Number of episodes 
k = 10   # Maximum number of time steps per episode 
for x in range(nrow): 
    for y in range(ncol):
        G = np.zeros(n)  # Our return for each episode
        for i in range(n): 
            coord = [x,y]  # Starting position of the agent
            r=0; cnt=0  # Reset from last simulation 
            while grid[tuple(coord)] != 'T' and cnt < k: 
                # get next coordinate
                rnum = np.random.uniform()
                coord = np.add(coord, actions[np.min(np.where(rnum < cutoffs))])
                # adjust for going off the grid 
                coord[0] = max(0, coord[0]); coord[0] = min(coord[0], (nrow-1))
                coord[1] = max(0, coord[1]); coord[1] = min(coord[1], (ncol-1))
                # allocate reward, increase counter
                r += -1; cnt += 1
            G[i] = r
        # The value is the average return for that starting state. 
        v[x,y] = np.mean(G)

In [257]:
np.round(v,1)

array([[ 0. , -6.1, -8.4, -9. ],
       [-6.2, -7.8, -8.4, -8.3],
       [-8.3, -8.4, -7.8, -6.2],
       [-9. , -8.4, -6.2,  0. ]])

Knowing v, can we find q? Use the equation
$$ q_\pi (s, a) = \sum_{s'} p(s' | s,a) \left[ r(s,a,s') + \gamma v_\pi(s') \right] $$
The term $ p(s'|s,a) = 1$ for our case, so shouldn't be too hard. We will assume that $ \gamma $ is a parameter we can tweak. 

In [259]:
## Find the action-value function q
#
# Set up 3D array for q
# The first dimesnion holds the direction chosen. 
# Order is up, down, left, right
q = np.zeros((4, nrow, ncol)) 
gamma = 0.9  # discount rate parameter 
for x in range(nrow): 
    for y in range(ncol): 
        for i, action in enumerate(actions): 
            # Get coordinate of the next state s'
            s_prime = np.add((x,y), action)
            # Adjust for going off the grid
            s_prime[0] = max(0, s_prime[0]); s_prime[0] = min(s_prime[0], (nrow-1))
            s_prime[1] = max(0, s_prime[1]); s_prime[1] = min(s_prime[1], (ncol-1))
            # Allocate the action-value function 
            q[i,x,y] = -1 +  gamma * v[tuple(s_prime)] 

In [261]:
np.round(q,1)

array([[[-1. , -6.5, -8.6, -9.1],
        [-1. , -6.5, -8.6, -9.1],
        [-6.6, -8. , -8.6, -8.5],
        [-8.4, -8.6, -8. , -6.6]],

       [[-6.6, -8. , -8.6, -8.5],
        [-8.4, -8.6, -8. , -6.6],
        [-9.1, -8.5, -6.6, -1. ],
        [-9.1, -8.5, -6.6, -1. ]],

       [[-1. , -1. , -6.5, -8.6],
        [-6.6, -6.6, -8. , -8.6],
        [-8.4, -8.4, -8.6, -8. ],
        [-9.1, -9.1, -8.5, -6.6]],

       [[-6.5, -8.6, -9.1, -9.1],
        [-8. , -8.6, -8.5, -8.5],
        [-8.6, -8. , -6.6, -6.6],
        [-8.5, -6.6, -1. , -1. ]]])