This is the gridworld from example 3.8 in the book. 

## Setup

In [1]:
import numpy as np
import pandas as pd 
from functools import partial
from itertools import permutations

In [3]:
## Parameters 
nrow = 5
ncol = 5 
k = 1  # number of moves between policy iterations
gamma = 0.9  # discount factor

In [6]:
# Setup gameboard 
grid = np.zeros((nrow, ncol))
# Add teleporters - start, end, score
tele_start_l = [(0,1), (0,3)]
tele_end_l   = [(4,1), (2,3)]
tele_score_l = [10   ,  5   ]

In [7]:
# Start policy 
pi = [0.25, 0.25, 0.25, 0.25]
# We have a different pi for each square now 
pi_list = [[pi for j in range(ncol)] for i in range(nrow)]
# Actions - up, down, left, right
actions = [(-1,0), (1,0), (0,-1), (0,1)]

## Policy evaluation 

In [8]:
def is_on_grid(pos): 
    if min(pos) < 0 or pos[0] >= nrow or pos[1] >= ncol:   
        return False
    return True

In [20]:
def eval_action(pos, action): 
    """ move, score, and deal with teleporters
    pos, action are tuples. 
    pos is where you are now
    action is the coordinate change"""
    reward = 0 
    ### teleporter case
    for i in range(len(tele_start_l)):
        if pos == tele_start_l[i]:     
            new_pos = tele_end_l[i]; 
            reward += tele_score_l[i]
    else: 
        ### no teleporter, make valid move 
        new_pos = tuple(np.add(pos,action))
        if is_on_grid(new_pos) == False:   reward -=1 
        if min(new_pos) < 0:      new_pos = (np.max([0, new_pos[0]]), 
                                             np.max([0, new_pos[1]]))
        elif new_pos[0] >= nrow:  new_pos = ((nrow-1), new_pos[1])
        elif new_pos[1] >= ncol:  new_pos = (new_pos[0], (ncol-1))
    return (new_pos, reward)

In [21]:
def bellman_update(idx, reward_l, new_pos_l,pi):
    return (pi[idx]*(reward_l[idx] + gamma * v[new_pos_l[idx]]))

In [22]:
def update_value_for_square(pos,v,pi):
    tmp = [eval_action(pos, action) for action in actions]
    new_pos_l = [o[0] for o in tmp]
    reward_l  = [o[1] for o in tmp]
    # update the value function
    v[pos] = sum(map(partial(bellman_update, reward_l=reward_l, 
                              new_pos_l = new_pos_l, pi=pi), range(len(pi))))
    return(v)

In [29]:
pos_l = [(x,y) for x in range(nrow) for y in range(ncol)]
v = np.zeros((nrow,ncol))
k=1000
for i in range(k): 
    for pos in pos_l: 
        v = update_value_for_square(pos,v,pi)

In [30]:
v.round(1)

array([[ 10. ,  21.1,  11.3,  12.6,   5.6],
       [  5.6,   8.1,   6.2,   5.5,   3.2],
       [  2.3,   3.2,   2.8,   2.3,   1.2],
       [  0.3,   0.9,   0.9,   0.5,  -0.2],
       [ -1. ,  -0.5,  -0.4,  -0.6,  -1.2]])

## Policy iteration

Where we define a new policy based on the value function. 

In [387]:
for pos in pos_l: 
    legal_moves = [is_on_grid(tuple(np.add(pos,action))) for action in actions]
    values = [v[tuple(np.add(pos,act))] 
              if legal else -99999999 for act, legal in zip(actions, legal_moves)]
    max_val = max(values)
    n_max_values = sum([1 if o == max_val else 0 for o in values])
    pi_tmp = [1/n_max_values if o == max_val else 0 for o in values]
    pi_list[pos[0]][pos[1]] = pi_tmp