In [1]:
import numpy as np
import matplotlib

Defining a grid:

- `@`: Unreachable square
- `.`: Reachable square

Define the string describing the grid as `grid_str` \
Also define the self-explanatory `grid_shape`, and `grid_reward`

In [2]:
grid_str = \
    """
    @@@@@@@@@@@@@
    @.....@.....@
    @.....@.....@
    @...........@
    @.....@.....@
    @.....@.....@
    @@.@@@@.....@
    @.....@@@.@@@
    @.....@.....@
    @.....@.....@
    @...........@
    @.....@.....@
    @@@@@@@@@@@@@
    """

grid_shape = (13, 13)
grid_reward = np.zeros(grid_shape, dtype=np.float32)
grid_reward[1, -2] = 1

In [3]:
class Grid:
    def __init__(self, grid_str, grid_shape, grid_reward):
        m, n = grid_shape
        grid_str = grid_str.strip().split('\n')
        grid_str = [line.strip() for line in grid_str]
        self._is_occupied = np.array([[True if grid_str[i][j] == '@' else False for i in range(n)] for j in range(m)], dtype=np.uint8)
        self.grid_shape = grid_shape
        self._reward = grid_reward
    
    def is_occupied(self, i, j):
        return self._is_occupied[i, j]
    
    def reward_at(self, i, j):
        return self._reward[i, j]
    
    def actions(self, i, j):
        m, n = self.grid_shape
        actions = []
        if i > 0 and not self.is_occupied(i - 1, j):
            actions.append([-1, 0])
        if i < m - 1 and not self.is_occupied(i + 1, j):
            actions.append([1, 0])
        if j > 0 and not self.is_occupied(i, j - 1):
            actions.append([0, -1])
        if j < n - 1 and not self.is_occupied(i, j + 1):
            actions.append([0, 1])
        return [np.array(action, dtype=np.int32) for action in actions]

In [4]:
Grid1 = Grid(grid_str, grid_shape, grid_reward)

In [5]:
V = np.zeros(grid_shape, dtype=np.float32)
gamma = np.float32(0.9)

while True:
    V_old = V.copy()
    for i in range(grid_shape[0]):
        for j in range(grid_shape[1]):
            if Grid1.is_occupied(i, j):
                V[i, j] = 0
            else:
                actions = Grid1.actions(i, j)
                V[i, j] = max([Grid1.reward_at(i, j) + gamma * V_old[i + action[0], j + action[1]] for action in actions])
                V[i, j] = round(V[i, j], 2)
    if np.array_equal(V, V_old): break

print(V)

[[0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   1.48 1.65 1.83 2.03 2.26 0.   3.44 3.82 4.25 4.72 5.25 0.  ]
 [0.   1.65 1.83 2.03 2.26 2.51 2.79 3.1  3.44 3.82 4.25 4.72 0.  ]
 [0.   1.48 1.65 1.83 2.03 2.26 0.   2.79 3.1  3.44 3.82 4.25 0.  ]
 [0.   1.33 1.48 1.65 1.83 2.03 0.   2.51 2.79 3.1  3.44 3.82 0.  ]
 [0.   1.2  1.33 1.48 1.65 1.83 0.   2.26 2.51 2.79 3.1  3.44 0.  ]
 [0.   0.   0.   1.33 0.   0.   0.   0.   0.   0.   2.79 0.   0.  ]
 [0.   0.97 1.08 1.2  1.08 0.97 1.08 0.   2.03 2.26 2.51 2.26 0.  ]
 [0.   0.87 0.97 1.08 0.97 1.08 1.2  0.   1.83 2.03 2.26 2.03 0.  ]
 [0.   0.78 0.87 0.97 1.08 1.2  1.33 1.48 1.65 1.83 2.03 1.83 0.  ]
 [0.   0.7  0.78 0.87 0.97 1.08 1.2  0.   1.48 1.65 1.83 1.65 0.  ]
 [0.   0.63 0.7  0.78 0.87 0.97 1.08 0.   1.33 1.48 1.65 1.48 0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]]
