# In [1]:
# Keywords: reinforcement learning, action, reward, robot

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table

# --- Gridworld configuration ---------------------------------------------
# The world is a square grid with `size` cells per side; a state is a
# [row, column] list.
size = 5

# Special cells: actionValue() sends every action taken in `a` to `aPrime`
# (reward 10) and every action taken in `b` to `bPrime` (reward 5).
a, aPrime = [0, 1], [4, 1]
b, bPrime = [0, 3], [2, 3]

# Factor applied to the successor state's value by both solvers in this file.
discount = 0.50

# Moves as [d_row, d_col] offsets.  They are listed in the same order as the
# arrow glyphs, so index k in one list corresponds to index k in the other.
actions = [np.array(step) for step in ([0, -1], [-1, 0], [0, 1], [1, 0])]
actionArrowsS = ['←', '↑', '→', '↓']

# Weight given to each of the four actions by figure_3_2_linear_system().
actionProb = 0.25


def actionValue(state, action):
    """Return ``(next_state, reward)`` for taking `action` in `state`.

    Args:
        state: current cell as a ``[row, column]`` list.
        action: offset added element-wise to `state` (one of the entries
            of the module-level ``actions`` list).

    Returns:
        A ``(next_state, reward)`` pair:

        * from the special cell ``a`` (or ``b``) every action leads to
          ``aPrime`` (or ``bPrime``) with reward 10 (or 5);
        * a move that would leave the ``size`` x ``size`` grid keeps the
          agent in `state` and yields reward -1.0;
        * any other move lands on the neighbouring cell with reward 0.
    """
    # The two special cells ignore the chosen action entirely.
    for special, target, bonus in ((a, aPrime, 10), (b, bPrime, 5)):
        if state == special:
            return target, bonus

    candidate = (np.array(state) + action).tolist()
    row, col = candidate
    if 0 <= row < size and 0 <= col < size:
        return candidate, 0

    # Stepping off the grid: stay where we are and pay the penalty.
    return state, -1.0


def draw_image(image):
    """Render a 2-D array as a table of its values on a new figure.

    A new figure/axes pair is created via ``plt.subplots()`` with the axes
    hidden.  One white cell is added per array element showing that
    element, and 1-based labels are added down the left side (rows) and
    along the top (columns).  Nothing is returned; callers in this file
    follow up with ``plt.savefig`` / ``plt.close``.

    Args:
        image: 2-D NumPy array.  It does not have to be square: row and
            column labels are generated from their own axis lengths.
    """
    _, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = image.shape
    # Cell size as a fraction of the axes so the table fills the bbox.
    width, height = 1.0 / ncols, 1.0 / nrows

    # One cell per array element.
    for (i, j), val in np.ndenumerate(image):
        tb.add_cell(i, j, width, height, text=val,
                    loc='center', facecolor='white')

    # Row labels live in an extra column at index -1.
    for i in range(nrows):
        tb.add_cell(i, -1, width, height, text=i + 1, loc='right',
                    edgecolor='none', facecolor='none')

    # Column labels live in an extra (half-height) row at index -1.  They
    # are driven by ncols, not nrows, so a non-square array gets exactly
    # one label per column.
    for j in range(ncols):
        tb.add_cell(-1, j, width, height / 2, text=j + 1, loc='center',
                    edgecolor='none', facecolor='none')

    ax.add_table(tb)

def draw_policy(optimal_values):
    """Render, for every cell, the arrow(s) of its highest-valued moves.

    For each cell the four entries of ``actions`` are tried through
    ``actionValue``; the value of each resulting state is looked up in
    `optimal_values` (the reward is not taken into account) and every
    action whose successor value equals the maximum gets its glyph from
    ``actionArrowsS`` written into the cell.  The table is drawn on a new
    figure created with ``plt.subplots()``; nothing is returned.

    Args:
        optimal_values: 2-D NumPy array of state values indexed
            ``[row, column]``.
    """
    _, axes = plt.subplots()
    axes.set_axis_off()
    table = Table(axes, bbox=[0, 0, 1, 1])

    n_rows, n_cols = optimal_values.shape
    cell_w = 1.0 / n_cols
    cell_h = 1.0 / n_rows

    for row, col in np.ndindex(n_rows, n_cols):
        # Value of the state reached by each action, in `actions` order.
        successor_values = []
        for move in actions:
            (next_row, next_col), _reward = actionValue([row, col], move)
            successor_values.append(optimal_values[next_row, next_col])

        # Ties are kept: every action that attains the maximum is shown.
        is_best = np.asarray(successor_values) == np.max(successor_values)
        arrows = ''.join(actionArrowsS[k] for k in np.flatnonzero(is_best))
        table.add_cell(row, col, cell_w, cell_h, text=arrows,
                       loc='center', facecolor='white')

    # 1-based labels: left-hand column for rows, half-height top row for
    # columns.
    for label in range(1, len(optimal_values) + 1):
        k = label - 1
        table.add_cell(k, -1, cell_w, cell_h, text=label, loc='right',
                       edgecolor='none', facecolor='none')
        table.add_cell(-1, k, cell_w, cell_h / 2, text=label, loc='center',
                       edgecolor='none', facecolor='none')

    axes.add_table(table)


def figure_3_2_linear_system():
    """Solve for the state values under equal action weights; save ``a.png``.

    One linear equation is built per cell.  Starting from ``-I``, each
    action adds ``actionProb * discount`` to the coefficient of the state
    it leads to and subtracts ``actionProb * reward`` from the right-hand
    side, i.e. the system encodes
    ``v(s) = sum_a actionProb * (r + discount * v(s'))``.
    The solution is rounded to two decimals, drawn with ``draw_image`` and
    written to ``a.png``; the figure is then closed.
    """
    n_states = size * size
    grid_shape = (size, size)

    coeffs = -np.eye(n_states)
    rhs = np.zeros(n_states)

    for row in range(size):
        for col in range(size):
            cell = [row, col]
            src = np.ravel_multi_index(cell, grid_shape)
            for move in actions:
                successor, reward = actionValue(cell, move)
                dst = np.ravel_multi_index(successor, grid_shape)

                coeffs[src, dst] += actionProb * discount
                rhs[src] -= actionProb * reward

    state_values = np.linalg.solve(coeffs, rhs).reshape(size, size)
    draw_image(np.round(state_values, decimals=2))
    plt.savefig('a.png')
    plt.close()

def figure_3_5():
    """Run value iteration to convergence; save ``b.png`` and ``c.png``.

    Starting from all zeros, every sweep replaces each cell's value with
    the maximum over actions of ``reward + discount * value[next_state]``,
    always reading from the previous sweep's array.  Iteration stops once
    the summed absolute change over the grid falls below 1e-4.  The final
    values (rounded to two decimals) are drawn to ``b.png`` and the
    corresponding arrows from ``draw_policy`` to ``c.png``.
    """
    value = np.zeros((size, size))
    converged = False

    while not converged:
        updated = np.zeros_like(value)
        for i, j in np.ndindex(size, size):
            # One-step lookahead for each action using the previous sweep.
            lookahead = []
            for move in actions:
                (next_i, next_j), reward = actionValue([i, j], move)
                lookahead.append(reward + discount * value[next_i, next_j])
            updated[i, j] = max(lookahead)

        converged = np.abs(updated - value).sum() < 1e-4
        value = updated

    # `value` now holds the sweep that satisfied the stopping test.
    draw_image(np.round(value, decimals=2))
    plt.savefig('b.png')
    plt.close()

    draw_policy(value)
    plt.savefig('c.png')
    plt.close()



# Generate all outputs when the file is executed: the first call saves
# a.png, the second saves b.png and c.png (paths are relative to the current
# working directory).
figure_3_2_linear_system()
figure_3_5()