In [1]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math 
from tqdm import trange
from matplotlib.table import Table

In [2]:
class GridWorld:
    def __init__(self,grid_size = 5,a_xy = [0,1], a_prime_xy = [4, 1], b_xy = [0, 3], 
                 b_prime_xy = [2, 3], gamma = 0.9, a_reward = 10, b_reward = 5, penalty = -1.0):
        self.grid_size = grid_size
        self.A_xy  = a_xy
        self.A_prime_xy = a_prime_xy
        self.A_reward = a_reward
        self.B_xy = b_xy
        self.B_prime_xy = b_prime_xy
        self.B_reward = b_reward
        self.discount = gamma
        self.actions = [np.array([0, -1]),
                   np.array([-1, 0]),
                   np.array([0, 1]),
                   np.array([1, 0])]
        self.action_prob = 1/len(self.actions)
        print('action prob : ',self.action_prob)
        self.penalty_reward = penalty
        self.num_states = self.grid_size*self.grid_size
        
    
    def step(self, state, action):
        if state == self.A_xy:
            return self.A_prime_xy, self.A_reward
        if state == self.B_xy:
            return self.B_prime_xy, self.B_reward
        next_state = (np.array(state) + action).tolist()
        x, y = next_state
        if x < 0 or x >= self.grid_size or y < 0 or y >= self.grid_size:
            reward = self.penalty_reward
            next_state = state
        else:
            reward = 0
        return next_state, reward
    
    
    def solveBellManLinear(self):
        A = np.zeros((self.num_states,self.num_states))
        B = np.zeros(self.num_states)
        for i in range(self.grid_size):
            for j in range(self.grid_size):
                r = i * self.grid_size + j
                
                #print(r)
                tot_reward = 0
                for action in self.actions:
                    (next_i, next_j), reward = self.step([i, j], action)
                    if r == 1:
                        print(f'i:{i},j:{j}')
                        print(action)
                        print(f'next_i:{next_i},next_j:{next_j}')
                    c = next_i * self.grid_size + next_j
                    A[r,c] += self.discount * self.action_prob
                    tot_reward += self.action_prob * reward
                B_idx = i * self.grid_size + j
                if tot_reward != 0.0:
                    tot_reward *= -1
                B[B_idx] = tot_reward
                                
        print(A)
        print('*'*100)
        print(B)
        print('*'*100)
        print(np.linalg.det(A))
        #values = np.linalg.solve(A, B)
        #print(values.reshape(self.grid_size,self.grid_size))
        #print('*'*100)

In [3]:
def plotGrid(value,title):
    value = np.round(value, decimals=1)
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = value.shape
    width, height = 1.0 / ncols, 1.0 / nrows
    for (i, j), val in np.ndenumerate(value):
        tb.add_cell(i, j, width, height, text=val,
                    loc='center')
    ax.add_table(tb)
    plt.title(title)
    plt.show()
    

In [8]:
grid = GridWorld()
grid.solveBellManLinear()

action prob :  0.25
i:0,j:1
[ 0 -1]
next_i:4,next_j:1
i:0,j:1
[-1  0]
next_i:4,next_j:1
i:0,j:1
[0 1]
next_i:4,next_j:1
i:0,j:1
[1 0]
next_i:4,next_j:1
[[0.45  0.225 0.    0.    0.    0.225 0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.   ]
 [0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.9   0.    0.
  0.   ]
 [0.    0.225 0.225 0.225 0.    0.    0.    0.225 0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.   ]
 [0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.9   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.   ]
 [0.    0.    0.    0.225 0.45  0.    0.    0.    0.    0.225 0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.   ]
 [0.225 0.    0.    0.    0.    0.225 0.225 0.    0.    0.    0.225 0.
  0.    0.    0.    0.

In [5]:
def figure3_2():
    grid = GridWorld()
    value = grid.solveBellManLinear()
    title = 'v(s) with random policy'
    plotGrid(value,title)

In [6]:
def figure3_5():
    grid = GridWorld()
    value = grid.play(random_flag = False)
    title = r'$v_*(s)$' + ' with optimal policy'
    plotGrid(value,title)

In [7]:
figure3_2()
figure3_5()

action prob :  0.25
i:0,j:1
[ 0 -1]
next_i:4,next_j:1
i:0,j:1
[-1  0]
next_i:4,next_j:1
i:0,j:1
[0 1]
next_i:4,next_j:1
i:0,j:1
[1 0]
next_i:4,next_j:1
[[0.45  0.225 0.    0.    0.    0.225 0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.   ]
 [0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.9   0.    0.
  0.   ]
 [0.    0.225 0.225 0.225 0.    0.    0.    0.225 0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.   ]
 [0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.9   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.   ]
 [0.    0.    0.    0.225 0.45  0.    0.    0.    0.    0.225 0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.   ]
 [0.225 0.    0.    0.    0.    0.225 0.225 0.    0.    0.    0.225 0.
  0.    0.    0.    0.

TypeError: unsupported operand type(s) for *: 'NoneType' and 'float'