In [1]:
from grid_world import GridWorld, Action
import numpy as np

In [2]:
gamma = 0.9
rows = 5
cols = 5

In [16]:
env = GridWorld(
    desc=[".....", ".##..", "..#..", ".#T#.", ".#..."],
    forbidden_score=-10,
    terminal_score=1,
)
print("init grid:")
env.render_grid()
print("init policy:")
state_values = np.zeros(rows * cols)
q_table = np.zeros((rows * cols, len(Action)))
policy = np.argmax(q_table, axis=1)
env.render_policy(policy)


init grid:
⬜️⬜️⬜️⬜️⬜️
⬜️🚫🚫⬜️⬜️
⬜️⬜️🚫⬜️⬜️
⬜️🚫✅🚫⬜️
⬜️🚫⬜️⬜️⬜️
init policy:
⬆️⬆️⬆️⬆️⬆️
⬆️⏫️⏫️⬆️⬆️
⬆️⬆️⏫️⬆️⬆️
⬆️⏫️✅⏫️⬆️
⬆️⏫️⬆️⬆️⬆️


## random policy

In [17]:
policy = np.random.choice(len(Action), size=rows * cols)
env.render_policy(policy)

⬇️⬆️⬆️🔄⬅️
⬆️⏩️⏬🔄⬆️
⬇️⬇️⏬➡️⬇️
⬅️⏫️✅⏬⬆️
⬇️🔄⬇️⬇️⬅️


## Policy Iteration

In [22]:
max_iters = 1000
theta = 1e-3
state_values = np.zeros(env.get_state_space_size())
old_state_values = state_values + 1

for iter in range(max_iters):
    delta = np.sum((old_state_values - state_values) ** 2)
    print(f"iter {iter} Euclidean Distance:{delta}")

    old_state_values = state_values.copy()
    for state in range(env.get_state_space_size()):
        
        # kind 1
        for action in range(env.get_action_space_size()):
            reward, next_state = env.get_reward(state, action)
            q_table[state][action] = reward + gamma * state_values[next_state]

        # kind 2
        # now_action = policy[state]
        # reward, next_state = env.get_reward(state, now_action)
        # state_values[state] = reward + gamma * old_state_values[next_state]

    # state_values = np.max(q_table, axis=1)
    # policy = np.argmax(q_table, axis=1)
    print(f"state_values:\n{state_values}")
    env.render_policy(policy)

    delta = np.sum((old_state_values - state_values) ** 2)
    # print(f"after iter {iter} Euclidean Distance:{delta}")
    if delta < theta:
        break

print("\nenv:")
env.render_grid()
print(f"final state_values:\n{state_values}")
print(f"final policy:\n{policy}")
env.render_policy(policy)

iter 0 Euclidean Distance:25.0
state_values:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
➡️➡️➡️➡️⬇️
⬆️⏫️⏩️➡️⬇️
⬆️⬅️⏬➡️⬇️
⬆️⏩️✅⏪⬇️
⬆️⏩️⬆️⬅️⬅️

env:
⬜️⬜️⬜️⬜️⬜️
⬜️🚫🚫⬜️⬜️
⬜️⬜️🚫⬜️⬜️
⬜️🚫✅🚫⬜️
⬜️🚫⬜️⬜️⬜️
final state_values:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
final policy:
[1 1 1 1 2 0 0 1 1 2 0 3 2 1 2 0 1 4 3 2 0 1 0 3 3]
➡️➡️➡️➡️⬇️
⬆️⏫️⏩️➡️⬇️
⬆️⬅️⏬➡️⬇️
⬆️⏩️✅⏪⬇️
⬆️⏩️⬆️⬅️⬅️
