In [4]:
from grid_world import GridWorld, Action
import numpy as np
import random

In [5]:
# Experiment configuration shared by the cells below.
gamma = 0.9  # discount factor applied when accumulating returns
rows = 5  # grid height; rows * cols is used as the number of states
cols = 5  # grid width

# Seed both random generators this notebook draws from (np.random for the initial
# policy, random for the start state/action) so that Restart & Run All reproduces
# the same run.
# NOTE(review): trajectory sampling inside GridWorld presumably uses one of these
# generators as well — confirm.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)


In [6]:
# Build the 5x5 grid world. In the map description '#' cells are rendered as
# forbidden and 'T' as the terminal cell (see render_grid output).
env = GridWorld(
    desc=[".....", ".##..", "..#..", ".#T#.", ".#..."],
    forbidden_score=-5,
    terminal_score=1,
)
print("init grid:")
env.render_grid()

# Zero-initialised value tables: one entry per state, and one per (state, action).
state_values = np.zeros(rows * cols)
q_table = np.zeros((rows * cols, len(Action)))

print("random policy:")
# One-hot policy with a uniformly random action per state. The one-hot width is the
# number of actions, not the number of grid columns.
num_actions = len(Action)
random_actions = np.random.randint(0, num_actions, size=rows * cols)
policy = np.eye(num_actions)[random_actions]
env.render_policy(policy)


init grid:
⬜️⬜️⬜️⬜️⬜️
⬜️🚫🚫⬜️⬜️
⬜️⬜️🚫⬜️⬜️
⬜️🚫✅🚫⬜️
⬜️🚫⬜️⬜️⬜️
random policy:
🔄⬇️➡️⬅️⬇️
⬅️⏫️⏩️⬇️⬅️
⬇️➡️⏫️🔄⬇️
➡️⏫️✅⏪➡️
⬅️⏫️➡️➡️🔄


In [10]:
# Monte Carlo control with an epsilon-greedy behaviour policy: each episode samples
# one long trajectory, estimates q(s, a) as the average discounted return over every
# visit of (s, a) in that trajectory, then makes `policy` greedy w.r.t. the estimates.
traj_steps = 20000
epsilon = 0.1
epsilon_decay = 0.001  # subtracted from epsilon at the start of every episode
epsilon_min = 0.001  # exploration probability never drops below this
q_table = np.zeros((rows * cols, len(Action)))
num_episodes = 20

# Assumes the environment's action count equals len(Action) and the one-hot width
# of `policy` — TODO confirm against GridWorld.
num_actions = env.get_action_space_size()

for episode in range(num_episodes):
    # Clamp after decaying so epsilon can never undershoot the floor.
    epsilon = max(epsilon - epsilon_decay, epsilon_min)

    # Epsilon-greedy probabilities: p1 for the greedy action, p0 for each other one.
    p1 = 1 - epsilon * (num_actions - 1) / num_actions
    p0 = epsilon / num_actions

    print(f"traj_steps: {traj_steps}")
    print(f"epsilon:{epsilon}, p1:{p1}, p0:{p0}")

    # `policy` is one-hot, so the 1-entries become p1 and the 0-entries become p0.
    policy_epsilon = np.where(policy == 1, p1, p0)

    # Random start state and start action for this episode.
    start_state = random.randint(0, env.get_state_space_size() - 1)
    start_action = random.randint(0, num_actions - 1)

    traj = env.get_traj(start_state, start_action, policy_epsilon, steps=traj_steps)

    # Per-episode accumulators: sum of returns and number of visits per (s, a).
    return_sums = np.zeros_like(q_table)
    visit_counts = np.zeros(q_table.shape, dtype=int)

    # Walk the trajectory backwards so the discounted return is built incrementally.
    # Slicing (instead of indexing by traj_steps) tolerates a shorter trajectory.
    discounted_return = 0.0
    for step_state, step_action, step_reward, *_ in reversed(traj[:traj_steps]):
        discounted_return = discounted_return * gamma + step_reward
        return_sums[step_state, step_action] += discounted_return
        visit_counts[step_state, step_action] += 1

    # Only pairs visited this episode are refreshed; the rest keep their old estimate.
    visited = visit_counts > 0
    q_table[visited] = return_sums[visited] / visit_counts[visited]

    # Per-state visit totals for this episode.
    cnt = visit_counts.sum(axis=1).tolist()

    # State values under the behaviour policy used for this episode:
    # v(s) = sum_a pi(a | s) * q(s, a).
    value_states = np.sum(policy_epsilon * q_table, axis=1)
    print(value_states.reshape(rows, cols))

    # Policy improvement: one-hot on the highest-valued action of each state
    # (np.argmax resolves ties in favour of the lowest action index).
    policy = np.eye(num_actions)[np.argmax(q_table, axis=1)]
    env.render_policy(policy)


traj_steps: 20000
epsilon:0.099, p1:0.9208, p0:0.0198
[[ 0.         0.         0.         0.         0.       ]
 [ 0.         0.         0.         0.         0.       ]
 [ 0.         0.         0.         0.         0.       ]
 [ 0.        -0.9895545  0.         0.         0.       ]
 [ 0.         0.         0.         0.         0.       ]]
⬆️⬆️⬆️⬆️⬆️
⬆️⏫️⏫️⬆️⬆️
⬆️⬆️⏫️⬆️⬆️
⬆️⏫️✅⏫️⬆️
⬆️⏫️⬆️⬆️⬆️
traj_steps: 20000
epsilon:0.098, p1:0.9216, p0:0.0196
[[ 0.        0.       -0.979559  0.        0.      ]
 [ 0.        0.        0.        0.        0.      ]
 [ 0.        0.        0.        0.        0.      ]
 [ 0.       -0.979559  0.        0.        0.      ]
 [ 0.        0.        0.        0.        0.      ]]
⬆️⬆️⬆️⬆️⬆️
⬆️⏫️⏫️⬆️⬆️
⬆️⬆️⏫️⬆️⬆️
⬆️⏫️✅⏫️⬆️
⬆️⏫️⬆️⬆️⬆️
traj_steps: 20000
epsilon:0.097, p1:0.9224, p0:0.0194
[[ 0.         0.        -0.9695635  0.         0.       ]
 [ 0.         0.         0.         0.         0.       ]
 [ 0.         0.         0.         0.         0.       ]