In [1]:
import numpy as np
from wumpusworld.simplified_wumpus_world import SimplifiedWumpusWorld

In [2]:
# Create the environment instance that the later cells drive through
# env.reset(), env.step(action) and env.action_space.
env = SimplifiedWumpusWorld()

In [3]:
# Size the Q-table from the environment: one row per state, one column per action.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Every state/action value starts at zero; the training cell updates it in place.
q_table = np.zeros(shape=(state_space_size, action_space_size))

In [4]:
# --- Training length -------------------------------------------------------
num_episodes = 10_000          # number of episodes run by the training loop
max_steps_per_episode = 100    # hard cap on steps inside a single episode

# --- Q-learning update -----------------------------------------------------
learning_rate = 0.1    # weight given to the new estimate in each Q-table update
discount_rate = 0.99   # multiplier applied to the best next-state value

# --- Exploration schedule --------------------------------------------------
# The training loop decays exploration_rate exponentially (per episode) from
# max_exploration_rate towards min_exploration_rate.
max_exploration_rate = 1.0
min_exploration_rate = 0.1
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate  # initial value: always explore

In [5]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Fills `q_table` in place and collects the total reward of every episode in
# `rewards_all_episodes`. Every 500th episode the chosen actions are printed
# as a compact digit string so progress can be eyeballed.
rewards_all_episodes = []

for episode in range(num_episodes):
    state = env.reset()
    rewards_current_episode = 0

    # Only every 500th episode (counting from 1) is traced.
    record = (episode + 1) % 500 == 0
    actions_taken = []

    for step in range(max_steps_per_episode):
        # One uniform draw per step decides between exploring and exploiting.
        exploration_rate_threshold = np.random.uniform(0, 1)
        if exploration_rate_threshold <= exploration_rate:
            action = env.action_space.sample()       # explore: random action
        else:
            action = np.argmax(q_table[state, :])    # exploit: best known action

        if record:
            actions_taken.append(f"{action}")

        new_state, reward, done = env.step(action)

        # Blend the old estimate with the one-step target
        # (reward + discounted best value of the next state).
        best_next_value = np.max(q_table[new_state, :])
        td_target = reward + discount_rate * best_next_value
        q_table[state, action] = (
            (1 - learning_rate) * q_table[state, action]
            + learning_rate * td_target
        )

        rewards_current_episode += reward
        state = new_state

        if done:
            break

    record_string = "".join(actions_taken)

    # Exponential decay of the exploration rate, indexed by episode number.
    exploration_span = max_exploration_rate - min_exploration_rate
    exploration_rate = min_exploration_rate + exploration_span * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

    if record:
        print(f"{episode: <4}: {record_string}")

499 : 30003
999 : 1323003331
1499: 300330
1999: 0
2499: 301003330
2999: 300330
3499: 300330
3999: 23230001330
4499: 300330
4999: 0
5499: 3003330
5999: 300330
6499: 300330
6999: 300330
7499: 302
7999: 300330
8499: 300330
8999: 300330
9499: 300330
9999: 3230001330


In [6]:
# Summarise training: average reward per bucket of consecutive episodes,
# followed by the learned Q-table.
#
# Buckets are cut by integer slicing over the rewards that were actually
# recorded, so the cell neither passes a float section count to NumPy nor
# fails when the number of episodes is not a multiple of the bucket size
# (a shorter final bucket is averaged over its own length).
episodes_per_bucket = 1000

rewards_array = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_bucket]
    for start in range(0, len(rewards_array), episodes_per_bucket)
]

count = episodes_per_bucket
print("~~~~~~~Average Rewards Per Thousand Episodes~~~~~~")
for r in rewards_per_thousand_episodes:
    # Label each row with the number of episodes completed at the end of the
    # bucket; the last label is capped for a partial bucket.
    print(f"{min(count, len(rewards_array)): <5}: {r.mean()}")
    count += episodes_per_bucket

print()
print("~~~~~~~~~~~~~~~~~~~~~~Q-Table~~~~~~~~~~~~~~~~~~~~~~")
print(q_table)

~~~~~~~Average Rewards Per Thousand Episodes~~~~~~
1000 : -706.26
2000 : 64.75300000000001
3000 : 546.6800000000001
4000 : 638.825
5000 : 704.841
6000 : 722.848
7000 : 706.8839999999999
8000 : 700.804
9000 : 732.8259999999999
10000: 742.8449999999999

~~~~~~~~~~~~~~~~~~~~~~Q-Table~~~~~~~~~~~~~~~~~~~~~~
[[-1000.           935.62816434   935.62816434   946.08905489]
 [  956.655611     946.08905489   935.62816434 -1000.        ]
 [    0.             0.             0.             0.        ]
 [    0.             0.             0.             0.        ]
 [    0.             0.             0.             0.        ]
 [  967.3289       946.08905489 -1000.           967.3289    ]
 [  978.11        -998.38269073   949.79683967  -998.38269073]
 [    0.             0.             0.             0.        ]
 [ -911.3706188   -935.38918111   889.95920594   967.3289    ]
 [  956.655611     956.655611     956.655611     978.11      ]
 [-1000.           967.3289       967.3289       989.        ]
 [ 

In [13]:
from PIL import Image

In [15]:
# Source images for the animation, resolved relative to the working directory.
WORLD_IMAGE_PATH = "World.png"
AGENT_IMAGE_PATH = "rebort-scaled.png"

# `canvas` is copied for every animation frame; `agent` is pasted onto each
# copy and is also passed as the paste mask.
canvas = Image.open(WORLD_IMAGE_PATH)
agent = Image.open(AGENT_IMAGE_PATH)

In [56]:
# Lookup table from grid cell (x, y) to the pixel offset at which the agent
# sprite is pasted. The constants below are presumably the cell size and the
# border width of World.png in pixels -- TODO confirm against the image.
GRID_CELLS = 4
CELL_PIXELS = 500
BORDER_PIXELS = 20
Y_BASELINE = 1600  # y offsets are measured downwards from this value


def _paste_offset(col, row):
    """Return the (x, y) pixel offset for grid cell (col, row)."""
    x_offset = BORDER_PIXELS * (col + 1) + CELL_PIXELS * col
    y_offset = Y_BASELINE - (BORDER_PIXELS * (row + 1) + CELL_PIXELS * row)
    return x_offset, y_offset


# Larger row indices map to smaller pixel y values.
locations = {
    (col, row): _paste_offset(col, row)
    for row in range(GRID_CELLS)
    for col in range(GRID_CELLS)
}

In [62]:
# Replay one episode with the greedy policy from `q_table`, render one frame
# per step, and save the frames as an animated GIF.
#
# The agent's grid position is tracked here independently of the environment:
# it starts at (0, 0) and is moved according to the action ids below.
# NOTE(review): this assumes the environment uses the same start cell and
# action meaning -- confirm against SimplifiedWumpusWorld.


def _render_frame(cell):
    """Return a copy of the background with the agent sprite pasted at `cell`."""
    frame = canvas.copy()
    frame.paste(agent, locations[cell], agent)
    return frame


# (dx, dy) applied to the tracked position for each action id. Any action id
# not listed moves one cell in +x, as the final else-branch did.
step_deltas = {0: (0, 1), 1: (0, -1), 2: (-1, 0)}
default_delta = (1, 0)

images = []

agentX = 0
agentY = 0

b = _render_frame((agentX, agentY))
images.append(b)

state = env.reset()

rewards_current_episode = 0

for step in range(max_steps_per_episode):
    # Pure exploitation: always take the highest-valued action.
    action = np.argmax(q_table[state, :])

    # Move the tracked position, clamped to the 4x4 grid.
    dx, dy = step_deltas.get(int(action), default_delta)
    agentX = min(3, max(0, agentX + dx))
    agentY = min(3, max(0, agentY + dy))

    b = _render_frame((agentX, agentY))
    images.append(b)

    new_state, reward, done = env.step(action)

    rewards_current_episode += reward
    state = new_state

    if done:
        break

print(f"Reward: {rewards_current_episode}")

# First frame is the save target; the remaining frames are appended to it.
first_frame, later_frames = images[0], images[1:]
first_frame.save('images/anitest.gif',
                 save_all=True,
                 append_images=later_frames,
                 duration=500,
                 loop=0)

Reward: 995


<img src="images/anitest.gif" width="50%">