In [1]:
import numpy as np

# Define the maze environment
maze = np.array([
    [0, 0, 0, 0, 0],
    [0, -1, -1, -1, 0],
    [0, 0, 0, 0, 0],
    [0, -1, -1, -1, 0],
    [0, 0, 0, 0, 0],
])

start_state = (0, 0)
goal_state = (4, 4)

In [2]:
# Define hyperparameters
epsilon = 0.1
learning_rate = 0.5
discount_factor = 0.9
max_episodes = 1000
max_steps = 100

In [3]:
# Initialize the Q-table
num_rows, num_cols = maze.shape
# 4 possible actions: up, down, left, right
num_actions = 4
Q = np.zeros((num_rows, num_cols, num_actions))

In [4]:
# Q-learning algorithm
for episode in range(max_episodes):
    state = start_state
    total_reward = 0
    for step in range(max_steps):
        # Choose an action using epsilon-greedy policy
        if np.random.uniform(0, 1) < epsilon:
            action = np.random.randint(num_actions)
        else:
            action = np.argmax(Q[state])

        # Take the chosen action and observe the next state and reward
        if action == 0:  # up
            next_state = (state[0] - 1, state[1])
        elif action == 1:  # down
            next_state = (state[0] + 1, state[1])
        elif action == 2:  # left
            next_state = (state[0], state[1] - 1)
        elif action == 3:  # right
            next_state = (state[0], state[1] + 1)

        # Check if the next state is valid
        if 0 <= next_state[0] < num_rows and 0 <= next_state[1] < num_cols:
            reward = maze[next_state]
        else:
            reward = -5  # penalize for hitting walls or going out of bounds

        # Update the Q-value for the previous state-action pair
        Q[state][action] = Q[state][action] + learning_rate * (
            reward + discount_factor * np.max(Q[next_state]) - Q[state][action]
        )

        # Update the current state
        state = next_state
        total_reward += reward

        # Check if the agent reached the goal state
        if state == goal_state:
            break

    print(f"Episode: {episode + 1}, Steps: {step + 1}, Total Reward: {total_reward}")

IndexError: index -6 is out of bounds for axis 0 with size 5

In [5]:
# Testing the learned policy
state = start_state
steps = 0
success_count = 0
while state != goal_state and steps < max_steps:
    action = np.argmax(Q[state])
    if action == 0:  # up
        next_state = (state[0] - 1, state[1])
    elif action == 1:  # down
        next_state = (state[0] + 1, state[1])
    elif action == 2:  # left
        next_state = (state[0], state[1] - 1)
    elif action == 3:  # right
        next_state = (state[0], state[1] + 1)

    if 0 <= next_state[0] < num_rows and 0 <= next_state[1] < num_cols:
        state = next_state
        steps += 1
        if state == goal_state:
            success_count += 1
    else:
        break

print(f"Testing Results - Steps: {steps}, Success Rate: {success_count / max_steps}")


Testing Results - Steps: 0, Success Rate: 0.0
