In [17]:
import numpy as np

# Maze and Q-table setup
maze = np.array([
    [1, 1, 1, 1, 0],
    [1, 0, 1, 1, 1],
    [1, 1, 0, 0, 1],
    [1, 1, 1, 0, 1],
    [0, 1, 1, 1, 1]
])
q_table = np.random.rand(5, 5, 4)

# Hyperparameters
alpha = 0.1  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.2  # exploration factor

moves = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # Directions: up, down, left, right

# Training
for _ in range(10000):
    state = (np.random.randint(5), np.random.randint(5))
    if maze[state] == 0: continue  # Skip if starting in a wall

    while state != (4, 4):
        # Action selection
        if np.random.rand() < epsilon:
            action = np.random.randint(4)  # Random action (explore)
        else:
            action = np.argmax(q_table[state])  # Best action (exploit)

        # Determine the next state based on the action
        if action == 0:  # Up
            next_state = (state[0] - 1, state[1])
        elif action == 1:  # Down
            next_state = (state[0] + 1, state[1])
        elif action == 2:  # Left
            next_state = (state[0], state[1] - 1)
        elif action == 3:  # Right
            next_state = (state[0], state[1] + 1)

                # Check if the next state is valid (within bounds and not a wall)
        if 0 <= next_state[0] < 5 and 0 <= next_state[1] < 5 and maze[next_state] == 1:
            reward = 1 if next_state == (4, 4) else 0  # Reward is 1 if goal, else 0
            # Update Q-value using the Q-learning formula
            q_table[state][action] += alpha * (reward + gamma * np.max(q_table[next_state]) - q_table[state][action])
            state = next_state  # Move to the next state
        else:
            # Penalty for invalid move (out of bounds or into a wall)
            q_table[state][action] -= alpha


# Display max Q-values and actions
for i in range(5):
    for j in range(5):
        print(f"State ({i}, {j}): Max Q-value = {np.max(q_table[i, j]):.2f}, Action = {np.argmax(q_table[i, j])}")


State (0, 0): Max Q-value = 0.76, Action = 3
State (0, 1): Max Q-value = 0.84, Action = 3
State (0, 2): Max Q-value = 0.93, Action = 1
State (0, 3): Max Q-value = 1.04, Action = 1
State (0, 4): Max Q-value = 0.64, Action = 1
State (1, 0): Max Q-value = 0.84, Action = 1
State (1, 1): Max Q-value = 0.75, Action = 3
State (1, 2): Max Q-value = 1.04, Action = 3
State (1, 3): Max Q-value = 1.15, Action = 3
State (1, 4): Max Q-value = 1.28, Action = 1
State (2, 0): Max Q-value = 0.93, Action = 1
State (2, 1): Max Q-value = 1.04, Action = 1
State (2, 2): Max Q-value = 0.49, Action = 1
State (2, 3): Max Q-value = 0.91, Action = 2
State (2, 4): Max Q-value = 1.42, Action = 1
State (3, 0): Max Q-value = 1.04, Action = 3
State (3, 1): Max Q-value = 1.15, Action = 3
State (3, 2): Max Q-value = 1.28, Action = 1
State (3, 3): Max Q-value = 0.90, Action = 3
State (3, 4): Max Q-value = 1.58, Action = 1
State (4, 0): Max Q-value = 0.89, Action = 3
State (4, 1): Max Q-value = 1.28, Action = 3
State (4, 