In [1]:
import numpy as np



# Define the 5x5 maze environment
maze = np.array([
    [1, 1, 1, 1, 0],
    [1, 0, 1, 1, 1],
    [1, 0, 0, 0, 1],
    [1, 1, 1, 0, 1],
    [0, 1, 1, 1, 1]
])




# Initialize the Q-table for a 5x5 grid with 4 actions
q_table = np.random.rand(5, 5, 4)  # 5x5 grid and 4 actions (up, down, left, right)




# Function to choose the action based on epsilon-greedy strategy
def choose_action(state, epsilon):
    if np.random.uniform(0, 1) < epsilon:
        action = np.random.randint(4)  # Explore: choose random action
    else:
        action = np.argmax(q_table[state[0], state[1]])  # Exploit: choose best action based on Q-values
    return action





# Function to update the Q-table
def update_q_table(state, action, reward, next_state, alpha, gamma):
    old_value = q_table[state[0], state[1], action]
    next_max = np.max(q_table[next_state[0], next_state[1]])  # Max Q-value for next state
    new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
    q_table[state[0], state[1], action] = new_value





# Hyperparameters
epsilon = 0.1
alpha = 0.5
gamma = 0.9
state = (0, 0)





# Training the Q-learning agent
for i in range(10000):
    action = choose_action(state, epsilon)
    
    # Determine next state based on the chosen action
    if action == 0 and state[0] > 0 and maze[state[0]-1, state[1]] == 1:  # Up
        next_state = (state[0]-1, state[1])
    elif action == 1 and state[0] < 4 and maze[state[0]+1, state[1]] == 1:  # Down
        next_state = (state[0]+1, state[1])
    elif action == 2 and state[1] > 0 and maze[state[0], state[1]-1] == 1:  # Left
        next_state = (state[0], state[1]-1)
    elif action == 3 and state[1] < 4 and maze[state[0], state[1]+1] == 1:  # Right
        next_state = (state[0], state[1]+1)
    else:
        next_state = state  # Stay in the same state if action is invalid

    # Reward for reaching the goal
    if next_state == (4, 4):  # Assuming goal is bottom-right corner
        reward = 1
    else:
        reward = 0

    # Update Q-table with the current state, action, reward, and next state
    update_q_table(state, action, reward, next_state, alpha, gamma)
    state = next_state  # Move to the next state





# Assuming q_table is already defined as a (5, 5, 4) matrix
max_q_values = np.max(q_table, axis=2)  # Get max Q-values
actions = np.argmax(q_table, axis=2)    # Get corresponding actions




# Print only the max Q-values and actions
for i in range(5):
    for j in range(5):
        print(f"State ({i}, {j}): Max Q-value = {max_q_values[i, j]:.2f}, Action = {actions[i, j]}")


State (0, 0): Max Q-value = 0.39, Action = 1
State (0, 1): Max Q-value = 0.36, Action = 3
State (0, 2): Max Q-value = 0.36, Action = 1
State (0, 3): Max Q-value = 0.41, Action = 1
State (0, 4): Max Q-value = 0.70, Action = 2
State (1, 0): Max Q-value = 0.51, Action = 1
State (1, 1): Max Q-value = 0.75, Action = 3
State (1, 2): Max Q-value = 0.36, Action = 2
State (1, 3): Max Q-value = 0.49, Action = 2
State (1, 4): Max Q-value = 0.37, Action = 0
State (2, 0): Max Q-value = 0.54, Action = 1
State (2, 1): Max Q-value = 0.68, Action = 0
State (2, 2): Max Q-value = 0.78, Action = 3
State (2, 3): Max Q-value = 0.97, Action = 3
State (2, 4): Max Q-value = 8.93, Action = 1
State (3, 0): Max Q-value = 0.41, Action = 1
State (3, 1): Max Q-value = 0.42, Action = 0
State (3, 2): Max Q-value = 2.20, Action = 1
State (3, 3): Max Q-value = 0.64, Action = 1
State (3, 4): Max Q-value = 10.00, Action = 1
State (4, 0): Max Q-value = 0.95, Action = 0
State (4, 1): Max Q-value = 0.55, Action = 3
State (4,