In [1]:
import numpy as np
import random

# Maze environment (5x5 grid). Each string is one row and each character one
# cell: 'S' marks the start, 'G' the goal, 'X' a wall, ' ' a free cell.
maze = np.array([
    list(row)
    for row in (
        "S   X",
        "XX X ",
        "   X ",
        " X   ",
        " X XG",
    )
])
start = (0, 0)
goal = (4, 4)

# Action name -> (row delta, column delta). Insertion order fixes which
# Q-table column belongs to which action.
actions = {
    'up': (-1, 0),
    'down': (1, 0),
    'left': (0, -1),
    'right': (0, 1),
}

# One Q-value per (row, column, action), initialised to zero.
q_table = np.zeros(maze.shape + (len(actions),))

# Q-learning parameters
alpha = 0.1      # step size applied to each Q-value correction
gamma = 0.9      # discount applied to the best next-state Q-value
epsilon = 1.0    # initial probability of picking a random action
episodes = 500   # number of training episodes
max_steps = 50   # maximum number of steps per episode

# Reward function
def get_reward(state):
    """Return the reward for occupying ``state``.

    ``state`` is a ``(row, col)`` tuple used to index the module-level
    ``maze``. The goal cell yields 10, a wall cell (``'X'``) yields -10, and
    any other cell yields -1.
    """
    if state == goal:
        return 10
    if maze[state] == 'X':
        return -10
    return -1

# Q-learning algorithm
# Action names in Q-table column order. Built once: the ordering never changes
# during training, so there is no need to rebuild the list and search it on
# every step.
action_names = list(actions)
n_rows, n_cols = maze.shape

for ep in range(episodes):
    state = start
    for _ in range(max_steps):
        # Select action (epsilon-greedy): explore with probability epsilon,
        # otherwise take the action with the highest current Q-value.
        if random.uniform(0, 1) < epsilon:
            action_idx = random.randrange(len(action_names))
        else:
            action_idx = int(np.argmax(q_table[state]))
        action = action_names[action_idx]

        # Take action and observe next state. Bounds are checked before the
        # maze lookup so an off-grid candidate is never used as an index.
        d_row, d_col = actions[action]
        candidate = (state[0] + d_row, state[1] + d_col)
        on_grid = 0 <= candidate[0] < n_rows and 0 <= candidate[1] < n_cols
        if on_grid and maze[candidate] != 'X':
            next_state = candidate
        else:
            next_state = state  # Invalid move, stay in place

        # NOTE(review): the reward is computed for the cell the agent ends up
        # in, which is never a wall, so a blocked move costs the ordinary -1
        # step penalty and get_reward's -10 wall branch is never reached from
        # this loop. Confirm whether blocked moves were meant to cost more.
        reward = get_reward(next_state)
        best_next_q = np.max(q_table[next_state])

        # Update Q-table: move Q(state, action) towards the one-step target
        # reward + gamma * max_a' Q(next_state, a').
        q_index = state + (action_idx,)
        q_table[q_index] += alpha * (reward + gamma * best_next_q - q_table[q_index])

        state = next_state
        if state == goal:
            break

    # Decay epsilon after every episode, never dropping below 0.1.
    epsilon = max(0.1, epsilon * 0.99)

# Display learned path
def print_path():
    """Print the maze with the greedy path from ``start`` marked with 'P'.

    Starting at ``start``, repeatedly follows the action with the highest
    Q-value in ``q_table`` and marks each cell it leaves with 'P' on a copy of
    ``maze``. The walk stops when it reaches ``goal``, when the greedy action
    would leave the grid or enter a wall, or when it returns to a cell it has
    already visited. The goal cell is always shown as 'G'. Returns ``None``;
    the result is written to stdout.
    """
    path_maze = maze.copy()
    action_names = list(actions)
    n_rows, n_cols = maze.shape
    state = start
    # The greedy policy is deterministic, so revisiting a cell means the walk
    # would repeat forever; tracking visited cells guarantees termination.
    visited = set()
    while state != goal and state not in visited:
        visited.add(state)
        d_row, d_col = actions[action_names[np.argmax(q_table[state])]]
        next_state = (state[0] + d_row, state[1] + d_col)
        # Check bounds before indexing: an index of 5 would raise IndexError
        # and -1 would silently wrap around to the opposite edge.
        on_grid = 0 <= next_state[0] < n_rows and 0 <= next_state[1] < n_cols
        if not on_grid or maze[next_state] == 'X':
            break
        path_maze[state] = 'P'
        state = next_state
    path_maze[goal] = 'G'
    print("\nLearned path:")
    print(path_maze)

# Print the maze with the greedy path taken from the trained Q-table.
print_path()



Learned path:
[['P' 'P' 'P' ' ' 'X']
 ['X' 'X' 'P' 'X' ' ']
 [' ' ' ' 'P' 'X' ' ']
 [' ' 'X' 'P' 'P' 'P']
 [' ' 'X' ' ' 'X' 'G']]
