In [1]:
import numpy as np
import random

rows = int(input("Enter number of rows in the maze: "))
cols = int(input("Enter number of columns in the maze: "))

print("Enter the maze layout row by row (0 = free, 1 = wall):")
maze = []
for r in range(rows):
    while True:
        row = input(f"Row {r}: ").strip().split()
        if len(row) == cols and all(cell in ['0', '1'] for cell in row):
            maze.append([int(cell) for cell in row])
            break
        else:
            print(f"Invalid input. Enter exactly {cols} numbers (0 or 1).")
            
maze = np.array(maze)

def get_position(prompt):
    while True:
        try:
            pos = tuple(map(int, input(prompt).split()))
            if 0 <= pos[0] < rows and 0 <= pos[1] < cols and maze[pos[0], pos[1]] == 0:
                return pos
            else:
                print("Position invalid or is a wall.")
        except:
            print("Enter two integers separated by space.")

start = get_position("Enter start position (row col): ")
goal = get_position("Enter goal position (row col): ")

actions = ['up', 'down', 'left', 'right']
state_size = rows * cols
action_size = len(actions)

alpha = float(input("Enter learning rate alpha (e.g., 0.1): "))
gamma = float(input("Enter discount factor gamma (e.g., 0.9): "))
epsilon = float(input("Enter exploration rate epsilon (e.g., 0.2): "))
episodes = int(input("Enter number of episodes for training: "))

def state_index(pos):
    return pos[0] * cols + pos[1]

def is_valid(pos):
    r, c = pos
    return 0 <= r < rows and 0 <= c < cols and maze[r, c] == 0

def step(pos, action):
    r, c = pos
    if action == 'up': r -= 1
    elif action == 'down': r += 1
    elif action == 'left': c -= 1
    elif action == 'right': c += 1

    new_pos = (r, c)
    if not is_valid(new_pos):
        new_pos = pos
    reward = 10 if new_pos == goal else -1
    return new_pos, reward

Q = np.zeros((state_size, action_size))

for ep in range(episodes):
    pos = start
    while pos != goal:
        state = state_index(pos)
        # Epsilon-greedy action
        if random.random() < epsilon:
            action_idx = random.randint(0, action_size - 1)
        else:
            action_idx = np.argmax(Q[state])
        action = actions[action_idx]

        new_pos, reward = step(pos, action)
        new_state = state_index(new_pos)

        # Q-learning update
        Q[state, action_idx] += alpha * (reward + gamma * np.max(Q[new_state]) - Q[state, action_idx])

        pos = new_pos

print("Training completed!")

pos = start
path = [pos]
while pos != goal:
    state = state_index(pos)
    action_idx = np.argmax(Q[state])
    action = actions[action_idx]
    pos, _ = step(pos, action)
    path.append(pos)

def print_maze_path(maze, path, goal):
    maze_vis = maze.astype(str)
    for r, c in path:
        maze_vis[r, c] = '*'
    maze_vis[goal] = 'G'
    print("\nMaze with optimal path:")
    for row in maze_vis:
        print(' '.join(row))

print("Optimal path found by the agent:")
print(path)
print_maze_path(maze, path, goal)


Enter number of rows in the maze:  4
Enter number of columns in the maze:  4


Enter the maze layout row by row (0 = free, 1 = wall):


Row 0:  0 0 1 1
Row 1:  0 1 0 0
Row 2:  1 0 1 0
Row 3:  1 1 0 0
Enter start position (row col):  0 0 
Enter goal position (row col):  3 3 
Enter learning rate alpha (e.g., 0.1):  0.1
Enter discount factor gamma (e.g., 0.9):  0.9
Enter exploration rate epsilon (e.g., 0.2):  0.2
Enter number of episodes for training:  1000


KeyboardInterrupt: 