In [None]:
import pygame
import string
import numpy as np
import random
import matplotlib.pyplot as plt

In [None]:
class QVisualizer:
    """Matplotlib window showing the best Q-value of every grid cell.

    Each cell of the heat map is coloured and labelled with the maximum
    Q-value over its actions. Cells whose reward is not -21 or 19 also get a
    white arrow pointing along their greedy (arg-max) action.
    """

    # Window geometry (x, y, width, height) captured by close(), so that the
    # next instance can be placed where the previous window was.
    last_geometry = None

    # Arrow components (U, V) per action index: 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT.
    # V is negative for UP, matching the original arrow orientation.
    _ARROW_FOR_ACTION = {0: (0, -1), 1: (1, 0), 2: (0, 1), 3: (-1, 0)}

    # Reward values that mark a cell as terminal (border/BEAR and GOAL).
    _TERMINAL_REWARDS = (-21, 19)

    def __init__(self, reward_matrix, q_matrix):
        """Build the figure, draw the initial state and show it.

        reward_matrix: indexed [row, col]; used only to skip terminal cells
            when drawing policy arrows.
        q_matrix: indexed [row, col, action]; reduced over axis 2.
        """
        self.reward_matrix = reward_matrix
        self.q_matrix = q_matrix

        figure, axes = plt.subplots(figsize=(6, 6))
        self.fig = figure
        self.ax = axes
        figure.canvas.manager.set_window_title("Q-Value Matrix")

        # Best effort: any failure while restoring the remembered geometry
        # (e.g. a window object without setGeometry) is ignored.
        if QVisualizer.last_geometry:
            try:
                figure.canvas.manager.window.setGeometry(*QVisualizer.last_geometry)
            except Exception:
                pass

        best_q = np.max(q_matrix, axis=2)
        self.im = axes.imshow(best_q, cmap="viridis",
                              vmin=best_q.min(), vmax=best_q.max())
        axes.set_title("Q-Value Matrix (Max per state)")

        # One text label per cell, stored as texts[row][col] for update().
        labels = []
        for r in range(best_q.shape[0]):
            row_labels = []
            for c in range(best_q.shape[1]):
                row_labels.append(
                    axes.text(c, r, f"{best_q[r, c]:.1f}",
                              ha="center", va="center", color="black", fontsize=8)
                )
            labels.append(row_labels)
        self.texts = labels

        figure.colorbar(self.im, ax=axes, fraction=0.046, pad=0.04)
        self.quiver = None
        self._draw_policy_arrows()
        plt.ion()
        plt.show()

    def _policy(self):
        """Return the arg-max action index of every cell."""
        return np.argmax(self.q_matrix, axis=2)

    def _draw_policy_arrows(self):
        """Replace the current arrow layer with one matching the greedy policy."""
        if self.quiver:
            self.quiver.remove()

        greedy = self._policy()
        xs, ys, us, vs = [], [], [], []
        for r, c in np.ndindex(greedy.shape[0], greedy.shape[1]):
            # Terminal cells (border, BEAR, GOAL) get no arrow.
            if self.reward_matrix[r, c] in self._TERMINAL_REWARDS:
                continue
            direction = self._ARROW_FOR_ACTION.get(int(greedy[r, c]))
            if direction is not None:
                us.append(direction[0])
                vs.append(direction[1])
            xs.append(c)
            ys.append(r)

        self.quiver = self.ax.quiver(xs, ys, us, vs, color="white",
                                     scale=3, scale_units="xy", angles="xy")

    def update(self, reward_matrix, q_matrix):
        """Refresh heat map, colour limits, labels and arrows from new matrices."""
        self.reward_matrix = reward_matrix
        self.q_matrix = q_matrix

        best_q = np.max(q_matrix, axis=2)
        self.im.set_data(best_q)
        self.im.set_clim(vmin=best_q.min(), vmax=best_q.max())
        for r, c in np.ndindex(best_q.shape[0], best_q.shape[1]):
            self.texts[r][c].set_text(f"{best_q[r, c]:.1f}")

        self._draw_policy_arrows()
        canvas = self.fig.canvas
        canvas.draw()
        canvas.flush_events()

    def close(self):
        """Remember the window geometry (best effort) and close the figure."""
        try:
            rect = self.fig.canvas.manager.window.geometry()
            QVisualizer.last_geometry = (rect.x(), rect.y(), rect.width(), rect.height())
        except Exception:
            # Geometry could not be read from this window; keep the old value.
            pass
        plt.close(self.fig)

In [None]:
class RewardVisualizer:
    """Matplotlib window showing the per-cell reward as a labelled heat map.

    The colour scale is fixed to the range -21..19 with the "bwr" colormap.
    """

    # Window geometry (x, y, width, height) captured by close(), so that the
    # next instance can be placed where the previous window was.
    last_geometry = None

    def __init__(self, reward_matrix):
        """Build the figure for ``reward_matrix`` (indexed [row, col]) and show it."""
        self.reward_matrix = reward_matrix

        figure, axes = plt.subplots(figsize=(6, 6))
        self.fig = figure
        self.ax = axes
        figure.canvas.manager.set_window_title("Reward Matrix")

        # Best effort: any failure while restoring the remembered geometry
        # (e.g. a window object without setGeometry) is ignored.
        if RewardVisualizer.last_geometry:
            try:
                figure.canvas.manager.window.setGeometry(*RewardVisualizer.last_geometry)
            except Exception:
                pass

        self.im = axes.imshow(reward_matrix, cmap="bwr", vmin=-21, vmax=19)
        axes.set_title("Reward Matrix")

        # One text label per cell, stored as texts[row][col] for update().
        labels = []
        for r in range(reward_matrix.shape[0]):
            row_labels = []
            for c in range(reward_matrix.shape[1]):
                row_labels.append(
                    axes.text(c, r, f"{reward_matrix[r, c]:.0f}",
                              ha="center", va="center", color="black", fontsize=8)
                )
            labels.append(row_labels)
        self.texts = labels

        figure.colorbar(self.im, ax=axes, fraction=0.046, pad=0.04)
        plt.ion()
        plt.show()

    def update(self, reward_matrix):
        """Show a new reward matrix: refresh the image and every cell label."""
        self.reward_matrix = reward_matrix
        self.im.set_data(reward_matrix)

        n_rows = reward_matrix.shape[0]
        n_cols = reward_matrix.shape[1]
        for r in range(n_rows):
            label_row = self.texts[r]
            for c in range(n_cols):
                label_row[c].set_text(f"{reward_matrix[r, c]:.0f}")

        canvas = self.fig.canvas
        canvas.draw()
        canvas.flush_events()

    def close(self):
        """Remember the window geometry (best effort) and close the figure."""
        try:
            rect = self.fig.canvas.manager.window.geometry()
            RewardVisualizer.last_geometry = (rect.x(), rect.y(), rect.width(), rect.height())
        except Exception:
            # Geometry could not be read from this window; keep the old value.
            pass
        plt.close(self.fig)

--- Config ---

In [None]:
# Grid dimensions in cells. The outermost ring of cells is the border
# (see is_border_cell), so the interior is (GRID_ROWS - 2) x (GRID_COLS - 2).
GRID_ROWS = 11
GRID_COLS = 11
# Pixel budget divided by the larger grid dimension to obtain CELL_SIZE.
MAX_WINDOW_SIZE = 900
# Width passed to pygame.draw.line when drawing the grid lines.
GRID_LINE_WIDTH = 2
# Inset between a cell's edge and the text / stickman drawn inside it.
PADDING = 6
# Colours below are RGB tuples unless noted otherwise.
# Window background.
BG_COLOUR = (245, 245, 245)
# Grid lines.
GRID_COLOUR = (30, 30, 30)
# Cell letter shown in the bottom-left corner of each cell.
CELLREF_COLOUR = (0, 0, 255)
# "(col,row)" coordinate text shown in the top-left corner of each cell.
COORD_COLOUR = (0, 0, 0)
# Fill of border cells.
BORDER_FILL_COLOUR = (180, 180, 180)
# "START" label text.
START_COLOUR = (200, 0, 0)
# "GOAL" label text.
GOAL_COLOUR = (0, 150, 0)
# "WOODS" / "BEAR" label text.
STATE_COLOUR = (0, 0, 0)
# Reward value shown in the bottom-right corner of each cell.
REWARD_COLOUR = (255, 165, 0)
# Agent figure.
STICKMAN_COLOUR = (0, 0, 0)
# RGBA (note the alpha component): translucent overlay on cells visited in the current episode.
VISITED_COLOUR = (0, 200, 255, 100)
# Fill of the START cell while the agent is standing on it.
START_FILL_COLOUR = (255, 200, 200)

--- Dynamic sizing ---

In [None]:
# Side length of one (square) cell, chosen so the larger grid dimension fits
# within MAX_WINDOW_SIZE; integer division keeps it a whole number.
CELL_SIZE = MAX_WINDOW_SIZE // max(GRID_ROWS, GRID_COLS)
# Extra vertical space below the grid for the legend / status text.
LEGEND_HEIGHT = 120
# (width, height) passed to pygame.display.set_mode.
WINDOW_SIZE = (GRID_COLS * CELL_SIZE, GRID_ROWS * CELL_SIZE + LEGEND_HEIGHT)

--- Fonts ratios ---

In [None]:
# Font sizes are computed in main() as int(CELL_SIZE * ratio), with a minimum size applied there.
# Cell letter.
FONT_RATIO_MAIN = 0.25
# Coordinate text.
FONT_RATIO_COORD = 0.2
# "START" / "GOAL" labels.
FONT_RATIO_LABEL = 0.35
# "WOODS" / "BEAR" labels.
FONT_RATIO_STATE = 0.25
# Reward value.
FONT_RATIO_REWARD = 0.2

In [None]:
# Display text for each cell-state index. Clicking an inner cell cycles its
# index through this list; get_reward maps index 0 -> -1, 1 -> -4, 2 -> -21,
# and is_terminal_cell treats index 2 (BEAR) as terminal.
CELL_STATES = ["", "WOODS", "BEAR"]

Q-learning params

In [None]:
# Step size of the Q-update: Q <- Q + ALPHA * (target - Q).
ALPHA = 0.1
# Discount applied to the best next-state Q-value when forming the target.
GAMMA = 0.9
# Milliseconds waited before each simulation step; adjusted at runtime with +/-.
STEP_DELAY = 300
# Added to the reward when the agent steps back onto the cell it just came from.
BACKTRACK_PENALTY = -2

Epsilon-greedy params

In [None]:
# Probability of picking a uniformly random action instead of a greedy one.
# main() mutates this global: it is reset to 1.0 on start/reset and decays per episode.
EPSILON = 1.0
# Decay stops being applied once EPSILON is no longer above this value.
EPSILON_MIN = 0.05
# Multiplicative factor applied to EPSILON at the end of each episode.
EPSILON_DECAY = 0.995

Reward shaping

In [None]:
# (row, col) of the goal: the bottom-right inner cell, used for the Manhattan-distance shaping term.
GOAL_POS = (GRID_ROWS - 2, GRID_COLS - 2)
# Added to the reward whenever a move reduces the Manhattan distance to GOAL_POS.
SHAPING_BONUS = 0.5

Actions (row, col): UP, RIGHT, DOWN, LEFT

In [None]:
# (d_row, d_col) per action index: 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT.
# The index is also the position along the last axis of the Q-table.
ACTIONS = [(-1,0), (0,1), (1,0), (0,-1)]

--- Helpers ---

In [None]:
def is_border_cell(row, col):
    """Return True when (row, col) lies on the outermost ring of the grid."""
    on_edge_row = row in (0, GRID_ROWS - 1)
    on_edge_col = col in (0, GRID_COLS - 1)
    return on_edge_row or on_edge_col

In [None]:
def generate_letters(count):
    """Return the first ``count`` spreadsheet-style labels.

    The sequence runs A, B, ..., Z, AA, AB, ..., AZ, BA, ... (the same scheme
    spreadsheet columns use). A ``count`` of zero or less yields an empty list.
    """
    letters = string.ascii_uppercase
    base = len(letters)
    labels = []
    for index in range(count):
        label = ""
        remaining = index
        # Bijective base-26: peel off the least significant letter, then
        # shift down by one because there is no "zero" letter.
        while remaining >= 0:
            remaining, digit = divmod(remaining, base)
            label = letters[digit] + label
            remaining -= 1
        labels.append(label)
    return labels

In [None]:
def get_reward(is_border, letter, state_index, inner_letters):
    """Return the reward for entering a cell.

    Rules are checked in this order:
      * border cell                       -> -21
      * start cell (letter "A")           -> -1
      * goal cell (last inner letter)     -> 19
      * otherwise by ``state_index``:  0 -> -1,  1 -> -4,  2 -> -21

    Args:
        is_border: True if the cell is on the grid border.
        letter: Label of the cell; ignored for border cells.
        state_index: Index of the cell's state (see CELL_STATES).
        inner_letters: Labels of all inner cells; the last one is the goal.

    Raises:
        ValueError: if an ordinary inner cell has a ``state_index`` other
            than 0, 1 or 2. Previously this case silently returned None.
    """
    if is_border:
        return -21
    if letter == "A":
        return -1
    if letter == inner_letters[-1]:
        return 19
    if state_index == 0:
        return -1
    if state_index == 1:
        return -4
    if state_index == 2:
        return -21
    # Fail loudly instead of returning None, which would only surface later
    # as an arithmetic TypeError or the text "None" drawn in the grid.
    raise ValueError(f"unknown cell state index: {state_index!r}")

In [None]:
def build_reward_matrix(inner_letters, cell_states):
    """Return a (GRID_ROWS, GRID_COLS) float array of per-cell rewards.

    Border cells hold -21; every inner cell holds the value get_reward gives
    for its letter and its current state in ``cell_states``.
    """
    # Start from the border reward everywhere, then overwrite the interior.
    rewards = np.full((GRID_ROWS, GRID_COLS), -21.0)
    inner_width = GRID_COLS - 2
    for r in range(1, GRID_ROWS - 1):
        for c in range(1, GRID_COLS - 1):
            cell_letter = inner_letters[(r - 1) * inner_width + (c - 1)]
            rewards[r, c] = get_reward(False, cell_letter, cell_states[(r, c)], inner_letters)
    return rewards

In [None]:
def is_terminal_cell(row, col, cell_states, inner_letters):
    """Return True if entering (row, col) ends the episode.

    A cell is terminal when it is a border cell, when its state index is 2
    (BEAR), or when it carries the last inner letter (the goal).
    """
    if is_border_cell(row, col):
        return True
    if cell_states.get((row, col)) == 2:
        return True
    # Past this point the cell is known to be an inner cell.
    cell_letter = inner_letters[(row - 1) * (GRID_COLS - 2) + (col - 1)]
    return cell_letter == inner_letters[-1]

In [None]:
def draw_stickman(screen, row, col):
    """Draw the agent as a stick figure near the bottom centre of cell (row, col)."""
    x = col * CELL_SIZE + CELL_SIZE // 2
    y = (row + 1) * CELL_SIZE - PADDING - (CELL_SIZE // 8)
    r = max(3, CELL_SIZE // 20)  # head radius; every other length is a multiple of it

    # Head (outline only, 2 px).
    pygame.draw.circle(screen, STICKMAN_COLOUR, (x, y - r*2), r, 2)

    hip = (x, y + r*2)
    segments = (
        ((x, y - r), hip),                 # torso
        ((x, y), (x - r*2, y + r)),        # left arm
        ((x, y), (x + r*2, y + r)),        # right arm
        (hip, (x - r, y + r*4)),           # left leg
        (hip, (x + r, y + r*4)),           # right leg
    )
    for start, end in segments:
        pygame.draw.line(screen, STICKMAN_COLOUR, start, end, 2)

--- Main ---

In [None]:
def main():
    """Run the PATH FINDER window and its tabular Q-learning simulation.

    Opens the pygame window and loops until it is closed. Every frame the
    loop handles input, optionally advances the agent by one Q-learning step,
    redraws the grid and refreshes any open matplotlib visualizers.

    Controls (also shown in the on-screen legend):
        S       (re)start the simulation from the START cell; resets the
                episode counter and EPSILON but keeps the learned Q-table
        P       pause / resume the simulation
        R       ask to reset training; Y clears the Q-table, N cancels
        + / -   shorten / lengthen the delay between steps
        T       toggle the reward-matrix window
        Q       toggle the Q-value window
        click   cycle an inner cell through "", "WOODS", "BEAR"
                (START and GOAL cells cannot be changed)

    Side effects: mutates the module-level STEP_DELAY and EPSILON. pygame and
    any open visualizer windows are shut down even if the loop raises.
    """
    global STEP_DELAY, EPSILON
    pygame.init()
    pygame.display.set_caption("PATH FINDER")
    screen = pygame.display.set_mode(WINDOW_SIZE)
    clock = pygame.time.Clock()

    # Fonts scale with the cell size; max() keeps a minimum size for small cells.
    font_main   = pygame.font.SysFont(None, max(8, int(CELL_SIZE * FONT_RATIO_MAIN)))
    font_coord  = pygame.font.SysFont(None, max(8, int(CELL_SIZE * FONT_RATIO_COORD)))
    font_label  = pygame.font.SysFont(None, max(10, int(CELL_SIZE * FONT_RATIO_LABEL)))
    font_state  = pygame.font.SysFont(None, max(8, int(CELL_SIZE * FONT_RATIO_STATE)))
    font_reward = pygame.font.SysFont(None, max(8, int(CELL_SIZE * FONT_RATIO_REWARD)))
    font_legend = pygame.font.SysFont(None, 22)

    # Inner cells are labelled row by row; "A" is START, the last label is GOAL.
    inner_count = (GRID_ROWS - 2) * (GRID_COLS - 2)
    inner_letters = generate_letters(inner_count)
    # State index (into CELL_STATES) of every inner cell, keyed by (row, col).
    cell_states = {(row, col): 0 for row in range(1, GRID_ROWS - 1) for col in range(1, GRID_COLS - 1)}
    agent_row, agent_col = 1, 1
    prev_row, prev_col   = None, None
    reward_matrix = build_reward_matrix(inner_letters, cell_states)
    q_matrix      = np.zeros((GRID_ROWS, GRID_COLS, len(ACTIONS)), dtype=float)
    reward_vis = None
    q_vis = None
    simulate = False
    paused = False
    steps = 0
    episodes = 0
    # An episode is cut off after as many steps as there are inner cells.
    max_steps = (GRID_ROWS - 2) * (GRID_COLS - 2)
    visited_cells = set()
    reset_prompt = False

    # Translucent overlay for visited cells: built once and blitted per cell
    # each frame instead of allocating a new surface for every cell.
    visited_overlay = pygame.Surface((CELL_SIZE, CELL_SIZE), pygame.SRCALPHA)
    visited_overlay.fill(VISITED_COLOUR)

    running = True
    try:
        while running:
            # --- Input ---
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    running = False
                elif event.type == pygame.KEYDOWN:
                    if reset_prompt:
                        # While the confirmation prompt is up only Y / N are honoured.
                        if event.key == pygame.K_y:
                            q_matrix.fill(0.0)
                            EPSILON = 1.0
                            episodes = 0
                            steps = 0
                            agent_row, agent_col = 1, 1
                            prev_row, prev_col = None, None
                            visited_cells.clear()
                            reset_prompt = False
                            paused = False
                        elif event.key == pygame.K_n:
                            reset_prompt = False
                            paused = False
                        continue
                    if event.key == pygame.K_t:
                        if reward_vis is None:
                            reward_vis = RewardVisualizer(reward_matrix)
                        else:
                            reward_vis.close()
                            reward_vis = None
                    elif event.key == pygame.K_q:
                        if q_vis is None:
                            q_vis = QVisualizer(reward_matrix, q_matrix)
                        else:
                            q_vis.close()
                            q_vis = None
                    elif event.key == pygame.K_s:
                        simulate = True
                        paused = False
                        agent_row, agent_col = 1, 1
                        prev_row, prev_col   = None, None
                        steps = 0
                        episodes = 0
                        EPSILON = 1.0
                        visited_cells.clear()
                    elif event.key == pygame.K_p:
                        if simulate:
                            paused = not paused
                    elif event.key in (pygame.K_PLUS, pygame.K_EQUALS):
                        STEP_DELAY = max(10, STEP_DELAY - 50)
                    elif event.key in (pygame.K_MINUS, pygame.K_UNDERSCORE):
                        STEP_DELAY += 50
                    elif event.key == pygame.K_r and simulate:
                        reset_prompt = True
                        paused = True
                elif event.type == pygame.MOUSEBUTTONDOWN and event.button == 1:
                    mouse_x, mouse_y = event.pos
                    # Ignore clicks in the legend strip below the grid.
                    if mouse_y < GRID_ROWS * CELL_SIZE:
                        col = mouse_x // CELL_SIZE
                        row = mouse_y // CELL_SIZE
                        # Only inner cells are keys of cell_states, so border clicks fall through.
                        if (row, col) in cell_states:
                            if not ((row, col) == (1, 1) or (row, col) == (GRID_ROWS-2, GRID_COLS-2)):
                                cell_states[(row, col)] = (cell_states[(row, col)] + 1) % len(CELL_STATES)
                                reward_matrix = build_reward_matrix(inner_letters, cell_states)

            # --- Simulation step ---
            if simulate and not paused and not reset_prompt:
                pygame.time.wait(STEP_DELAY)

                # Epsilon-greedy action selection; ties among the best actions are broken at random.
                if random.random() < EPSILON:
                    action = random.choice(range(len(ACTIONS)))
                else:
                    q_values = q_matrix[agent_row, agent_col]
                    max_q = np.max(q_values)
                    best_actions = [i for i, q in enumerate(q_values) if q == max_q]
                    action = random.choice(best_actions)
                d_row, d_col = ACTIONS[action]
                new_row, new_col = agent_row + d_row, agent_col + d_col

                if not (0 <= new_row < GRID_ROWS and 0 <= new_col < GRID_COLS):
                    # Stepping off the grid ends the episode without a Q-update.
                    episode_over = True
                else:
                    border = is_border_cell(new_row, new_col)
                    if not border:
                        letter = inner_letters[(new_row - 1) * (GRID_COLS - 2) + (new_col - 1)]
                        state_index = cell_states[(new_row, new_col)]
                        reward = get_reward(False, letter, state_index, inner_letters)
                    else:
                        reward = -21
                    terminal = is_terminal_cell(new_row, new_col, cell_states, inner_letters)

                    # Penalise stepping straight back onto the previous cell.
                    if prev_row is not None and (new_row, new_col) == (prev_row, prev_col):
                        reward += BACKTRACK_PENALTY
                    # Shaping: bonus for reducing the Manhattan distance to the goal.
                    prev_dist = abs(agent_row - GOAL_POS[0]) + abs(agent_col - GOAL_POS[1])
                    new_dist  = abs(new_row  - GOAL_POS[0]) + abs(new_col  - GOAL_POS[1])
                    if new_dist < prev_dist:
                        reward += SHAPING_BONUS

                    # Q-learning update; terminal transitions do not bootstrap.
                    old_q = q_matrix[agent_row, agent_col, action]
                    if terminal:
                        target = reward
                    else:
                        next_max_q = np.max(q_matrix[new_row, new_col])
                        target = reward + GAMMA * next_max_q
                    q_matrix[agent_row, agent_col, action] = old_q + ALPHA * (target - old_q)

                    prev_row, prev_col = agent_row, agent_col
                    agent_row, agent_col = new_row, new_col
                    steps += 1
                    visited_cells.add((agent_row, agent_col))

                    episode_over = terminal or steps >= max_steps
                    if episode_over:
                        pygame.time.wait(STEP_DELAY)

                if episode_over:
                    agent_row, agent_col = 1, 1
                    prev_row, prev_col   = None, None
                    steps = 0
                    episodes += 1
                    visited_cells.clear()
                    if EPSILON > EPSILON_MIN:
                        # Clamp so the final decay step cannot undershoot the floor.
                        EPSILON = max(EPSILON_MIN, EPSILON * EPSILON_DECAY)

            # --- Drawing ---
            screen.fill(BG_COLOUR)

            # Cell backgrounds: highlighted START (only while occupied) and border fill.
            for row in range(GRID_ROWS):
                for col in range(GRID_COLS):
                    cell_x = col * CELL_SIZE
                    cell_y = row * CELL_SIZE
                    if (row, col) == (1, 1) and (row, col) == (agent_row, agent_col):
                        pygame.draw.rect(screen, START_FILL_COLOUR, (cell_x, cell_y, CELL_SIZE, CELL_SIZE))
                    elif is_border_cell(row, col):
                        pygame.draw.rect(screen, BORDER_FILL_COLOUR, (cell_x, cell_y, CELL_SIZE, CELL_SIZE))

            for (row, col) in visited_cells:
                screen.blit(visited_overlay, (col * CELL_SIZE, row * CELL_SIZE))

            # Cell text: coordinates (top-left), letter (bottom-left),
            # START/GOAL/state label (centre) and reward (bottom-right).
            for row in range(GRID_ROWS):
                for col in range(GRID_COLS):
                    cell_x = col * CELL_SIZE
                    cell_y = row * CELL_SIZE
                    cell_centre = (cell_x + CELL_SIZE//2, cell_y + CELL_SIZE//2)

                    coord_text = f"({col+1},{row+1})"
                    coord_surface = font_coord.render(coord_text, True, COORD_COLOUR)
                    screen.blit(coord_surface, (cell_x + PADDING, cell_y + PADDING))

                    border = is_border_cell(row, col)
                    if border:
                        cell_label = "Z"
                        reward_value = -21
                    else:
                        letter = inner_letters[(row - 1) * (GRID_COLS - 2) + (col - 1)]
                        state_index = cell_states[(row, col)]
                        cell_label = letter
                        reward_value = get_reward(False, letter, state_index, inner_letters)

                    text_surface = font_main.render(cell_label, True, CELLREF_COLOUR)
                    screen.blit(text_surface, (cell_x + PADDING,
                                               cell_y + CELL_SIZE - PADDING - text_surface.get_height()))

                    if not border:
                        if letter == "A":
                            start_surface = font_label.render("START", True, START_COLOUR)
                            screen.blit(start_surface, start_surface.get_rect(center=cell_centre))
                        elif letter == inner_letters[-1]:
                            goal_surface = font_label.render("GOAL", True, GOAL_COLOUR)
                            screen.blit(goal_surface, goal_surface.get_rect(center=cell_centre))
                        else:
                            state_text = CELL_STATES[state_index]
                            if state_text:
                                state_surface = font_state.render(state_text, True, STATE_COLOUR)
                                screen.blit(state_surface, state_surface.get_rect(center=cell_centre))

                    reward_surface = font_reward.render(str(reward_value), True, REWARD_COLOUR)
                    screen.blit(reward_surface,
                                reward_surface.get_rect(bottomright=(cell_x + CELL_SIZE - PADDING,
                                                                     cell_y + CELL_SIZE - PADDING)))

            draw_stickman(screen, agent_row, agent_col)

            # Grid lines are drawn last so they sit on top of fills and overlays.
            for r in range(GRID_ROWS + 1):
                y = r * CELL_SIZE
                pygame.draw.line(screen, GRID_COLOUR, (0, y), (GRID_COLS * CELL_SIZE, y), GRID_LINE_WIDTH)
            for c in range(GRID_COLS + 1):
                x = c * CELL_SIZE
                pygame.draw.line(screen, GRID_COLOUR, (x, 0), (x, GRID_ROWS * CELL_SIZE), GRID_LINE_WIDTH)

            # Legend / status strip below the grid.
            legend_y = GRID_ROWS * CELL_SIZE + 10
            legend_text1 = "S: Start   P: Pause/Resume   R: Reset (confirm)   +/-: Speed   T: Reward Matrix   Q: Q Matrix   Click: Toggle WOODS/BEAR"
            legend_text2 = f"Episodes: {episodes} | Epsilon: {EPSILON:.3f} | Step delay: {STEP_DELAY} ms | Simulation: {'Running' if simulate else 'Stopped'}{' (Paused)' if paused else ''}"
            screen.blit(font_legend.render(legend_text1, True, (20, 20, 20)), (10, legend_y))
            screen.blit(font_legend.render(legend_text2, True, (20, 20, 20)), (10, legend_y + 25))
            if reset_prompt:
                confirm_text = "Reset training? Press Y to confirm, N to cancel"
                screen.blit(font_legend.render(confirm_text, True, (200, 0, 0)), (10, legend_y + 50))
            pygame.display.flip()

            # Keep any open matplotlib windows in sync with the current matrices.
            if reward_vis is not None:
                reward_vis.update(reward_matrix)
            if q_vis is not None:
                q_vis.update(reward_matrix, q_matrix)
            clock.tick(60)
    finally:
        # Always release the windows, including when the loop is interrupted
        # or raises; otherwise the pygame window is left open and unresponsive.
        try:
            if reward_vis is not None:
                reward_vis.close()
            if q_vis is not None:
                q_vis.close()
        finally:
            pygame.quit()

In [None]:
# Launch the interactive window when this cell/script is executed directly;
# main() blocks until the pygame window is closed.
if __name__ == "__main__":
    main()