# Design and implementation of reinforcement learning games based on Q-**Learning**

In [None]:
import pygame
import numpy as np
import random
import sys
import os
import logging
import matplotlib.pyplot as plt
from collections import deque
import seaborn as sns



***2.   Logging configuration and game constant definitions***


Configure logging to display important information during training, which is convenient for debugging and tracking.
It also defines the width, height, and grid size of the game window.
It calculates the number of grids (horizontally and vertically).
It defines common RGB color values for drawing game elements and interfaces.

In [None]:
# Logging setup: print the bare message text for INFO and above.
logging.basicConfig(format='%(message)s', level=logging.INFO)

# Window size in pixels and the edge length of one square maze cell.
WIDTH = 1000
HEIGHT = 800
GRID_SIZE = 40

# Maze size in cells. 200 px of the window width are excluded from the maze
# (the information panel is 200 px wide and sits at the right edge).
GRID_HEIGHT = HEIGHT // GRID_SIZE
GRID_WIDTH = (WIDTH - 200) // GRID_SIZE

# Named RGB triples used when drawing.
COLORS = dict(
    WHITE=(255, 255, 255),
    BLACK=(0, 0, 0),
    YELLOW=(255, 255, 0),
    RED=(255, 0, 0),
    GREEN=(0, 255, 0),
    BROWN=(139, 69, 19),
    GRAY=(128, 128, 128),
    DARK_GRAY=(50, 50, 50),
)


3.   ***Pygame initialization***




Initialize Pygame, set up the game window, title and clock.
Load fonts for displaying game information and statistics.

In [None]:
# Initializing Pygame: window, caption, frame clock and the two UI fonts
pygame.init()
screen = pygame.display.set_mode((WIDTH, HEIGHT))
pygame.display.set_caption("Advanced Pac-Man Q-Learning")
clock = pygame.time.Clock()
font = pygame.font.Font(None, 24)
bold_font = pygame.font.Font(None, 32)

# Every sprite occupies exactly one grid cell.
sprite_size = (GRID_SIZE, GRID_SIZE)

# Loading image material.
# The fallback is all-or-nothing: if any of the three files fails to load,
# all three sprites are replaced by solid-colour squares.
try:
    player_img, enemy_img, gold_img = (
        pygame.image.load(file_name)
        for file_name in ("player.webp", "enemy.webp", "gold.webp")
    )
    logging.info("Image loaded successfully!")
except Exception as e:
    logging.error(f"Image loading failed: {e}")
    fallback_sprites = []
    for color_name in ('YELLOW', 'RED', 'GREEN'):
        tile = pygame.Surface(sprite_size)
        tile.fill(COLORS[color_name])
        fallback_sprites.append(tile)
    player_img, enemy_img, gold_img = fallback_sprites

# Resize every sprite (loaded or fallback) to one grid cell
player_img, enemy_img, gold_img = (
    pygame.transform.scale(sprite, sprite_size)
    for sprite in (player_img, enemy_img, gold_img)
)

***4.  Core parameters of reinforcement learning***


Defines the core parameters of Q-Learning:



*   alpha: learning rate, controls the algorithm's acceptance of new information.
*   gamma: discount factor, determines the impact of future rewards on current decisions.
*   epsilon: exploration rate, used for switching between greedy strategy and exploration strategy.
*   epsilon_decay: decay coefficient of exploration rate, gradually reducing the probability of exploration.
*   batch_size: sample size for batch training.
*   target_update: update frequency of target network.

In [None]:
# Reinforcement Learning Core Parameters

# Discrete action set; an action's list position is its index in the Q-table.
actions = ['UP', 'DOWN', 'LEFT', 'RIGHT']

# Q-value update
gamma = 0.97     # discount factor
alpha = 0.7      # initial learning rate
min_alpha = 0.1  # lower bound applied when the learning rate is re-scheduled

# Exploration schedule
epsilon = 1.0                # initial exploration rate
epsilon_decay = 0.9995       # multiplicative decay applied to epsilon
epsilon_min = 0.02           # basic minimum value
adaptive_epsilon_min = 0.02  # dynamic minimum value
success_threshold = 0.25     # recent success rate below which exploration is boosted
boost_factor = 1.5           # multiplier used for that exploration boost

# Replay and target-table cadence
batch_size = 256    # experiences sampled per replay step
target_update = 75  # step interval between target-table synchronisations

# Frame-rate cap handed to the pygame clock
FPS = 30

***5.Prioritized Experience Replay Buffer***


Implemented a prioritized experience replay buffer to store and sample experiences (state, action, reward, next state, completion flag). Sample a batch of experiences from the buffer by priority and calculate importance sampling weights.
*   Calculate sampling probability:

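Priorities are raised to the power of alpha (probs = priorities^alpha); alpha controls how strongly high-priority experiences are favored (alpha = 0 gives uniform sampling, alpha = 1 samples in direct proportion to priority).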

Normalize probabilities (probs /= probs.sum()) to ensure that the sum of probabilities is 1.

Sampling by probability: Use np.random.choice to select the index of experience according to the probability distribution.

*   Calculate weights:

*   Formula: weights = (N * probs)^(-beta), where N is the current size of the buffer.

Normalize weights (weights /= weights.max()) to range between [0, 1] to prevent high priority samples from dominating training.

*   Return results: sampled experience, index and weight (for subsequent Q value update).

In [None]:
# Prioritize experience replay buffer
class PrioritizedReplayBuffer:
    """Bounded experience store that samples in proportion to stored priorities.

    Experiences and their priorities are kept in two parallel deques with the
    same ``maxlen``, so the oldest pair is dropped together once the capacity
    is reached.
    """

    def __init__(self, capacity):
        """Create an empty buffer that holds at most ``capacity`` experiences."""
        self.buffer = deque(maxlen=capacity)
        self.priorities = deque(maxlen=capacity)

    def add(self, experience):
        """Append ``experience`` with the largest priority currently stored.

        The first experience of an empty buffer gets priority 1.0.
        """
        if self.priorities:
            newest_priority = max(self.priorities)
        else:
            newest_priority = 1.0
        self.buffer.append(experience)
        self.priorities.append(newest_priority)

    def sample(self, batch_size, alpha=0.7, beta=0.6):
        """Draw ``batch_size`` experiences (with replacement) by priority.

        Args:
            batch_size: number of experiences to draw.
            alpha: exponent applied to the priorities before normalising.
            beta: exponent of the importance-sampling correction.

        Returns:
            ``(samples, indices, weights)``: the drawn experiences, their
            positions in the buffer, and importance weights scaled so that
            the largest weight is 1.
        """
        size = len(self.buffer)

        # priority ** alpha, normalised into a probability distribution
        probs = np.power(np.array(self.priorities), alpha)
        probs /= probs.sum()

        indices = np.random.choice(size, batch_size, p=probs)
        samples = []
        for position in indices:
            samples.append(self.buffer[position])

        # importance-sampling weights (N * p) ** -beta, scaled by their maximum
        weights = np.power(size * probs[indices], -beta)
        weights /= weights.max()
        return samples, indices, weights


replay_buffer = PrioritizedReplayBuffer(10000)

***6.Dual Q Network Architecture***
Implemented a dual Q network architecture to reduce the overestimation problem of target Q values in Q-Learning.
main is the main Q table, used to select actions.
target is the target Q table, used to calculate the target Q value.
Steps:
*   Create a priority experience replay buffer instance with a maximum capacity of 10,000.
*  Initialize the two Q tables (main and target) of the dual Q network
*   Copy the parameters of the main Q table (main) to the target Q table (target).

*  Effect:
Reduce the overestimation problem of Q values. Traditional Q learning uses a single network to select and evaluate actions at the same time, which is prone to overestimating action values. The dual Q network significantly reduces the estimation bias by separating the selection and evaluation processes.

In [None]:
# Dual Q network architecture
class DoubleQTable:
    """Pair of tabular Q-functions: a ``main`` table and a ``target`` copy."""

    def __init__(self, state_dims, action_size):
        """Initialise ``main`` with small random values and copy it to ``target``.

        Args:
            state_dims: tuple with the size of every state component.
            action_size: number of actions (size of the last table axis).
        """
        table_shape = state_dims + (action_size,)
        self.main = np.random.uniform(low=-0.1, high=0.1, size=table_shape)
        self.update_target()

    def update_target(self):
        """Replace ``target`` with an independent copy of ``main``."""
        self.target = np.copy(self.main)

***7.Information panel class***
*  Displays key information during training, such as the current episode, score, epsilon, number of steps, average change in Q value, and success rate.
*  Improve observability and visualization of the training process.

In [None]:
# Information panel class
class InfoPanel:
    """Sidebar at the right edge of the window with training statistics and recent log lines."""

    def __init__(self):
        """Place a 200 px wide, full-height panel flush with the right window edge."""
        panel_width = 200
        self.width = panel_width
        self.height = HEIGHT
        self.x = WIDTH - panel_width
        self.y = 0

    def draw(self, screen, episode, score, epsilon, steps, avg_q, success_count):
        """Render the panel onto ``screen``.

        Args:
            screen: surface to draw on.
            episode: current episode number.
            score: accumulated reward, shown with one decimal.
            epsilon: exploration rate, shown with four decimals.
            steps: step counter of the running episode.
            avg_q: mean Q-value change, shown with two decimals.
            success_count: per-episode success flags; their sum is displayed.

        The entries of the module-level ``log_messages`` are listed at the bottom.
        """
        white = COLORS['WHITE']
        left = self.x + 10  # common left margin for all text

        # Background and heading
        pygame.draw.rect(screen, COLORS['DARK_GRAY'], (self.x, self.y, self.width, self.height))
        screen.blit(bold_font.render("Game Info", True, white), (left, 20))

        # Statistics: one row every 35 px, starting at y = 60
        rows = (
            ("Episode", episode),
            ("Score", f"{score:.1f}"),
            ("Epsilon", f"{epsilon:.4f}"),
            ("Steps", steps),
            ("Avg QΔ", f"{avg_q:.2f}"),
            ("Success", sum(success_count)),
        )
        for row_no, (label, value) in enumerate(rows):
            line = font.render(f"{label}: {value}", True, white)
            screen.blit(line, (left, 60 + 35 * row_no))

        # Recent log lines: one row every 20 px near the bottom of the panel
        screen.blit(bold_font.render("Recent Logs", True, white), (left, HEIGHT - 160))
        for row_no, msg in enumerate(log_messages):
            screen.blit(font.render(msg, True, white), (left, HEIGHT - 120 + 20 * row_no))

In [None]:
# Maze Generation
def generate_maze(width, height, wall_prob=0.1):
    """Build a random maze as a list of ``height`` strings of length ``width``.

    Every cell on the outer border is a wall (``'#'``). Each interior cell
    consumes one ``random.random()`` draw and is open (``'.'``) when the draw
    is greater than ``wall_prob``, otherwise a wall.

    Args:
        width: number of columns.
        height: number of rows.
        wall_prob: probability threshold for an interior cell being a wall;
            must lie in [0, 1]. The default 0.1 reproduces the previous
            hard-coded behaviour.

    Returns:
        The maze rows, top to bottom. Empty list when ``height`` is 0.

    Raises:
        ValueError: if ``wall_prob`` is outside [0, 1].
    """
    if not 0.0 <= wall_prob <= 1.0:
        raise ValueError(f"wall_prob must be within [0, 1], got {wall_prob!r}")

    last_x, last_y = width - 1, height - 1
    maze = []
    for y in range(height):
        on_horizontal_border = y == 0 or y == last_y
        cells = []
        for x in range(width):
            if on_horizontal_border or x == 0 or x == last_x:
                cells.append('#')
            else:
                cells.append('.' if random.random() > wall_prob else '#')
        maze.append(''.join(cells))
    return maze

maze = generate_maze(GRID_WIDTH, GRID_HEIGHT)

# Pac-Man always (re)spawns on the centre cell (GRID_WIDTH // 2, GRID_HEIGHT // 2).
# Walls are placed at random, so force that cell open; otherwise roughly one
# run in ten would start the player inside a wall.
_spawn_x, _spawn_y = GRID_WIDTH // 2, GRID_HEIGHT // 2
maze[_spawn_y] = maze[_spawn_y][:_spawn_x] + '.' + maze[_spawn_y][_spawn_x + 1:]

# Wall coordinates as a set: is_obstacle() does a membership test for every
# attempted move, which is a linear scan on a list but a hash lookup here.
obstacles = {(x, y) for y, row in enumerate(maze) for x, char in enumerate(row) if char == '#'}


***9.Pac-Man class***
*  Initialization method: Initialize Pac-Man's initial state and history.

self.reset(): Call the reset method to set the initial position and visit record.

self.path_history: Record the movement path of the last 5 steps (double-ended queue, automatically eliminate old data).

self.visited_grids: A two-dimensional array that records the number of visits to each grid (used for exploration reward calculation).

*  Reset method function: Reset Pac-Man to the center of the maze and clear the visit record.

Initial position: (GRID_WIDTH//2, GRID_HEIGHT//2), which is the center of the maze.

*  Move method function: Update Pac-Man's position according to the action, and record the path and number of visits.

   Logic:

Calculate the new coordinates (new_x, new_y), and check whether it conflicts with obstacles (is_obstacle).

Update the position self.x, self.y.

Record the new position to the visited collection and path_history queue.

Update the number of visits to the corresponding grid in visited_grids.

*  State acquisition method
Returns the 8-dimensional discrete representation of the current state for Q-Learning input. Through discretization and feature compression, the state is reduced to 3×3×2×2×5×2×3×5 = 5,400 discrete combinations, instead of enumerating every raw configuration of Pac-Man, enemies and food on the 20x20 grid (400 cells).

In [None]:
def is_obstacle(x, y):
    """Return True when grid cell ``(x, y)`` is contained in the module-level ``obstacles``."""
    cell = (x, y)
    return cell in obstacles

# Pac-Man class
class Pacman:
    """The learning agent: grid position plus visit bookkeeping.

    Attributes:
        x, y: current grid cell.
        visited: cells occupied earlier in the current episode. The cell
            Pac-Man is standing on is only added when he leaves it (the
            spawn cell is present from the start), so right after ``move()``
            the test ``(x, y) not in visited`` is True exactly when the cell
            just entered is new for this episode. ``calculate_reward`` relies
            on that test for its exploration bonus.
        path_history: positions after each of the last 5 moves of the episode.
        visited_grids: per-cell visit counter; it is not cleared by
            ``reset()`` and therefore accumulates over all episodes.
    """

    # Grid offset for every supported action.
    _STEPS = {
        'UP': (0, -1),
        'DOWN': (0, 1),
        'LEFT': (-1, 0),
        'RIGHT': (1, 0),
    }

    def __init__(self):
        # Create the containers first so reset() can safely clear them.
        self.path_history = deque(maxlen=5)
        self.visited_grids = np.zeros((GRID_WIDTH, GRID_HEIGHT))  # access statistics matrix
        self.reset()

    def reset(self):
        """Return to the centre cell and start a fresh per-episode history."""
        self.x, self.y = GRID_WIDTH // 2, GRID_HEIGHT // 2
        self.visited = {(self.x, self.y)}
        # Drop the previous episode's trail so the path-based reward does not
        # compare positions across episode boundaries.
        self.path_history.clear()

    def move(self, action):
        """Apply ``action`` ('UP'/'DOWN'/'LEFT'/'RIGHT') if the target cell is free.

        A blocked or unknown action leaves the position unchanged; the
        (possibly unchanged) position is still appended to ``path_history``
        and counted in ``visited_grids``.
        """
        previous = (self.x, self.y)
        step = self._STEPS.get(action)
        if step is not None:
            new_x, new_y = self.x + step[0], self.y + step[1]
            if not is_obstacle(new_x, new_y):
                self.x, self.y = new_x, new_y

        # Record the cell being left, not the one being entered: adding the
        # new cell here would make every freshly entered cell look "already
        # visited" to the reward function that runs right after the move.
        self.visited.add(previous)
        self.path_history.append((self.x, self.y))
        self.visited_grids[self.x][self.y] += 1  # update visit count

    def get_state(self):
        """Return the 8-component discrete state used to index the Q-table.

        Components, in order: x mod 3, y mod 3, whether any enemy shares
        Pac-Man's column, whether any enemy shares his row, remaining food
        halved (capped at 4), whether Pac-Man is right of the centre column,
        number of enemies within 2 columns (capped at 2), and number of food
        items within 3 columns (capped at 4).

        Reads the module-level ``enemies`` and ``food`` objects.
        """
        px, py = self.x, self.y
        return (
            px % 3,
            py % 3,
            int(any(e.x == px for e in enemies)),
            int(any(e.y == py for e in enemies)),
            min(4, len(food.positions) // 2),
            int(px > GRID_WIDTH // 2),
            # Capped at 2 so that more than two enemies cannot index past the
            # 3-entry axis reserved for this component in the Q-table.
            min(2, sum(1 for e in enemies if abs(e.x - px) < 3)),
            min(4, sum(1 for p in food.positions if abs(p[0] - px) < 4)),
        )


***10.Food class: Food generation and collection***


Function: Manage the location and collection logic of food in the game.

Method details:
*  __init__ Initialization:

  Call the reset method to initialize the food location list self.positions.

*  reset Reset:

  Steps:
    Traverse all the grids of the maze and select the passable area (i.e. the position where char == '.') as the candidate food location.

    Randomly shuffle the candidate positions (random.shuffle) to ensure that the food distribution is different in each game.

    Select the first 10 positions as the food for this game (self.positions[:10]).

    Design significance:

    Every time the game is reset, the food position is randomly generated to increase the diversity of the game.

    Limit the number of food to 10 and clarify the game goal (collect all food).

*  collect Collect food:

  Input: Pac-Man's current position (x, y).

  Logic:If there is food at this position, remove the position from self.positions and return True.Otherwise return False.

  Function: Trigger reward calculation (such as bonus points after collecting food).

In [None]:
# Food class
class Food:
    """Keeps the list of grid cells that currently hold food."""

    def __init__(self):
        """Start with a freshly generated set of food cells."""
        self.positions = []
        self.reset()

    def reset(self):
        """Pick up to 10 random open ('.') cells of the module-level ``maze`` as food."""
        open_cells = []
        for y, row in enumerate(maze):
            for x, char in enumerate(row):
                if char == '.':
                    open_cells.append((x, y))
        random.shuffle(open_cells)
        self.positions = open_cells[:10]

    def collect(self, x, y):
        """Remove the food at ``(x, y)`` if there is any.

        Returns:
            True when food was present and has been removed, False otherwise.
        """
        try:
            self.positions.remove((x, y))
        except ValueError:
            return False
        return True

***11.Enemy class: Enemy movement and behavior***

  Function: Manage enemy generation and movement strategy (combining player tracking and random movement).

Method details:
*  __init__ Initialization:

  Call the reset method to generate the enemy's initial position.

*  Reset Reset:

  Logic:

  Randomly generate the enemy's coordinates (x, y) through the while True loop until a non-obstacle position is found (is_obstacle returns False).

  Design significance: Ensure that the enemy will not be generated on the wall.

*  move_towards Move to the target:

  Input: target coordinates (target_x, target_y) (usually the position of Pac-Man).

  Calculate the horizontal distance dx and the vertical distance dy.

  Prioritize horizontal movement: If |dx| > |dy|, move horizontally one step (left/right).

  Otherwise move vertically: Move vertically one step (up/down).

  Before moving, check whether the target position is an obstacle. If not, update the coordinates.

*  smart_move Intelligent Movement:

  70% probability to track the player: call move_towards(pacman.x, pacman.y).

  30% probability to move randomly: call move_random().

  Design significance: Mixed strategies prevent the enemy's behavior from being completely predictable and increase the difficulty of the game.

*  move_random Random Movement:

  Randomly select a direction (UP/DOWN/LEFT/RIGHT).

  Check whether the target location is accessible, and update the coordinates if it is feasible.

  Purpose: Simulate the enemy's "patrol" behavior to prevent the player from easily bypassing it

In [None]:
# Enemy class
class Enemy:
    """A chaser that mixes pursuit of Pac-Man with random steps."""

    def __init__(self):
        """Spawn on a random free interior cell."""
        self.reset()

    def reset(self):
        """Re-draw the position until it lands on a non-obstacle interior cell.

        Note: this loops until a free cell is found, so it does not terminate
        on a maze whose interior is entirely walls.
        """
        while True:
            self.x = random.randint(1, GRID_WIDTH - 2)
            self.y = random.randint(1, GRID_HEIGHT - 2)
            if not is_obstacle(self.x, self.y):
                break

    def move_towards(self, target_x, target_y):
        """Take one step towards ``(target_x, target_y)`` along the longer axis.

        The step is horizontal when the horizontal distance is strictly
        larger, otherwise vertical; it is skipped if the destination is an
        obstacle. When the enemy already stands on the target it stays put.
        """
        dx = target_x - self.x
        dy = target_y - self.y

        # Already on the target: do not move. Without this guard the vertical
        # branch below would step the enemy off the cell it shares with
        # Pac-Man before the collision check runs.
        if dx == 0 and dy == 0:
            return

        if abs(dx) > abs(dy):
            new_x = self.x + (1 if dx > 0 else -1)
            if not is_obstacle(new_x, self.y):
                self.x = new_x
        else:
            new_y = self.y + (1 if dy > 0 else -1)
            if not is_obstacle(self.x, new_y):
                self.y = new_y

    def smart_move(self, pacman):
        """Chase ``pacman`` with probability 0.7, otherwise take a random step."""
        if random.random() < 0.7:
            self.move_towards(pacman.x, pacman.y)
        else:
            self.move_random()

    def move_random(self):
        """Try one step in a uniformly chosen direction; stay put if it is blocked."""
        action = random.choice(actions)
        new_x, new_y = self.x, self.y
        if action == 'UP' and not is_obstacle(self.x, self.y - 1):
            new_y -= 1
        elif action == 'DOWN' and not is_obstacle(self.x, self.y + 1):
            new_y += 1
        elif action == 'LEFT' and not is_obstacle(self.x - 1, self.y):
            new_x -= 1
        elif action == 'RIGHT' and not is_obstacle(self.x + 1, self.y):
            new_x += 1
        self.x, self.y = new_x, new_y


***12.Reward function***
*  Reward Shaping
  Guide agent behavior through multi-dimensional rewards: food collection > safe distance > exploration reward
  Exponential decay design makes close targets have a greater impact

  
*  State space compression
  Discretize continuous coordinates into modulo 3 values to reduce state space complexity
  Handle food/enemy quantity in stages to prevent dimensionality explosion


*  Dynamic difficulty adjustment
  The less remaining food, the higher the reward, which encourages rapid collection in the later stage
  Enemy distance penalty increases nonlinearly with proximity


*  Exploration incentive mechanism
  New area reward is negatively correlated with global exploration ratio to avoid over-exploration in the later stage
  Path diversity reward prevents local loops

In [None]:
# Reward function
def calculate_reward(pacman, food, enemies):
    """Compute the shaped reward for Pac-Man's current cell.

    Collects food at Pac-Man's position as a side effect (via
    ``food.collect``).

    Args:
        pacman: object with ``x``, ``y``, ``visited`` and ``path_history``.
        food: object with ``positions`` and ``collect(x, y)``.
        enemies: iterable of objects with ``x`` and ``y``.

    Returns:
        The sum of: a base reward of 2, a food pickup bonus, a food-proximity
        bonus, two enemy-proximity penalties, an exploration bonus or revisit
        penalty, a bonus for not being where Pac-Man was two moves ago, and a
        bonus for standing on the centre cell once no food is left.
    """
    px, py = pacman.x, pacman.y
    food_before = len(food.positions)  # counted before a possible pickup

    total = 2  # basic movement reward

    # Pickup bonus grows as less food remains and is tripled below 5 items.
    if food.collect(px, py):
        pickup = 300 + 50 * (10 - food_before)
        if food_before < 5:
            pickup *= 3
        total += pickup

    def steps_to(cx, cy):
        """Manhattan distance from Pac-Man to cell (cx, cy)."""
        return abs(cx - px) + abs(cy - py)

    # Nearest enemy / nearest remaining food; 15 stands in when there is none.
    enemy_dist = min(steps_to(foe.x, foe.y) for foe in enemies) if enemies else 15
    gold_dist = min(steps_to(spot[0], spot[1]) for spot in food.positions) if food.positions else 15

    # Exponentially decaying attraction to food and repulsion from enemies.
    total += 50 * np.exp(-gold_dist / 3)
    total -= 40 * np.exp(-enemy_dist / 1.5)

    # Exploration term, scaled by the number of visited cells over 200.
    visit_ratio = len(pacman.visited) / 200
    if (px, py) in pacman.visited:
        total -= max(1, 3 * visit_ratio)
    else:
        total += 35 * (1 - visit_ratio) ** 2

    # Bonus when the latest position differs from the one two moves earlier.
    trail = pacman.path_history
    if len(trail) >= 3 and trail[-1] != trail[-3]:
        total += 10

    # Additional steep penalty inside a 6-step radius of the nearest enemy.
    if enemy_dist < 6:
        total -= 60 * (6 - enemy_dist) ** 1.5

    # Bonus for being on the centre cell after all food is gone.
    if not food.positions and (px, py) == (GRID_WIDTH // 2, GRID_HEIGHT // 2):
        total += 800

    return total

***13.Game object initialization***


  *  Pacman includes location, movement logic and status records (such as access history).
  *  Food manages food location and provides reset and collection methods.
  *  Enemy implements a mixed strategy with a 70% probability of tracking Pacman and a 30% random movement.
  *  InfoPanel displays training indicators such as round number, score, exploration rate, etc. in real time.

Q-learning parameter settings

In [None]:
# Game object initialization (the agent, the food, two enemies and the side panel)
pacman = Pacman()
food = Food()
enemies = [Enemy(), Enemy()]
info_panel = InfoPanel()

# Status Dimension: size of each of the 8 components returned by Pacman.get_state()
state_dims = (3, 3, 2, 2, 5, 2, 3, 5)
q_table = DoubleQTable(state_dims, action_size=len(actions))

# Training limits
max_episodes = 1000
max_steps = 1000

# Per-episode histories filled in by the training loop
episode_rewards, episode_avg_q, success_count, epsilon_history = [], [], [], []

# The five most recent log lines, shown in the information panel
log_messages = deque(maxlen=5)

***14.Training loop framework***


Function: Control the overall training rounds (max_episodes), reset the game environment (Pac-Man, food, enemy position) in each round.

Key parameters:

   1.  total_reward: Accumulate the reward of this round.

   2.  episode_q: Store the Q value update error of this round (for subsequent analysis).

   3.  success: Mark whether the goal is successfully completed in this round.

*   Action selection (ε-greedy strategy)

Function: According to the current exploration rate epsilon, randomly select actions (exploration) with a certain probability, otherwise select the action with the largest Q value (exploitation).

Key parameters:
epsilon: Dynamically adjusted exploration rate, initially 1.0, decays with training.

*  Environment interaction and reward calculation


Function:
Execute actions and update Pac-Man position.

The enemy moves according to the mixed strategy (70% tracking + 30% random).

Calculate instant rewards, including gold coin collection, distance penalty, exploration reward, etc.

Determine the termination condition: the episode ends in failure when Pac-Man touches an enemy, and ends in success as soon as all gold coins have been collected.

*   Experience Replay (Prioritized Experience Replay, PER)
Function:

Store experience: Store each step (state, action, reward, new state, termination flag, TD error) in the buffer.

*   Priority sampling: Sample experience according to the priority of TD error to accelerate the learning of important experience.

Q value update: Use dual Q networks (main and target) to calculate the target value and update the main network parameters.

Key parameters:

  1.  gamma = 0.97: discount factor, balancing the importance of future rewards.

  2.  alpha: dynamically adjusted learning rate.

  3.  batch_size = 256: the number of experiences sampled each time.

In [None]:
# Optimizing the training loop
# One iteration per episode: reset the world, then act / learn / render step by
# step until Pac-Man is caught, all food is collected, or max_steps is reached.
for episode in range(1, max_episodes + 1):
    pacman.reset()
    food.reset()
    for e in enemies:
        e.reset()
    total_reward = 0
    steps = 0
    episode_q = []
    success = 0

    while steps < max_steps:
        # Drain the event queue every frame; otherwise the window stops
        # responding during training and cannot be closed.
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()

        state = pacman.get_state()

        # ε-greedy policy: explore with probability epsilon, otherwise exploit
        if random.random() < epsilon:
            action = random.choice(actions)
            action_idx = actions.index(action)
        else:
            action_idx = np.argmax(q_table.main[state])
            action = actions[action_idx]

        # Execute an action
        pacman.move(action)

        # Enemy moves
        for e in enemies:
            e.smart_move(pacman)

        # Calculate reward
        reward = calculate_reward(pacman, food, enemies)
        total_reward += reward

        # Get new state; the episode ends on a collision or when no food is left
        next_state = pacman.get_state()
        done = any(pacman.x == e.x and pacman.y == e.y for e in enemies)
        if not food.positions and not done:
            success = 1
            done = True

        # Store experience. A terminal transition has no future value, so the
        # recorded TD error bootstraps from the next state only when not done.
        future_value = 0.0 if done else gamma * np.max(q_table.target[next_state])
        td_error = abs(reward + future_value - q_table.main[state][action_idx])
        replay_buffer.add((state, action_idx, reward, next_state, done, td_error))

        # Replay experience
        if len(replay_buffer.buffer) >= batch_size:
            batch, indices, weights = replay_buffer.sample(batch_size)
            new_priorities = []

            # Use enumerate to get index i and sample exp
            for i, exp in enumerate(batch):
                s, a_idx, r, ns, d, _ = exp

                # State clipping (make sure every component is a valid table index)
                s = tuple(
                    max(0, min(int(x), dim_size - 1))
                    for x, dim_size in zip(s, state_dims))

                ns = tuple(
                    max(0, min(int(x), dim_size - 1))
                    for x, dim_size in zip(ns, state_dims))

                # Calculate target value and TD error
                # NOTE(review): the target takes the max over the target table;
                # it does not select the action with the main table first.
                target = r + (1 - d) * gamma * np.max(q_table.target[ns])
                delta = abs(target - q_table.main[s][a_idx])

                # Importance-weighted update of the main table
                q_table.main[s][a_idx] += alpha * (target - q_table.main[s][a_idx]) * weights[i]

                new_priorities.append(delta + 1e-5)
                episode_q.append(delta)

            # Update priorities once, after the whole batch has been processed
            # (doing this inside the sample loop repeated the write-back for
            # every sample, i.e. quadratic work in the batch size).
            for idx, prio in zip(indices, new_priorities):
                replay_buffer.priorities[idx] = prio

        # Synchronize target network
        if steps % target_update == 0:
            q_table.update_target()

        # Rendering
        screen.fill(COLORS['GRAY'])
        for y, row in enumerate(maze):
            for x, char in enumerate(row):
                if char == '#':
                    pygame.draw.rect(screen, COLORS['BROWN'], (x * GRID_SIZE, y * GRID_SIZE, GRID_SIZE, GRID_SIZE))
        for pos in food.positions:
            screen.blit(gold_img, (pos[0] * GRID_SIZE, pos[1] * GRID_SIZE))
        screen.blit(player_img, (pacman.x * GRID_SIZE, pacman.y * GRID_SIZE))
        for e in enemies:
            screen.blit(enemy_img, (e.x * GRID_SIZE, e.y * GRID_SIZE))

        avg_q = np.mean(episode_q) if episode_q else 0
        info_panel.draw(screen, episode, total_reward, epsilon, steps, avg_q, success_count)
        pygame.display.flip()
        clock.tick(FPS)

        if done:
            break
        steps += 1

***15.Dynamic parameter adjustment***
  1.  Function:

  Learning rate adjustment: gradually reduce the learning rate through cosine annealing to balance training stability and speed.

  2.  Exploration rate adjustment:

  Basic decay: epsilon *= 0.9995, gradually reduce the exploration rate.

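  Dynamic adjustment: every 100 rounds, if the success rate of the last 100 rounds is less than 25% (success_threshold = 0.25), the exploration rate is temporarily boosted (epsilon *= boost_factor, i.e. 1.5, with the boosted value capped at 0.3) to prevent local optimality. In addition, every 400 rounds epsilon is multiplied by 1.3 (capped at 0.4).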

In [None]:
 # Dynamic parameter adjustment
    # Cosine annealing of the learning rate over the whole run: starts at 0.7
    # and falls monotonically towards min_alpha at the last episode. (A fixed
    # period of 500 made the rate climb back to 0.7 during episodes 500-1000.)
    alpha = max(min_alpha, 0.5 * (1 + np.cos(episode / max_episodes * np.pi)) * 0.7)
    if len(success_count) >= 100 and np.mean(success_count[-100:]) > 0.3:
        # Slightly smaller steps once the agent succeeds regularly, but never
        # below the configured floor.
        alpha = max(min_alpha, alpha * 0.995)

    # ε strategy adjustment: regular multiplicative decay down to the adaptive floor
    epsilon = max(adaptive_epsilon_min, epsilon * epsilon_decay)
    if episode % 100 == 0 and len(success_count) >= 100:
        recent_success = np.mean(success_count[-100:])
        if recent_success < success_threshold:
            # Boost exploration when the agent is struggling. The boosted value
            # is capped at 0.3, and the outer max() makes sure a "boost" can
            # never reduce an epsilon that is still above that cap.
            epsilon = max(epsilon, min(0.3, epsilon * boost_factor))
            adaptive_epsilon_min = min(0.05, adaptive_epsilon_min * 1.1)
        else:
            adaptive_epsilon_min = max(0.005, adaptive_epsilon_min * 0.95)

    if episode % 400 == 0:
        # Periodic exploration bump (capped at 0.4); again never a reduction.
        epsilon = max(epsilon, min(0.4, max(adaptive_epsilon_min, epsilon * 1.3)))


***16.Data recording and model saving***


  Function:

  Record the success status, cumulative rewards, average Q value changes and exploration rate of each round.

  Save Q table parameters (q_table_ep{episode}.npy) every 100 rounds for subsequent analysis and continued training.

In [None]:
# Recording Data
    # Append this episode's outcome to the history lists
    mean_q_delta = np.mean(episode_q) if episode_q else 0
    episode_avg_q.append(mean_q_delta)
    episode_rewards.append(total_reward)
    success_count.append(success)
    epsilon_history.append(epsilon)

    # One summary line per episode: to the logger and to the on-screen log panel
    log_msg = f"Ep {episode} | Reward: {total_reward:.1f} | Avg QΔ: {episode_avg_q[-1]:.2f} | ε: {epsilon:.4f} | Success: {success}"
    logging.info(log_msg)
    log_messages.append(log_msg)

    # Checkpoint the main Q-table every 100th episode
    if not episode % 100:
        np.save(f"q_table_ep{episode}.npy", q_table.main)

***17.Visualization and Rendering***


  1.  Features:

  Use Pygame to render mazes, coins, Pac-Man, and enemies in real time.

  Information panel displays training metrics (rounds, rewards, exploration rate, etc.).

  Frame rate is locked to 30 FPS to ensure smooth interaction.
  2.  Draw four charts during the training process:


  Reward trend chart: shows the original reward and moving average.


  Q value dynamic chart: shows the change of Q value.


  Exploration rate evolution chart: shows the change of epsilon.

  
  Maze exploration heat map: shows the distribution of Pac-Man's visit times.

In [None]:
# Visualization enhancement
def plot_training():
    """Draw the 2x2 training report, save it and show it.

    Panels: raw episode rewards with a moving average, mean Q-value change per
    episode, exploration rate on a log axis, and a heat map of
    ``log(visits + 1)`` per maze cell. The figure is written to
    ``training_report_v2.png`` before being displayed.

    Reads the module-level training histories (``episode_rewards``,
    ``episode_avg_q``, ``epsilon_history``) and ``pacman.visited_grids``.
    """
    # Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
    # 3.8 removed the old names, so use whichever one this installation has.
    # The style is selected before the figure is created so it applies to it.
    for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    plt.figure(figsize=(18, 12))

    # 1. Reward Trend Chart
    plt.subplot(2, 2, 1)
    plt.plot(episode_rewards, alpha=0.3, color='#1f77b4', label='Raw')
    # Shrink the averaging window for short runs (np.convolve rejects empty
    # input and mis-sizes the result when the kernel is longer than the data).
    window = min(100, len(episode_rewards))
    if window:
        reward_ma = np.convolve(episode_rewards, np.ones(window) / window, mode='valid')
        # Each average is plotted at the last episode of its window so the
        # curve lines up with the raw rewards.
        ma_x = np.arange(window - 1, len(episode_rewards))
        plt.plot(ma_x, reward_ma, color='#ff7f0e', linewidth=2, label=f'MA{window}')
    plt.title("Reward Trend", fontsize=14)
    plt.legend()
    plt.grid(alpha=0.4)

    # 2. Q value dynamic diagram
    plt.subplot(2, 2, 2)
    plt.plot(episode_avg_q, color='#2ca02c', linewidth=2)
    plt.title("Q-Value Dynamics", fontsize=14)
    plt.grid(alpha=0.4)

    # 3. Exploration rate evolution graph
    plt.subplot(2, 2, 3)
    plt.plot(epsilon_history, color='#d62728')
    plt.yscale('log')
    plt.title("Exploration Rate (ε)", fontsize=14)
    plt.grid(alpha=0.4)

    # 4. Maze exploration heat map (transposed so rows correspond to y)
    plt.subplot(2, 2, 4)
    log_visited = np.log1p(pacman.visited_grids.T)
    sns.heatmap(
        log_visited,
        cmap="YlGnBu",
        annot=False,
        cbar_kws={'label': 'log(Visits + 1)'},
        square=True,
        linewidths=0.5,
        linecolor='white'
    )
    plt.title(f"Maze Exploration Heatmap (Episodes={max_episodes})", fontsize=14)
    # Label every fifth cell with its pixel coordinate
    plt.xticks(
        np.arange(0, GRID_WIDTH, 5),
        labels=np.arange(0, GRID_WIDTH, 5) * GRID_SIZE,
        rotation=45
    )
    plt.yticks(
        np.arange(0, GRID_HEIGHT, 5),
        labels=np.arange(0, GRID_HEIGHT, 5) * GRID_SIZE
    )
    plt.xlabel("X Coordinate (pixels)", fontsize=12)
    plt.ylabel("Y Coordinate (pixels)", fontsize=12)

    # Unified storage
    plt.tight_layout()
    plt.savefig('training_report_v2.png', dpi=300, bbox_inches='tight')
    plt.show()

plot_training()

# Keep Window: poll the event queue about five times per second until the
# window is closed, then shut pygame down.
running = True
while running:
    close_requested = any(event.type == pygame.QUIT for event in pygame.event.get())
    running = not close_requested
    clock.tick(5)

pygame.quit()