In [12]:
!pip install gymnasium[atari,accept-rom-license] ale-py numpy tensorflow matplotlib opencv-python





In [13]:
import gymnasium as gym
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import cv2
import ale_py
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
# tf.config.experimental.set_memory_growth(tf.config.experimental.list_physical_devices('GPU')[0], True)  # Enable memory growth for GPU

In [14]:
# Hyperparameters
GAMMA = 0.99  # Discount factor
EPSILON = 1.0  # Initial exploration rate
EPSILON_MIN = 0.1  # Minimum exploration rate
EPSILON_DECAY = 0.99995  # Decay rate
LEARNING_RATE = 0.00025  # Learning rate
MEMORY_SIZE = 5000  # Experience replay buffer size
BATCH_SIZE = 64  # Batch size
TARGET_UPDATE_FREQ = 10  # Target model update frequency
EPISODES = 5000  # Total training episodes
LOAD_MODEL = False

In [15]:
# Create Pac-Man environment
env = gym.make("ALE/MsPacman-v5", render_mode="rgb_array")
state_shape = (88, 80, 1)  # Resized grayscale shape
# action_size = env.action_space.n
action_size = 5 # nope, up, right, down, left

In [16]:
# Function to preprocess frames
def preprocess_state(state):
    """Convert RGB to grayscale and resize."""
    state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)  # Convert to grayscale
    state = cv2.resize(state, (80, 88))  # Resize
    return np.expand_dims(state, axis=-1) / 255.0  # Normalize

In [17]:
# Build the DQN model with GPU optimization
def build_model():
    model = Sequential([
        Conv2D(32, (8, 8), strides=(4, 4), activation="relu", input_shape=state_shape),
        Conv2D(64, (4, 4), strides=(2, 2), activation="relu"),
        Conv2D(64, (3, 3), strides=(1, 1), activation="relu"),
        Flatten(),
        Dense(512, activation="relu"),
        Dense(action_size, activation="linear")  # Q-values output
    ])
    model.compile(optimizer=Adam(learning_rate=LEARNING_RATE), loss="mse")
    return model

In [18]:
# DQN Agent
class DQNAgent:
    def __init__(self):
        self.model = build_model()
        self.target_model = build_model()
        self.target_model.set_weights(self.model.get_weights())  # Sync target model
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.epsilon = EPSILON
        self.state_memory = np.zeros((MEMORY_SIZE, *state_shape), dtype=np.float32)
        self.next_state_memory = np.zeros((MEMORY_SIZE, *state_shape), dtype=np.float32)
        self.action_memory = np.zeros(MEMORY_SIZE, dtype=np.int32)
        self.reward_memory = np.zeros(MEMORY_SIZE, dtype=np.float32)
        self.done_memory = np.zeros(MEMORY_SIZE, dtype=np.bool)
        self.memory_counter = 0

    def act(self, state):
        """Choose action using ε-greedy strategy."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(action_size)  # Random action (exploration)
        q_values = self.model.predict(np.expand_dims(state, axis=0), verbose=0)
        return np.argmax(q_values[0])  # Best action (exploitation)

    def remember(self, state, action, reward, next_state, done):
        """Store experience in memory."""
        index = self.memory_counter % MEMORY_SIZE
        self.state_memory[index] = state
        self.next_state_memory[index] = next_state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.done_memory[index] = done
        self.memory_counter += 1

    @tf.function
    def replay(self):
        """Train the model using experience replay."""
        if self.memory_counter < BATCH_SIZE:
            return
        max_mem = min(self.memory_counter, MEMORY_SIZE)
        batch_indices = np.random.choice(max_mem, BATCH_SIZE, replace=False)
        states = self.state_memory[batch_indices]
        next_states = self.next_state_memory[batch_indices]
        actions = self.action_memory[batch_indices]
        rewards = self.reward_memory[batch_indices]
        dones = self.done_memory[batch_indices]

        targets = self.model.predict(states, verbose=0)
        next_q_values = self.target_model.predict(next_states, verbose=0)
        for i in range(BATCH_SIZE):
            if dones[i]:
                targets[i, actions[i]] = rewards[i]
            else:
                targets[i, actions[i]] = rewards[i] + GAMMA * np.max(next_q_values[i])

        # Train model in batches
        self.model.fit(states, targets, epochs=1, verbose=0, batch_size=BATCH_SIZE)

        if self.epsilon > EPSILON_MIN:
            self.epsilon *= EPSILON_DECAY  # Decay exploration rate

    def update_target_model(self):
        """Update target model weights."""
        self.target_model.set_weights(self.model.get_weights())

In [19]:
# Train the agent
agent = DQNAgent()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [20]:
# Visualization function for Colab
def show_frame(frame):
    plt.imshow(frame)
    plt.axis("off")
    plt.show()

In [21]:
# Training loop
def train():
    with open('training_log.txt', 'w') as f:
        f.write("Episode\tScore\tEpsilon\n")
        for episode in range(EPISODES):
            state = preprocess_state(env.reset()[0])
            done = False
            total_reward = 0

            while not done:
                action = agent.act(state)
                next_state, reward, done, _, _ = env.step(action)
                next_state = preprocess_state(next_state)
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward
                agent.replay()

            # Update target network periodically
            if episode % TARGET_UPDATE_FREQ == 0:
                agent.update_target_model()

            log_print = f"{episode + 1}/{EPISODES}\t{total_reward}\t{agent.epsilon:.4f}\n"
            log = f"{episode + 1}\t{total_reward}\t{agent.epsilon:.4f}\n"
            print(log_print)
            f.write(log)

        env.close()

In [22]:
if LOAD_MODEL:
    agent.model = load_model("pacman_dqn.keras")
else:
    train()
    agent.model.save("pacman_dqn.keras")

1/5000	220.0	1.0000

2/5000	260.0	1.0000

3/5000	400.0	1.0000

4/5000	290.0	1.0000

5/5000	80.0	1.0000

6/5000	620.0	1.0000

7/5000	200.0	1.0000

8/5000	290.0	1.0000

9/5000	230.0	1.0000

10/5000	410.0	1.0000

11/5000	130.0	1.0000

12/5000	180.0	1.0000

13/5000	220.0	1.0000

14/5000	330.0	1.0000

15/5000	620.0	1.0000

16/5000	170.0	1.0000

17/5000	550.0	1.0000

18/5000	250.0	1.0000

19/5000	150.0	1.0000

20/5000	370.0	1.0000

21/5000	1060.0	1.0000

22/5000	650.0	1.0000

23/5000	180.0	1.0000

24/5000	180.0	1.0000

25/5000	400.0	1.0000

26/5000	250.0	1.0000

27/5000	230.0	1.0000

28/5000	140.0	1.0000

29/5000	170.0	1.0000

30/5000	320.0	1.0000

31/5000	200.0	1.0000

32/5000	330.0	1.0000

33/5000	260.0	1.0000

34/5000	170.0	1.0000

35/5000	220.0	1.0000

36/5000	240.0	1.0000

37/5000	220.0	1.0000

38/5000	160.0	1.0000

39/5000	190.0	1.0000

40/5000	230.0	1.0000

41/5000	380.0	1.0000

42/5000	350.0	1.0000

43/5000	160.0	1.0000

44/5000	350.0	1.0000

45/5000	250.0	1.0000

46/5000	630.0	1.000