In [5]:
import gymnasium as gym
# Smoke test: build the Kung-Fu Master environment with off-screen ("rgb_array")
# rendering and print it, which shows the chain of wrappers gym.make applied.
env = gym.make("ALE/KungFuMaster-v5", render_mode="rgb_array")
print(env)

<OrderEnforcing<PassiveEnvChecker<AtariEnv<ALE/KungFuMaster-v5>>>>


A.L.E: Arcade Learning Environment (version 0.11.1+2750686)
[Powered by Stella]


In [None]:
# main.py
import os
from collections import deque

import cv2
import gymnasium as gym
import numpy as np
from gymnasium.wrappers import RecordVideo

from agent import DQNAgent
from memory import ReplayBuffer

def preprocess_frame(frame):
    """Turn one RGB frame into the 84x84 float32 grayscale image used as network input.

    Args:
        frame: RGB image array as accepted by ``cv2.cvtColor`` with
            ``COLOR_RGB2GRAY`` (presumably the uint8 observation returned by the
            environment -- confirm against the caller).

    Returns:
        A float32 array of size 84x84 whose values are the resized grayscale
        pixels divided by 255 (i.e. in [0, 1] for 8-bit input).
    """
    target_size = (84, 84)
    # Collapse the three colour channels into a single luminance channel.
    grayscale = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    # INTER_AREA is the interpolation mode the original pipeline uses for shrinking.
    downsampled = cv2.resize(grayscale, target_size, interpolation=cv2.INTER_AREA)
    as_float = downsampled.astype(np.float32)
    return as_float / 255.0

def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess a raw frame and fold it into the rolling 4-frame history.

    Args:
        stacked_frames: ``deque`` holding the most recent preprocessed frames,
            or ``None`` when no history exists yet.
        state: Raw frame from the environment; passed to ``preprocess_frame``.
        is_new_episode: When true, the history is discarded and re-seeded with
            four copies of the current frame.

    Returns:
        ``(stacked_state, stacked_frames)`` where ``stacked_state`` is the
        history stacked along a new last axis (``np.stack(..., axis=2)``) and
        ``stacked_frames`` is the updated deque to pass to the next call.
    """
    frame = preprocess_frame(state)
    if is_new_episode or stacked_frames is None:
        # Seed the history directly with the first frame repeated four times.
        # A missing history is treated like a new episode instead of failing
        # with AttributeError on ``None.append``.
        stacked_frames = deque([frame] * 4, maxlen=4)
    else:
        # maxlen=4 makes the deque drop the oldest frame automatically.
        stacked_frames.append(frame)
    return np.stack(stacked_frames, axis=2), stacked_frames

def train(
    env_name="ALE/KungFuMaster-v5",
    episodes=250,
    max_steps=5000,
    buffer_size=100000,
    batch_size=32,
    video_folder="videos",
    checkpoint_dir="checkpoints",
    checkpoint_every=20,
):
    """Train a DQN agent on a Gymnasium environment, recording every episode.

    Args:
        env_name: Gymnasium environment id passed to ``gym.make``.
        episodes: Number of episodes to run.
        max_steps: Upper bound on environment steps per episode.
        buffer_size: Capacity handed to ``ReplayBuffer``.
        batch_size: Batch size handed to ``ReplayBuffer``; training starts once
            the buffer holds at least this many transitions.
        video_folder: Directory given to ``RecordVideo`` for episode videos.
        checkpoint_dir: Directory that receives model checkpoints.
        checkpoint_every: Save a checkpoint every this many episodes; a falsy
            value (``0`` / ``None``) disables checkpointing.

    Returns:
        None. Progress is printed once per episode and checkpoints are written
        to ``checkpoint_dir``.

    NOTE(review): each stored state is an 84x84x4 float32 array (~113 KB). If
    ``ReplayBuffer`` keeps both state and next-state as passed, a full buffer of
    100000 transitions needs on the order of 20 GB -- confirm how the buffer
    stores frames.
    """
    if checkpoint_every:
        # Create the target directory up front so a missing folder fails fast
        # rather than after the first `checkpoint_every` episodes of training.
        os.makedirs(checkpoint_dir, exist_ok=True)

    env = RecordVideo(
        gym.make(env_name, render_mode="rgb_array"),
        video_folder=video_folder,
        episode_trigger=lambda ep: True,  # record every episode
    )
    try:
        num_actions = env.action_space.n
        input_shape = (84, 84, 4)

        agent = DQNAgent(input_shape, num_actions)
        memory = ReplayBuffer(buffer_size, batch_size)

        total_step = 0
        for ep in range(1, episodes + 1):
            frame, _ = env.reset()
            state, stacked_frames = stack_frames(None, frame, True)
            ep_reward = 0

            for _step in range(max_steps):
                action = agent.act(state)
                next_frame, reward, terminated, truncated, _ = env.step(action)
                next_state, stacked_frames = stack_frames(stacked_frames, next_frame, False)

                # Store only true termination as the terminal flag. A truncated
                # episode (time limit) is not a terminal state, and the
                # `max_steps` cutoff below is likewise stored as non-terminal.
                memory.add(state, action, reward, next_state, terminated)
                state = next_state
                ep_reward += reward
                total_step += 1
                agent.step_count = total_step

                # Start learning as soon as one full batch can be sampled.
                if len(memory) >= batch_size:
                    agent.train_step(memory.sample())
                    agent.update_epsilon()
                    agent.maybe_update_target()

                # The episode itself still ends on either signal.
                if terminated or truncated:
                    break

            print(f"Episode: {ep}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}")

            if checkpoint_every and ep % checkpoint_every == 0:
                stem = os.path.join(checkpoint_dir, f"dqn_ep{ep}")
                agent.model.save_weights(f"{stem}.weights.h5")
                # The target network gets its own file; writing it to the same
                # path would overwrite the online network's weights saved above.
                agent.target_model.save_weights(f"{stem}_target.weights.h5")
                agent.model.save(f"{stem}.keras")
    finally:
        # Always close so the video recorder is shut down even if training fails.
        env.close()

# Entry point: start training with the default hyperparameters when this code
# runs as the main module (which is also the case when executed as a notebook cell).
if __name__ == "__main__":
    train()


  logger.warn(


Episode: 1, Reward: 300.00, Epsilon: 0.999
Episode: 2, Reward: 500.00, Epsilon: 0.998
Episode: 3, Reward: 100.00, Epsilon: 0.998
Episode: 4, Reward: 700.00, Epsilon: 0.997
Episode: 5, Reward: 400.00, Epsilon: 0.996
Episode: 6, Reward: 500.00, Epsilon: 0.995
Episode: 7, Reward: 400.00, Epsilon: 0.995
Episode: 8, Reward: 0.00, Epsilon: 0.994
Episode: 9, Reward: 1500.00, Epsilon: 0.993
Episode: 10, Reward: 200.00, Epsilon: 0.992
Episode: 11, Reward: 300.00, Epsilon: 0.991
Episode: 12, Reward: 700.00, Epsilon: 0.990
Episode: 13, Reward: 900.00, Epsilon: 0.989
Episode: 14, Reward: 300.00, Epsilon: 0.989
Episode: 15, Reward: 900.00, Epsilon: 0.988
Episode: 16, Reward: 800.00, Epsilon: 0.987
Episode: 17, Reward: 300.00, Epsilon: 0.986
Episode: 18, Reward: 200.00, Epsilon: 0.985
Episode: 19, Reward: 800.00, Epsilon: 0.984
Episode: 20, Reward: 600.00, Epsilon: 0.983
Episode: 21, Reward: 300.00, Epsilon: 0.982
Episode: 22, Reward: 300.00, Epsilon: 0.982
Episode: 23, Reward: 700.00, Epsilon: 0.98

In [16]:
# --- Build the recording environment used for evaluation ---
env = RecordVideo(
    gym.make("ALE/KungFuMaster-v5", render_mode="rgb_array"),
    video_folder="Evaluation_videos",
    episode_trigger=lambda ep: True,  # record every episode
)

# try/finally guarantees the environment (and its video recorder) is closed
# even if loading the checkpoint or a rollout raises.
try:
    num_actions = env.action_space.n
    input_shape = (84, 84, 4)

    # --- Load DQN Agent and Trained Weights ---
    # The same weights file is loaded into both the online and target networks.
    weights_path = "checkpoints/dqn_ep240.weights.h5"
    agent = DQNAgent(input_shape, num_actions)
    agent.model.load_weights(weights_path)
    agent.target_model.load_weights(weights_path)
    agent.epsilon = 0.0  # pure greedy during evaluation

    # --- Run Evaluation Episodes ---
    episodes_to_record = 10
    for ep in range(episodes_to_record):
        state, _ = env.reset()
        stacked_frames = None
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        done = False
        total_reward = 0

        # Play until the environment reports termination or truncation.
        while not done:
            action = agent.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            total_reward += reward

        print(f"Evaluation Episode {ep+1}: Total Reward = {total_reward:.1f}")
finally:
    env.close()


  logger.warn(


Evaluation Episode 1: Total Reward = 2400.0
Evaluation Episode 2: Total Reward = 2600.0
Evaluation Episode 3: Total Reward = 2100.0
Evaluation Episode 4: Total Reward = 3100.0
Evaluation Episode 5: Total Reward = 1400.0
Evaluation Episode 6: Total Reward = 3800.0
Evaluation Episode 7: Total Reward = 3200.0
Evaluation Episode 8: Total Reward = 1700.0
Evaluation Episode 9: Total Reward = 6000.0
Evaluation Episode 10: Total Reward = 1700.0
