In [18]:
# Base file name for the saved model; interpolated into "./models/{FNAME}" when the agent is saved.
# NOTE(review): the name mentions "empty_16x16" while this notebook trains on AlienNoFrameskip-v4 — confirm it is intended.
FNAME = "atari_empty_16x16_plain_dqn_1"
import numpy as np

import torch
import torch.nn as nn

from stable_baselines3 import DQN
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack


import gymnasium as gym
import ale_py
# Register the environments shipped by ale_py with Gymnasium.
gym.register_envs(ale_py)

from gymnasium.wrappers import FrameStackObservation, ClipReward

from IPython import display

In [19]:
# Training environment: 4 parallel copies of Alien, each observation being a
# stack of the 4 most recent frames.
env = VecFrameStack(
    make_atari_env("AlienNoFrameskip-v4", n_envs=4),  # a seed can be passed here
    n_stack=4,
)

# Separate evaluation environment built the same way.
eval_env = VecFrameStack(
    make_atari_env("AlienNoFrameskip-v4", n_envs=4),  # a seed can be passed here; use one different from env's
    n_stack=4,
)

In [28]:
# Directory that receives the evaluation logs and the best-model checkpoint.
log_path = "./logs/sb3_atari_dqn_1"

# Extra keyword arguments forwarded to the policy; empty means library defaults.
policy_kwargs = {}
# Optional overrides, currently disabled:
# policy_kwargs.update(num_agent=1)
# policy_kwargs.update(action_select_coef=50)

In [23]:
# Periodic evaluation during training.
# The callback must run on the dedicated `eval_env`, not on the training `env`:
# evaluating on the training env resets it in the middle of data collection and
# corrupts the rollouts the agent is learning from.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=log_path,  # best checkpoint is written next to the logs
    log_path=log_path,
    # The callback is invoked once per vectorized step, i.e. once every
    # `env.num_envs` environment steps, so the desired 5000-step period is
    # divided by the number of parallel training envs (at least 1).
    eval_freq=max(5000 // env.num_envs, 1),
    deterministic=True,
    render=True,
)

In [24]:
# Training budget and update schedule used when the DQN model is built.
timesteps = 20_000  # total training timesteps; also reused as the replay-buffer size
replay_ratio = 1  # passed to DQN as `gradient_steps`

In [29]:
# Build a DQN agent with the CNN policy on the frame-stacked training env.
model = DQN(
    "CnnPolicy",
    env,
    policy_kwargs=policy_kwargs,
    # Replay buffer sized to the full training budget, filled for 2000 steps
    # before the first gradient update.
    buffer_size=timesteps,
    learning_starts=2000,
    # One training call per collected step, each doing `replay_ratio` gradient steps.
    train_freq=(1, "step"),
    gradient_steps=replay_ratio,
    # Target network refreshed every step with a soft-update coefficient of 0.005.
    target_update_interval=1,
    tau=0.005,
    tensorboard_log="./dqn_atari_logs",
    verbose=1,
)
# need reset, reset_frequency and all_reset
# Train for the configured budget; the evaluation callback runs along the way.
model.learn(total_timesteps=timesteps, callback=eval_callback)

Using cuda device
Wrapping the env in a VecTransposeImage.




Logging to ./dqn_atari_logs/DQN_1




----------------------------------
| rollout/            |          |
|    exploration_rate | 0.4      |
| time/               |          |
|    episodes         | 4        |
|    fps              | 873      |
|    time_elapsed     | 1        |
|    total_timesteps  | 1264     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.36e+03 |
|    ep_rew_mean      | 170      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 805      |
|    time_elapsed     | 2        |
|    total_timesteps  | 2316     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0424   |
|    n_updates        | 78       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.7e+03  |
|    ep_rew_mean      | 245      |
|    exploration_rat

<stable_baselines3.dqn.dqn.DQN at 0x749e947c0610>

In [30]:
# Close both vectorized environments (training first, then evaluation).
for _finished_env in (env, eval_env):
    _finished_env.close()

In [31]:
# Persist the trained agent under ./models/, named after FNAME.
model_path = f"./models/{FNAME}"
model.save(model_path)

In [32]:
# Final evaluation and a short rendered rollout.
# The training `env` (also what `model.get_env()` wraps) and `eval_env` were
# closed in an earlier cell, so a fresh frame-stacked environment is built
# here instead of stepping an already-closed one. It mirrors the training
# setup: 4 parallel Alien envs with 4 stacked frames.
vec_env = make_atari_env("AlienNoFrameskip-v4", n_envs=4)
vec_env = VecFrameStack(vec_env, n_stack=4)

try:
    # Average return over 10 evaluation episodes.
    mean_reward, std_reward = evaluate_policy(model, vec_env, n_eval_episodes=10)

    # Watch the greedy policy for 1000 vectorized steps.
    obs = vec_env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = vec_env.step(action)
        vec_env.render("human")
finally:
    # Always release the environment, even if evaluation or rendering fails.
    vec_env.close()

print(f"mean_reward: {mean_reward}, std_reward:{std_reward}")

mean_reward: 302.0, std_reward:59.9666574022598
