In [1]:
# defend the center scenario
import sys
import os
sys.path.append('..') # make sure files dont violate this convention


from envs.doom_env import VizDoomGymCorridor, DoomEpisodeLoggerCallback
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import CheckpointCallback, CallbackList
from stable_baselines3 import PPO, A2C
import torch

# CONFIG STUFF HERE
MODEL_SCENARIO_NAME = 'ppo_deadly_corridor'
A2C_MODEL_SCENARIO_NAME = 'a2c_deadly_corridor'
SCENARIO_PATH = '../scenarios/deadly_corridor.cfg'
LOG_DIR = f'../logs/{MODEL_SCENARIO_NAME}/'
MODEL_DIR = f'../models/{MODEL_SCENARIO_NAME}/'

A2C_LOG_DIR = f'../logs/{A2C_MODEL_SCENARIO_NAME}/'
A2C_MODEL_DIR = f'../models/{A2C_MODEL_SCENARIO_NAME}/'
NUM_ENVS = 8
TOTAL_TIMESTEPS = 125_000
CHECKPOINT_TIMESTEPS = TOTAL_TIMESTEPS * 10 // 100

os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(A2C_LOG_DIR, exist_ok=True)
os.makedirs(A2C_MODEL_DIR, exist_ok=True)

In [2]:
# seeding the training
import random
import numpy as np
from stable_baselines3.common.utils import set_random_seed
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
set_random_seed(SEED)

In [3]:
env = make_vec_env(
    VizDoomGymCorridor, # my doom environment
    n_envs=NUM_ENVS,
    env_kwargs=dict(
        scenario_path=SCENARIO_PATH,
        render=False,
        number_of_actions=7,
    ),
    vec_env_cls=DummyVecEnv,  # Auto-selects SubprocVecEnv (Linux) or DummyVecEnv (Windows)
    seed=SEED
)

In [4]:
# setting loggers and checkpoints for the model
checkpoint_callback = CheckpointCallback(save_freq=CHECKPOINT_TIMESTEPS // NUM_ENVS, save_path=MODEL_DIR,name_prefix=MODEL_SCENARIO_NAME)
episode_logger = DoomEpisodeLoggerCallback(log_dir=LOG_DIR, log_file=f"{MODEL_SCENARIO_NAME}_episodes_v1_{TOTAL_TIMESTEPS}.csv", verbose=0)

combined_callback = CallbackList([checkpoint_callback, episode_logger])

# creating the model
model = PPO(
        "CnnPolicy",
        env,
        verbose=1,
        tensorboard_log=LOG_DIR,
        device="cuda" if torch.cuda.is_available() else "cpu",
        learning_rate=2.5e-4,
        n_steps=4096,
        batch_size=2048,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.1,
        ent_coef=0.05,
        vf_coef=0.4,
        max_grad_norm=0.5,
        seed=SEED
    )
env.reset()

Using cuda device
Wrapping the env in a VecTransposeImage.


array([[[[159],
         [160],
         [150],
         ...,
         [191],
         [195],
         [199]],

        [[159],
         [160],
         [150],
         ...,
         [191],
         [195],
         [199]],

        [[159],
         [160],
         [150],
         ...,
         [191],
         [195],
         [199]],

        ...,

        [[111],
         [ 91],
         [ 86],
         ...,
         [ 74],
         [ 68],
         [ 54]],

        [[104],
         [ 94],
         [ 87],
         ...,
         [ 65],
         [ 70],
         [ 46]],

        [[103],
         [ 82],
         [ 84],
         ...,
         [ 49],
         [ 51],
         [ 47]]],


       [[[159],
         [160],
         [150],
         ...,
         [191],
         [195],
         [199]],

        [[159],
         [160],
         [150],
         ...,
         [191],
         [195],
         [199]],

        [[159],
         [160],
         [150],
         ...,
         [191],
         [

2.6 hours for 100k steps

10.64 fps

Updated:

48 fps

45 mins for 100k


In [5]:
model.learn(
        total_timesteps= TOTAL_TIMESTEPS // NUM_ENVS * NUM_ENVS,
        callback=combined_callback,
        progress_bar=True,
        tb_log_name=f"{MODEL_SCENARIO_NAME}_{TOTAL_TIMESTEPS}"
)

model.save(os.path.join(MODEL_DIR, f"ppo_deadly_corridor_{TOTAL_TIMESTEPS}_final"))
env.close()
print(f"\nTraining complete. Model saved to {MODEL_DIR}")

Logging to ../logs/ppo_deadly_corridor/ppo_deadly_corridor_125000_1


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 105      |
|    ep_rew_mean     | -991     |
| time/              |          |
|    fps             | 47       |
|    iterations      | 1        |
|    time_elapsed    | 687      |
|    total_timesteps | 32768    |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 129        |
|    ep_rew_mean          | -1.17e+03  |
| time/                   |            |
|    fps                  | 46         |
|    iterations           | 2          |
|    time_elapsed         | 1399       |
|    total_timesteps      | 65536      |
| train/                  |            |
|    approx_kl            | 0.17323972 |
|    clip_fraction        | 0.623      |
|    clip_range           | 0.1        |
|    entropy_loss         | -1.89      |
|    explained_variance   | -0.000536  |
|    learning_rate        | 0.00025    |
|   


Training complete. Model saved to ../models/ppo_deadly_corridor/


In [4]:
# A2C Model training setup

a2c_checkpoint_callback = CheckpointCallback(
    save_freq=CHECKPOINT_TIMESTEPS // NUM_ENVS,
    save_path=A2C_MODEL_DIR,
    name_prefix=A2C_MODEL_SCENARIO_NAME
)
a2c_episode_logger = DoomEpisodeLoggerCallback(
    log_dir=A2C_LOG_DIR,
    log_file=f"{A2C_MODEL_SCENARIO_NAME}_episodes_v1_{TOTAL_TIMESTEPS}.csv",
    verbose=0
)
a2c_combined_callback = CallbackList([a2c_checkpoint_callback, a2c_episode_logger])

# creating the model
a2c_model = A2C(
    "CnnPolicy",
    env,
    verbose=1,
    tensorboard_log=A2C_LOG_DIR,
    device="cuda" if torch.cuda.is_available() else "cpu",
    learning_rate=2.5e-4,
    n_steps=4096,        # rollout length
    gamma=0.99,
    gae_lambda=0.95,
    ent_coef=0.05,
    vf_coef=0.4,
    max_grad_norm=0.5,
    seed=SEED
)

env.reset()

Using cuda device
Wrapping the env in a VecTransposeImage.


array([[[[159],
         [160],
         [150],
         ...,
         [191],
         [195],
         [199]],

        [[159],
         [160],
         [150],
         ...,
         [191],
         [195],
         [199]],

        [[159],
         [160],
         [150],
         ...,
         [191],
         [195],
         [199]],

        ...,

        [[111],
         [ 91],
         [ 86],
         ...,
         [ 74],
         [ 68],
         [ 54]],

        [[104],
         [ 94],
         [ 87],
         ...,
         [ 65],
         [ 70],
         [ 46]],

        [[103],
         [ 82],
         [ 84],
         ...,
         [ 49],
         [ 51],
         [ 47]]],


       [[[159],
         [160],
         [150],
         ...,
         [191],
         [195],
         [199]],

        [[159],
         [160],
         [150],
         ...,
         [191],
         [195],
         [199]],

        [[159],
         [160],
         [150],
         ...,
         [191],
         [

In [5]:
a2c_model.learn(
    total_timesteps=TOTAL_TIMESTEPS // NUM_ENVS * NUM_ENVS,
    callback=a2c_combined_callback,
    progress_bar=True,
    tb_log_name=f"{A2C_MODEL_SCENARIO_NAME}_{TOTAL_TIMESTEPS}"
)

a2c_model.save(os.path.join(A2C_MODEL_DIR, f"a2c_deadly_corridor_{TOTAL_TIMESTEPS}_final"))
env.close()
print(f"\nTraining complete. Model saved to {A2C_MODEL_DIR}")

Logging to ../logs/a2c_deadly_corridor/a2c_deadly_corridor_125000_1



Training complete. Model saved to ../models/a2c_deadly_corridor/


In [6]:
import time
# from vizdoom import ScreenResolution
# import cv2

# VIDEO_DIR = '../videos/'
# os.makedirs(VIDEO_DIR, exist_ok=True)
# VIDEO_PATH = os.path.join(VIDEO_DIR, "defend_center_ppo.mp4")

env = VizDoomGymCorridor(SCENARIO_PATH, render=True)

# model = PPO.load(f"{MODEL_DIR}ppo_deadly_corridor_125000_final")
model = A2C.load(f"{A2C_MODEL_DIR}a2c_deadly_corridor_125000_final")


for ep in range(5):
    obs, info = env.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        total_reward += reward
        time.sleep(0.02)
    
    # print(f"Episode {ep+1} reward: {total_reward}")
    # print(f"Video saved to {VIDEO_DIR}")

env.close()