In [None]:
import warnings
warnings.filterwarnings("ignore", message="Gym has been unmaintained since")
import gymnasium as gym
import numpy as np
from gymnasium import spaces

class HospitalEnv(gym.Env):
    def __init__(self, max_steps=30):
        super().__init__()

        self.max_steps = max_steps  # maximum steps per episode

        # Observation space: 10 features
        self.observation_space = spaces.Box(low=0, high=200, shape=(10,), dtype=np.float32)
        self.action_space = spaces.Discrete(3)  # RED=0, YELLOW=1, GREEN=2

        # Doctors
        self.num_doctors = 3
        self.doctor_timers = np.zeros(self.num_doctors)

        # Queues
        self.red_queue = []
        self.yellow_queue = []
        self.green_queue = []

        # Arrival tracking
        self.arrival_caps = {"red": 3, "yellow": 5, "green": 5}
        self.arrivals_done = {"red": 0, "yellow": 0, "green": 0}

        # Service times (lognormal)
        self.red_mu, self.red_sigma = self._lognormal(20, 6)
        self.yellow_mu, self.yellow_sigma = self._lognormal(12, 4)
        self.green_mu, self.green_sigma = self._lognormal(8, 2)

        # Step counter
        self.current_step = 0

        # Track last 5 actions for GREEN reward
        self.last_5_actions = []

        # Track last served wait times per category
        self.last_served_wait_times = {"red": 0, "yellow": 0, "green": 0}

    # -------------------------------------------
    # Lognormal conversion
    # -------------------------------------------
    def _lognormal(self, mean, std):
        variance = std ** 2
        mu = np.log(mean**2 / np.sqrt(variance + mean**2))
        sigma = np.sqrt(np.log(1 + variance / (mean**2)))
        return mu, sigma

    # -------------------------------------------
    # Reset environment
    # -------------------------------------------
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.red_queue = []
        self.yellow_queue = []
        self.green_queue = []
        self.arrivals_done = {"red": 0, "yellow": 0, "green": 0}
        self.doctor_timers[:] = 0
        self.current_step = 0
        self.last_5_actions = []
        self.last_served_wait_times = {"red": 0, "yellow": 0, "green": 0}
        obs = self._get_obs()
        return obs, {}

    # -------------------------------------------
    # Observation
    # -------------------------------------------
    def _get_obs(self):
        return np.array([
            np.sum(self.doctor_timers == 0),  # free doctors
            max(self.red_queue) if self.red_queue else 0,
            max(self.yellow_queue) if self.yellow_queue else 0,
            max(self.green_queue) if self.green_queue else 0,
            len(self.red_queue),
            len(self.yellow_queue),
            len(self.green_queue),
            self.doctor_timers[0],
            self.doctor_timers[1],
            self.doctor_timers[2],
        ], dtype=np.float32)

    # -------------------------------------------
    # Sample service time
    # -------------------------------------------
    def _sample_service(self, action):
        if action == 0:
            return np.random.lognormal(self.red_mu, self.red_sigma)
        elif action == 1:
            return np.random.lognormal(self.yellow_mu, self.yellow_sigma)
        else:
            return np.random.lognormal(self.green_mu, self.green_sigma)

    # -------------------------------------------
    # Add new arrivals
    # -------------------------------------------
    def _add_new_arrivals(self):
        new_red = np.random.poisson(lam=1)
        new_yellow = np.random.poisson(lam=3)
        new_green = np.random.poisson(lam=3)

        self.red_queue.extend([0] * new_red)
        self.yellow_queue.extend([0] * new_yellow)
        self.green_queue.extend([0] * new_green)

        # Increase waiting time of all existing patients
        self.red_queue = [w + 1 for w in self.red_queue]
        self.yellow_queue = [w + 1 for w in self.yellow_queue]
        self.green_queue = [w + 1 for w in self.green_queue]

    # -------------------------------------------
    # Step function
    # -------------------------------------------
    def step(self, action):
        self.current_step += 1

        # Wait until at least one doctor is free
        free_doctors = np.where(self.doctor_timers == 0)[0]
        if len(free_doctors) == 0:
            min_timer = min([t for t in self.doctor_timers if t > 0])
            self.doctor_timers = np.maximum(0, self.doctor_timers - min_timer)
            free_doctors = np.where(self.doctor_timers == 0)[0]

        doctor = free_doctors[0]

        # Map action to queue
        queue_map = {0: ("red", self.red_queue),
                     1: ("yellow", self.yellow_queue),
                     2: ("green", self.green_queue)}
        cat_name, queue = queue_map[action]

        # If queue empty → 0 reward
        if len(queue) == 0:
            reward = 0
        else:
            wait_time = queue.pop(0)
            dt = self._sample_service(action)
            self.doctor_timers[doctor] = dt

            # Advance other doctors
            for i in range(self.num_doctors):
                if i != doctor:
                    self.doctor_timers[i] = max(0, self.doctor_timers[i] - dt)

            # Reward calculation
            reward_map = {"red": 25, "yellow": 15, "green": 5}
            reward = reward_map[cat_name] + 1

            threshold_times = {"red": 5, "yellow": 15, "green": 30}
            if wait_time <= threshold_times[cat_name]:
                reward += 5

            # --- Save last served wait time for this category ---
            self.last_served_wait_times[cat_name] = wait_time

        # GREEN patient reward for recent service (fairness)
        self.last_5_actions.append(action)
        if len(self.last_5_actions) > 5:
            self.last_5_actions.pop(0)
        if 2 in self.last_5_actions:
            reward += 8

        

        # Add new arrivals
        self._add_new_arrivals()

        # Truncated if max steps reached
        truncated = self.current_step >= self.max_steps

        return self._get_obs(), reward, False, truncated, {}


In [3]:
import warnings
warnings.filterwarnings("ignore", message="Gym has been unmaintained since")
import gymnasium as gym
import numpy as np
from gymnasium import spaces

class HospitalEnv(gym.Env):
    def __init__(self, max_steps=30):
        super().__init__()

        self.max_steps = max_steps  # maximum steps per episode

        # Observation space: 10 features
        self.observation_space = spaces.Box(low=0, high=200, shape=(10,), dtype=np.float32)
        self.action_space = spaces.Discrete(3)  # RED=0, YELLOW=1, GREEN=2

        # Doctors
        self.num_doctors = 3
        self.doctor_timers = np.zeros(self.num_doctors)

        # Queues
        self.red_queue = []
        self.yellow_queue = []
        self.green_queue = []

        # Arrival tracking
        self.arrival_caps = {"red": 3, "yellow": 5, "green": 5}
        self.arrivals_done = {"red": 0, "yellow": 0, "green": 0}

        # Service times (lognormal)
        self.red_mu, self.red_sigma = self._lognormal(20, 6)
        self.yellow_mu, self.yellow_sigma = self._lognormal(12, 4)
        self.green_mu, self.green_sigma = self._lognormal(8, 2)

        # Step counter
        self.current_step = 0

        # Track last 5 actions for GREEN reward
        self.last_5_actions = []

        # Track last served wait times per category
        self.last_served_wait_times = {"red": 0, "yellow": 0, "green": 0}

    # -------------------------------------------
    # Lognormal conversion
    # -------------------------------------------
    def _lognormal(self, mean, std):
        variance = std ** 2
        mu = np.log(mean**2 / np.sqrt(variance + mean**2))
        sigma = np.sqrt(np.log(1 + variance / (mean**2)))
        return mu, sigma

    # -------------------------------------------
    # Reset environment
    # -------------------------------------------
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.red_queue = []
        self.yellow_queue = []
        self.green_queue = []
        self.arrivals_done = {"red": 0, "yellow": 0, "green": 0}
        self.doctor_timers[:] = 0
        self.current_step = 0
        self.last_5_actions = []
        self.last_served_wait_times = {"red": 0, "yellow": 0, "green": 0}
        obs = self._get_obs()
        return obs, {}

    # -------------------------------------------
    # Observation
    # -------------------------------------------
    def _get_obs(self):
        return np.array([
            np.sum(self.doctor_timers == 0),  # free doctors
            max(self.red_queue) if self.red_queue else 0,
            max(self.yellow_queue) if self.yellow_queue else 0,
            max(self.green_queue) if self.green_queue else 0,
            len(self.red_queue),
            len(self.yellow_queue),
            len(self.green_queue),
            self.doctor_timers[0],
            self.doctor_timers[1],
            self.doctor_timers[2],
        ], dtype=np.float32)

    # -------------------------------------------
    # Sample service time
    # -------------------------------------------
    def _sample_service(self, action):
        if action == 0:
            return np.random.lognormal(self.red_mu, self.red_sigma)
        elif action == 1:
            return np.random.lognormal(self.yellow_mu, self.yellow_sigma)
        else:
            return np.random.lognormal(self.green_mu, self.green_sigma)

    # -------------------------------------------
    # Add new arrivals
    # -------------------------------------------
    def _add_new_arrivals(self):
        new_red = np.random.poisson(lam=1)
        new_yellow = np.random.poisson(lam=3)
        new_green = np.random.poisson(lam=3)

        self.red_queue.extend([0] * new_red)
        self.yellow_queue.extend([0] * new_yellow)
        self.green_queue.extend([0] * new_green)

        # Increase waiting time of all existing patients
        self.red_queue = [w + 1 for w in self.red_queue]
        self.yellow_queue = [w + 1 for w in self.yellow_queue]
        self.green_queue = [w + 1 for w in self.green_queue]

    # -------------------------------------------
    # Step function
    # -------------------------------------------
    def step(self, action):
        self.current_step += 1

        # Wait until at least one doctor is free
        free_doctors = np.where(self.doctor_timers == 0)[0]
        if len(free_doctors) == 0:
            min_timer = min([t for t in self.doctor_timers if t > 0])
            self.doctor_timers = np.maximum(0, self.doctor_timers - min_timer)
            free_doctors = np.where(self.doctor_timers == 0)[0]

        doctor = free_doctors[0]

        # Map action to queue
        queue_map = {0: ("red", self.red_queue),
                     1: ("yellow", self.yellow_queue),
                     2: ("green", self.green_queue)}
        cat_name, queue = queue_map[action]

        # If queue empty → 0 reward
        if len(queue) == 0:
            reward = 0
        else:
            wait_time = queue.pop(0)
            dt = self._sample_service(action)
            self.doctor_timers[doctor] = dt

            # Advance other doctors
            for i in range(self.num_doctors):
                if i != doctor:
                    self.doctor_timers[i] = max(0, self.doctor_timers[i] - dt)

            # Reward calculation
            reward_map = {"red": 25, "yellow": 15, "green": 5}
            reward = reward_map[cat_name] + 1

            threshold_times = {"red": 5, "yellow": 15, "green": 30}
            if wait_time <= threshold_times[cat_name]:
                reward += 5

            # --- Save last served wait time for this category ---
            self.last_served_wait_times[cat_name] = wait_time

        # GREEN patient reward for recent service (fairness)
        self.last_5_actions.append(action)
        if len(self.last_5_actions) > 5:
            self.last_5_actions.pop(0)
        if 2 in self.last_5_actions:
            reward += 2

        

        # Add new arrivals
        self._add_new_arrivals()

        # Truncated if max steps reached
        truncated = self.current_step >= self.max_steps

        return self._get_obs(), reward, False, truncated, {}


In [1]:
import warnings
warnings.filterwarnings("ignore", message="Gym has been unmaintained since")

import sys
sys.path.append("../env")  # to read hospital_env.py
from hospital_env import HospitalEnv

import gymnasium as gym
import numpy as np
import torch

from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

# ------------------------------
# Set seeds for reproducibility
# ------------------------------
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Create wrapped environment ---
def make_env():
    env = HospitalEnv()
    env.reset(seed=SEED)   # seed the environment here
    env = Monitor(env)      # important for SB3 logging
    return env

env = DummyVecEnv([make_env])

# --- Create DQN agent ---
model = DQN(
    "MlpPolicy",       # Fully connected NN
    env,
    learning_rate=5e-4,
    gamma=0.95,
    batch_size=64,
    buffer_size=50000,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.1,
    exploration_fraction=0.1,   # epsilon decay
    target_update_interval=1000,
    verbose=1,
    seed=SEED                  # seed SB3 agent
)

# --- Train agent ---
model.learn(total_timesteps=50000)

# --- Save trained model ---
model.save("../models/dqn_hospital_sb3")
print("Model saved to models/dqn_hospital_sb3.zip")

# --- Evaluation ---
eval_env = DummyVecEnv([make_env])
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=50)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Using cpu device
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 30       |
|    ep_rew_mean      | 1.49e+03 |
|    exploration_rate | 0.978    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1740     |
|    time_elapsed     | 0        |
|    total_timesteps  | 120      |
| train/              |          |
|    learning_rate    | 0.0005   |
|    loss             | 47.6     |
|    n_updates        | 4        |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 30       |
|    ep_rew_mean      | 1.52e+03 |
|    exploration_rate | 0.957    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 765      |
|    time_elapsed     | 0        |
|    total_timesteps  | 240      |
| train/              |          |
|    learning_rate    | 0.0005   |
|    loss             | 38.2     |
|  

In [4]:
import gymnasium as gym
import numpy as np
import torch

import sys
sys.path.append("../env")  # to read hospital_env.py
from hospital_env import HospitalEnv

from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

env = HospitalEnv
def check_convergence(mean_rewards, threshold=10):
    """
    Checks if the last two rewards differ by less than threshold.
    Returns True = converged.
    """
    if len(mean_rewards) < 2:
        return False
    return abs(mean_rewards[-1] - mean_rewards[-2]) < threshold

model = DQN.load("../models/dqn_hospital_sb3", env=eval_env)
mean_rewards_history = []
timesteps_history = []

for t in range(0, 200000, 10000):  # evaluate every 10k timesteps
    model = DQN.load(f"checkpoints/model_{t}.zip", env)
    
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10, deterministic=True)
    mean_rewards_history.append(mean_reward)
    timesteps_history.append(t)

    print(f"At {t} timesteps → {mean_reward}")

    if check_convergence(mean_rewards_history):
        print(f"\nModel converged at ~{t} timesteps.\n")
        break


NameError: name 'eval_env' is not defined

In [2]:
import sys
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

# Add environment path
sys.path.append("../env")  
from hospital_env import HospitalEnv

# -------------------------------
# Create wrapped evaluation environment
# -------------------------------
def make_env():
    env = HospitalEnv()
    env = Monitor(env)  # SB3 logging wrapper
    return env

eval_env = DummyVecEnv([make_env])

# -------------------------------
# Load trained DQN model
# -------------------------------
model = DQN.load("../models/dqn_hospital_sb3", env=eval_env)

# -------------------------------
# Evaluation parameters
# -------------------------------
n_episodes = 10  # number of episodes to evaluate
threshold_times = {"red": 5, "yellow": 15, "green": 30}

# Storage for metrics
rewards_per_episode = []
red_waits, yellow_waits, green_waits = [], [], []
queue_lengths = {"red": [], "yellow": [], "green": []}
green_fair_actions = []

# -------------------------------
# Evaluate model
# -------------------------------
for ep in range(n_episodes):
    obs = eval_env.reset()  # only 1 value returned
    done = False
    episode_reward = 0

    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)  # SB3 returns 4 values
        episode_reward += reward[0]

        # Access raw environment for metrics
        env = eval_env.envs[0].unwrapped

        # Record last served wait times
        red_waits.append(env.last_served_wait_times["red"])
        yellow_waits.append(env.last_served_wait_times["yellow"])
        green_waits.append(env.last_served_wait_times["green"])

        # Record queue lengths
        queue_lengths["red"].append(len(env.red_queue))
        queue_lengths["yellow"].append(len(env.yellow_queue))
        queue_lengths["green"].append(len(env.green_queue))

        # GREEN fairness: if GREEN patient served in last 5 actions
        green_fair_actions.append(1 if 2 in env.last_5_actions else 0)

    rewards_per_episode.append(episode_reward)

# -------------------------------
# Compute metrics
# -------------------------------
avg_reward = np.mean(rewards_per_episode)
avg_wait_red = np.mean(red_waits) if red_waits else 0
avg_wait_yellow = np.mean(yellow_waits) if yellow_waits else 0
avg_wait_green = np.mean(green_waits) if green_waits else 0

red_served = len(red_waits)
yellow_served = len(yellow_waits)
green_served = len(green_waits)

pct_red_within = 100 * sum(w <= threshold_times["red"] for w in red_waits) / red_served if red_served else 0
pct_yellow_within = 100 * sum(w <= threshold_times["yellow"] for w in yellow_waits) / yellow_served if yellow_served else 0
pct_green_within = 100 * sum(w <= threshold_times["green"] for w in green_waits) / green_served if green_served else 0

queue_stats = {cat: {"avg": np.mean(qs), "max": np.max(qs)} for cat, qs in queue_lengths.items()}

fairness = np.mean(green_fair_actions) if green_fair_actions else 0

# -------------------------------
# Print results with 2 decimals
# -------------------------------
print(f"Average reward per episode: {avg_reward:.2f}")
print(f"Average wait times (Red, Yellow, Green): {avg_wait_red:.2f}, {avg_wait_yellow:.2f}, {avg_wait_green:.2f}")
print(f"Percentage served within thresholds (Red, Yellow, Green): {pct_red_within:.2f}%, {pct_yellow_within:.2f}%, {pct_green_within:.2f}%")
print("Queue stats (average and max lengths):")
for cat, stats in queue_stats.items():
    print(f"  {cat.capitalize()}: avg={stats['avg']:.2f}, max={stats['max']:.2f}")
print(f"Green fairness metric: {fairness:.2f}")


Average reward per episode: 1450.20
Average wait times (Red, Yellow, Green): 2.70, 3.35, 0.69
Percentage served within thresholds (Red, Yellow, Green): 87.67%, 100.00%, 100.00%
Queue stats (average and max lengths):
  Red: avg=2.98, max=13.00
  Yellow: avg=40.36, max=100.00
  Green: avg=44.44, max=102.00
Green fairness metric: 0.19


In [9]:
import sys
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

# Add environment path
sys.path.append("../env")  
from hospital_env import HospitalEnv

# -------------------------------
# Create wrapped evaluation environment
# -------------------------------
def make_env():
    env = HospitalEnv()
    env = Monitor(env)  # SB3 logging wrapper
    return env

eval_env = DummyVecEnv([make_env])

# -------------------------------
# Load trained DQN model
# -------------------------------
model = DQN.load("../models/dqn_hospital_sb3", env=eval_env)

# -------------------------------
# Evaluation parameters
# -------------------------------
n_episodes = 10  # number of episodes to evaluate
threshold_times = {"red": 5, "yellow": 15, "green": 30}

# Storage for metrics
rewards_per_episode = []
red_waits, yellow_waits, green_waits = [], [], []
queue_lengths = {"red": [], "yellow": [], "green": []}
green_fair_actions = []

# -------------------------------
# Evaluate model
# -------------------------------
for ep in range(n_episodes):
    obs, _ = eval_env.reset()
    done = False
    episode_reward = 0

    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = eval_env.step(action)
        episode_reward += reward[0]

        # Access raw environment for metrics
        env = eval_env.envs[0].unwrapped

        # Record last served wait times only if a patient was actually served
        served = False
        queue_map = {0: env.red_queue, 1: env.yellow_queue, 2: env.green_queue}
        if len(queue_map[action]) < len(queue_map[action]) + 1:  # patient removed
            served = True

        if served:
            if action == 0:
                red_waits.append(env.last_served_wait_times["red"])
            elif action == 1:
                yellow_waits.append(env.last_served_wait_times["yellow"])
            elif action == 2:
                green_waits.append(env.last_served_wait_times["green"])

        # Record queue lengths every step
        queue_lengths["red"].append(len(env.red_queue))
        queue_lengths["yellow"].append(len(env.yellow_queue))
        queue_lengths["green"].append(len(env.green_queue))

        # Track GREEN fairness (if GREEN patient served recently)
        green_fair_actions.append(1 if 2 in env.last_5_actions else 0)

        done = terminated or truncated

    rewards_per_episode.append(episode_reward)

# -------------------------------
# Compute metrics
# -------------------------------
avg_reward = np.mean(rewards_per_episode)
avg_wait_red = np.mean(red_waits) if red_waits else 0
avg_wait_yellow = np.mean(yellow_waits) if yellow_waits else 0
avg_wait_green = np.mean(green_waits) if green_waits else 0

red_served = len(red_waits)
yellow_served = len(yellow_waits)
green_served = len(green_waits)

pct_red_within = 100 * sum(w <= threshold_times["red"] for w in red_waits) / red_served if red_served else 0
pct_yellow_within = 100 * sum(w <= threshold_times["yellow"] for w in yellow_waits) / yellow_served if yellow_served else 0
pct_green_within = 100 * sum(w <= threshold_times["green"] for w in green_waits) / green_served if green_served else 0

queue_stats = {cat: {"avg": np.mean(qs), "max": np.max(qs)} for cat, qs in queue_lengths.items()}
fairness = np.mean(green_fair_actions) if green_fair_actions else 0

# -------------------------------
# Print results with 2 decimals
# -------------------------------
print(f"Average reward per episode: {avg_reward:.2f}")
print(f"Average wait times (Red, Yellow, Green): {avg_wait_red:.2f}, {avg_wait_yellow:.2f}, {avg_wait_green:.2f}")
print(f"Percentage served within thresholds (Red, Yellow, Green): {pct_red_within:.2f}%, {pct_yellow_within:.2f}%, {pct_green_within:.2f}%")
print("Queue stats (average and max lengths):")
for cat, stats in queue_stats.items():
    print(f"  {cat.capitalize()}: avg={stats['avg']:.2f}, max={stats['max']:.2f}")
print(f"Green fairness metric: {fairness:.2f}")


ValueError: not enough values to unpack (expected 2, got 1)

In [10]:
import sys
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

# Add environment path
sys.path.append("../env")  
from hospital_env import HospitalEnv

# -------------------------------
# Create wrapped evaluation environment
# -------------------------------
def make_env():
    env = HospitalEnv()
    env = Monitor(env)  # SB3 logging wrapper
    return env

eval_env = DummyVecEnv([make_env])

# -------------------------------
# Load trained DQN model
# -------------------------------
model = DQN.load("../models/dqn_hospital_sb3", env=eval_env)

# -------------------------------
# Evaluation parameters
# -------------------------------
n_episodes = 10  # number of episodes to evaluate
threshold_times = {"red": 5, "yellow": 15, "green": 30}

# Storage for metrics
rewards_per_episode = []
red_waits, yellow_waits, green_waits = [], [], []
queue_lengths = {"red": [], "yellow": [], "green": []}
green_fair_actions = []

# -------------------------------
# Evaluate model
# -------------------------------
for ep in range(n_episodes):
    obs = eval_env.reset()  # only obs returned
    done = False
    episode_reward = 0

    while not done:
        # Predict action
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)  # 4 values for SB3
        episode_reward += reward[0]

        # Access raw environment for metrics
        env = eval_env.envs[0].unwrapped

        # Record last served wait times (only if patient served)
        if env.last_served_wait_times["red"] > 0:
            red_waits.append(env.last_served_wait_times["red"])
        if env.last_served_wait_times["yellow"] > 0:
            yellow_waits.append(env.last_served_wait_times["yellow"])
        if env.last_served_wait_times["green"] > 0:
            green_waits.append(env.last_served_wait_times["green"])

        # Record queue lengths
        queue_lengths["red"].append(len(env.red_queue))
        queue_lengths["yellow"].append(len(env.yellow_queue))
        queue_lengths["green"].append(len(env.green_queue))

        # Track GREEN fairness (if GREEN patient served recently)
        green_fair_actions.append(1 if 2 in env.last_5_actions else 0)

    rewards_per_episode.append(episode_reward)

# -------------------------------
# Compute metrics
# -------------------------------
avg_reward = np.mean(rewards_per_episode)
avg_wait_red = np.mean(red_waits) if red_waits else 0
avg_wait_yellow = np.mean(yellow_waits) if yellow_waits else 0
avg_wait_green = np.mean(green_waits) if green_waits else 0

red_served = len(red_waits)
yellow_served = len(yellow_waits)
green_served = len(green_waits)

pct_red_within = 100 * sum(w <= threshold_times["red"] for w i_*


SyntaxError: '(' was never closed (983217891.py, line 85)

In [3]:
import sys
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

# Add environment path
sys.path.append("../env")  
from hospital_env import HospitalEnv

# -------------------------------
# Create wrapped evaluation environment
# -------------------------------
def make_env():
    env = HospitalEnv()
    env = Monitor(env)  # SB3 logging wrapper
    return env

eval_env = DummyVecEnv([make_env])

# -------------------------------
# Load trained DQN model
# -------------------------------
model = DQN.load("../models/dqn_hospital_sb3", env=eval_env)

# -------------------------------
# Evaluation parameters
# -------------------------------
n_episodes = 10  # number of episodes to evaluate
threshold_times = {"red": 5, "yellow": 15, "green": 30}

# Storage for metrics
rewards_per_episode = []
red_waits, yellow_waits, green_waits = [], [], []
queue_lengths = {"red": [], "yellow": [], "green": []}
green_fair_actions = []

# -------------------------------
# Evaluate model
# -------------------------------
for ep in range(n_episodes):
    obs = eval_env.reset()  # only obs returned
    done = False
    episode_reward = 0

    while not done:
        # Predict action
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)  # 4 values for SB3
        episode_reward += reward[0]

        # Access raw environment for metrics
        env = eval_env.envs[0].unwrapped

        # Record last served wait times (only if patient served)
        if env.last_served_wait_times["red"] > 0:
            red_waits.append(env.last_served_wait_times["red"])
        if env.last_served_wait_times["yellow"] > 0:
            yellow_waits.append(env.last_served_wait_times["yellow"])
        if env.last_served_wait_times["green"] > 0:
            green_waits.append(env.last_served_wait_times["green"])

        # Record queue lengths
        queue_lengths["red"].append(len(env.red_queue))
        queue_lengths["yellow"].append(len(env.yellow_queue))
        queue_lengths["green"].append(len(env.green_queue))

        # Track GREEN fairness (if GREEN patient served recently)
        green_fair_actions.append(1 if 2 in env.last_5_actions else 0)

    rewards_per_episode.append(episode_reward)

# -------------------------------
# Compute metrics
# -------------------------------
avg_reward = np.mean(rewards_per_episode)
avg_wait_red = np.mean(red_waits) if red_waits else 0
avg_wait_yellow = np.mean(yellow_waits) if yellow_waits else 0
avg_wait_green = np.mean(green_waits) if green_waits else 0

red_served = len(red_waits)
yellow_served = len(yellow_waits)
green_served = len(green_waits)

pct_red_within = 100 * sum(w <= threshold_times["red"] for w in red_waits) / red_served if red_served else 0
pct_yellow_within = 100 * sum(w <= threshold_times["yellow"] for w in yellow_waits) / yellow_served if yellow_served else 0
pct_green_within = 100 * sum(w <= threshold_times["green"] for w in green_waits) / green_served if green_served else 0

queue_stats = {cat: {"avg": np.mean(qs), "max": np.max(qs)} for cat, qs in queue_lengths.items()}

fairness = np.mean(green_fair_actions) if green_fair_actions else 0

# -------------------------------
# Print results with 2 decimals
# -------------------------------
print(f"Average reward per episode: {avg_reward:.2f}")
print(f"Average wait times (Red, Yellow, Green): {avg_wait_red:.2f}, {avg_wait_yellow:.2f}, {avg_wait_green:.2f}")
print(f"Percentage served within thresholds (Red, Yellow, Green): {pct_red_within:.2f}%, {pct_yellow_within:.2f}%, {pct_green_within:.2f}%")
print("Queue stats (average and max lengths):")
for cat, stats in queue_stats.items():
    print(f"  {cat.capitalize()}: avg={stats['avg']:.2f}, max={stats['max']:.2f}")
print(f"Green fairness metric: {fairness:.2f}")


Average reward per episode: 1450.20
Average wait times (Red, Yellow, Green): 3.03, 4.33, 9.90
Percentage served within thresholds (Red, Yellow, Green): 86.19%, 100.00%, 100.00%
Queue stats (average and max lengths):
  Red: avg=2.98, max=13.00
  Yellow: avg=40.36, max=100.00
  Green: avg=44.44, max=102.00
Green fairness metric: 0.19


In [12]:
import sys
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

# Add environment path
sys.path.append("../env")  
from hospital_env import HospitalEnv

# -------------------------------
# Create wrapped evaluation environment
# -------------------------------
def make_env():
    env = HospitalEnv()
    env = Monitor(env)  # SB3 logging wrapper
    return env

eval_env = DummyVecEnv([make_env])

# -------------------------------
# Load trained DQN model
# -------------------------------
model = DQN.load("../models/dqn_hospital_sb3", env=eval_env)

# -------------------------------
# Evaluation parameters
# -------------------------------
n_episodes = 10  # number of episodes to evaluate
threshold_times = {"red": 10, "yellow": 30, "green": 60}
reward_map = {"red": 30, "yellow": 20, "green": 5}
reward_bonus = {"red": 25, "yellow": 15, "green": 5}

# Storage for metrics
rewards_per_episode = []
total_possible_rewards = []
red_waits, yellow_waits, green_waits = [], [], []
queue_lengths = {"red": [], "yellow": [], "green": []}
green_served_count = 0
total_served_count = 0

# -------------------------------
# Evaluate model
# -------------------------------
for ep in range(n_episodes):
    obs = eval_env.reset()  # only obs returned
    done = False
    episode_reward = 0
    episode_possible_reward = 0

    while not done:
        # Predict action
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        episode_reward += reward[0]

        # Access raw environment
        env = eval_env.envs[0].unwrapped

        # Record last served wait times and compute possible rewards
        if env.last_served_wait_times["red"] > 0:
            red_waits.append(env.last_served_wait_times["red"])
            episode_possible_reward += reward_map["red"] + reward_bonus["red"]
            total_served_count += 1
        if env.last_served_wait_times["yellow"] > 0:
            yellow_waits.append(env.last_served_wait_times["yellow"])
            episode_possible_reward += reward_map["yellow"] + reward_bonus["yellow"]
            total_served_count += 1
        if env.last_served_wait_times["green"] > 0:
            green_waits.append(env.last_served_wait_times["green"])
            episode_possible_reward += reward_map["green"] + reward_bonus["green"]
            green_served_count += 1
            total_served_count += 1

        # Record queue lengths
        queue_lengths["red"].append(len(env.red_queue))
        queue_lengths["yellow"].append(len(env.yellow_queue))
        queue_lengths["green"].append(len(env.green_queue))

    rewards_per_episode.append(episode_reward)
    total_possible_rewards.append(episode_possible_reward)

# -------------------------------
# Compute metrics
# -------------------------------
avg_reward = np.mean(rewards_per_episode)
avg_possible = np.mean(total_possible_rewards)
avg_wait_red = np.mean(red_waits) if red_waits else 0
avg_wait_yellow = np.mean(yellow_waits) if yellow_waits else 0
avg_wait_green = np.mean(green_waits) if green_waits else 0

pct_red_within = 100 * sum(w <= threshold_times["red"] for w in red_waits) / len(red_waits) if red_waits else 0
pct_yellow_within = 100 * sum(w <= threshold_times["yellow"] for w in yellow_waits) / len(yellow_waits) if yellow_waits else 0
pct_green_within = 100 * sum(w <= threshold_times["green"] for w in green_waits) / len(green_waits) if green_waits else 0

queue_stats = {cat: {"avg": np.mean(qs), "max": np.max(qs)} for cat, qs in queue_lengths.items()}

# Fairness: GREEN patients served / total served
fairness = green_served_count / total_served_count if total_served_count > 0 else 0

# -------------------------------
# Print results
# -------------------------------
print(f"Average reward per episode: {avg_reward:.2f}")
print(f"Average possible reward per episode: {avg_possible:.2f}")
print(f"Average wait times (Red, Yellow, Green): {avg_wait_red:.2f}, {avg_wait_yellow:.2f}, {avg_wait_green:.2f}")
print(f"Percentage served within thresholds (Red, Yellow, Green): {pct_red_within:.2f}%, {pct_yellow_within:.2f}%, {pct_green_within:.2f}%")
print("Queue stats (average and max lengths):")
for cat, stats in queue_stats.items():
    print(f"  {cat.capitalize()}: avg={stats['avg']:.2f}, max={stats['max']:.2f}")
print(f"Fairness (GREEN served / total served): {fairness:.2f}")



Average reward per episode: 1635.10
Average possible reward per episode: 2413.00
Average wait times (Red, Yellow, Green): 3.42, 4.80, 6.39
Percentage served within thresholds (Red, Yellow, Green): 100.00%, 100.00%, 100.00%
Queue stats (average and max lengths):
  Red: avg=3.48, max=12.00
  Yellow: avg=41.00, max=98.00
  Green: avg=42.79, max=103.00
Fairness (GREEN served / total served): 0.27
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




At 0 timesteps → 1635.1
At 10000 timesteps → 1556.8
At 20000 timesteps → 1615.2
At 30000 timesteps → 1584.4
At 40000 timesteps → 1677.0
At 50000 timesteps → 1636.7
At 60000 timesteps → 1627.2

Model converged at ~60000 timesteps.



In [1]:
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from hospital_env import HospitalEnv  # replace with your actual path

# 1️⃣ Create and wrap environment
env = HospitalEnv()
env = Monitor(env)

# 2️⃣ Convergence check function
def check_convergence(mean_rewards, threshold=10):
    """
    Checks if the last two rewards differ by less than threshold.
    Returns True = converged.
    """
    if len(mean_rewards) < 2:
        return False
    return abs(mean_rewards[-1] - mean_rewards[-2]) < threshold

# 3️⃣ Load your trained model (first one)
model = DQN.load("../models/dqn_hospital_sb3", env=env)

# 4️⃣ Evaluate over timesteps
mean_rewards_history = []
timesteps_history = []

for t in range(0, 200000, 10000):  # adjust based on saved checkpoints
    # If you have separate checkpoints, load each one like this:
    # model = DQN.load(f"../models/checkpoints/model_{t}", env=env)
    
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10, deterministic=True)
    mean_rewards_history.append(mean_reward)
    timesteps_history.append(t)

    print(f"At {t} timesteps → mean reward: {mean_reward:.2f}")

    if check_convergence(mean_rewards_history):
        print(f"\nModel converged at ~{t} timesteps.\n")
        break


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Wrapping the env in a DummyVecEnv.
At 0 timesteps → mean reward: 1635.10
At 10000 timesteps → mean reward: 1556.80
At 20000 timesteps → mean reward: 1615.20
At 30000 timesteps → mean reward: 1584.40
At 40000 timesteps → mean reward: 1677.00
At 50000 timesteps → mean reward: 1636.70
At 60000 timesteps → mean reward: 1627.20

Model converged at ~60000 timesteps.



In [13]:
import os
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy

# ----------------------------
# 1. IMPORT YOUR CUSTOM ENV
# ----------------------------
from hospital_env import HospitalEnv   # <-- make sure this matches your file name

# ----------------------------
# 2. CREATE ENVIRONMENT
# ----------------------------
env = HospitalEnv()
env = Monitor(env)   # fixes the warning

# ----------------------------
# 3. LOAD TRAINED MODEL
# ----------------------------
model_path = "../models/dqn_hospital_sb3.zip"   # your saved model

if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model not found at: {model_path}")

model = DQN.load(model_path, env=env)

print("Model loaded successfully!")

# ----------------------------
# 4. EVALUATE MODEL
# ----------------------------
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=20,
    deterministic=True
)

print(f"\nFinal Evaluation → Mean Reward: {mean_reward:.2f} | Std: {std_reward:.2f}\n")

# ----------------------------
# 5. OPTIONAL: RUN ONE EPISODE
# ----------------------------
obs, _ = env.reset()

done = False
episode_reward = 0

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    episode_reward += reward

print(f"One Demo Episode Reward: {episode_reward:.2f}")


Wrapping the env in a DummyVecEnv.
Model loaded successfully!

Final Evaluation → Mean Reward: 1595.95 | Std: 149.98



TypeError: unhashable type: 'numpy.ndarray'

In [14]:
# evaluation_hospital.py
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy

# 1️⃣ Import your custom environment
from hospital_env import HospitalEnv  # replace with your actual file/path

# 2️⃣ Create the environment and wrap with Monitor
env = HospitalEnv()
env = Monitor(env)

# 3️⃣ Load your trained model
model_path = "../models/dqn_hospital_sb3"  # path to your saved DQN
model = DQN.load(model_path, env=env)

# 4️⃣ Evaluate the model
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,    # number of evaluation episodes
    deterministic=True
)

print(f"Mean reward over 10 episodes: {mean_reward:.2f}")
print(f"Standard deviation of reward: {std_reward:.2f}")

# 5️⃣ Optional: Track convergence over multiple checkpoints
# Example: checkpoints saved during training
checkpoints = [0, 10000, 20000, 30000]  # replace with your actual saved checkpoints
mean_rewards_history = []

for t in checkpoints:
    checkpoint_path = f"../models/checkpoints/model_{t}"  # no .zip added if already saved
    checkpoint_model = DQN.load(checkpoint_path, env=env)
    mean_r, _ = evaluate_policy(checkpoint_model, env, n_eval_episodes=5, deterministic=True)
    mean_rewards_history.append(mean_r)
    print(f"At {t} timesteps → mean reward: {mean_r:.2f}")

# 6️⃣ Plot convergence curve
plt.plot(checkpoints, mean_rewards_history, marker='o')
plt.xlabel("Timesteps")
plt.ylabel("Mean Reward")
plt.title("Convergence Curve")
plt.grid(True)
plt.show()


Wrapping the env in a DummyVecEnv.
Mean reward over 10 episodes: 1635.10
Standard deviation of reward: 138.76


FileNotFoundError: [Errno 2] No such file or directory: '..\\models\\checkpoints\\model_0.zip'

In [15]:
# auto_evaluate_hospital.py
import os
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy

# 1️⃣ Import your custom environment
from hospital_env import HospitalEnv  # replace with your actual file/path

# 2️⃣ Create environment and wrap with Monitor
env = HospitalEnv()
env = Monitor(env)

# 3️⃣ Folder containing checkpoints
checkpoints_folder = "../models/checkpoints"

# 4️⃣ List all checkpoint files
all_files = sorted(os.listdir(checkpoints_folder))
checkpoint_files = [f for f in all_files if f.endswith(".zip")]

if not checkpoint_files:
    print("No checkpoint files found in:", checkpoints_folder)
    exit()

print("Found checkpoints:", checkpoint_files)

# 5️⃣ Evaluate each checkpoint
mean_rewards_history = []
for file in checkpoint_files:
    checkpoint_path = os.path.join(checkpoints_folder, file)
    print(f"\nEvaluating {file} ...")
    
    model = DQN.load(checkpoint_path, env=env)
    
    mean_r, std_r = evaluate_policy(
        model,
        env,
        n_eval_episodes=5,   # adjust as needed
        deterministic=True
    )
    mean_rewards_history.append(mean_r)
    print(f"Mean reward: {mean_r:.2f}, Std: {std_r:.2f}")

# 6️⃣ Plot convergence curve
plt.figure(figsize=(8,5))
plt.plot(range(len(checkpoint_files)), mean_rewards_history, marker='o')
plt.xticks(range(len(checkpoint_files)), checkpoint_files, rotation=45)
plt.xlabel("Checkpoints")
plt.ylabel("Mean Reward")
plt.title("Convergence Curve of DQN on HospitalEnv")
plt.grid(True)
plt.tight_layout()
plt.show()


FileNotFoundError: [WinError 3] The system cannot find the path specified: '../models/checkpoints'

In [1]:
# auto_evaluate_hospital_windows.py
import os
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy

# 1️⃣ Import your custom environment
from hospital_env import HospitalEnv  # replace with your actual file/path

# 2️⃣ Create the environment and wrap with Monitor
env = HospitalEnv()
env = Monitor(env)

# 3️⃣ Absolute path to the checkpoints folder (update this to your folder)
checkpoints_folder = r"C:\rl_env\models\checkpoints"  # <-- change to your actual path

# 4️⃣ Verify the folder exists
if not os.path.exists(checkpoints_folder):
    print("Checkpoints folder not found:", checkpoints_folder)
    exit()

# 5️⃣ List all checkpoint files
all_files = sorted(os.listdir(checkpoints_folder))
checkpoint_files = [f for f in all_files if f.endswith(".zip")]

if not checkpoint_files:
    print("No checkpoint files found in:", checkpoints_folder)
    exit()

print("Found checkpoints:", checkpoint_files)

# 6️⃣ Evaluate each checkpoint
mean_rewards_history = []

for file in checkpoint_files:
    checkpoint_path = os.path.join(checkpoints_folder, file)
    print(f"\nEvaluating {file} ...")
    
    model = DQN.load(checkpoint_path, env=env)
    
    mean_r, std_r = evaluate_policy(
        model,
        env,
        n_eval_episodes=5,   # number of evaluation episodes
        deterministic=True
    )
    mean_rewards_history.append(mean_r)
    print(f"Mean reward: {mean_r:.2f}, Std: {std_r:.2f}")

# 7️⃣ Plot convergence curve
plt.figure(figsize=(8,5))
plt.plot(range(len(checkpoint_files)), mean_rewards_history, marker='o')
plt.xticks(range(len(checkpoint_files)), checkpoint_files, rotation=45)
plt.xlabel("Checkpoints")
plt.ylabel("Mean Reward")
plt.title("Convergence Curve of DQN on HospitalEnv")
plt.grid(True)
plt.tight_layout()
plt.show()


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Checkpoints folder not found: C:\rl_env\models\checkpoints


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\rl_env\\models\\checkpoints'