### Big Intersection Reinforcement Learning (DQN algorithmn) in Stauntan

In [2]:
# import libraries

import os
import sys
import random
import gymnasium as gym

if "SUMO_HOME" in os.environ:
    tools = os.path.join(os.environ["SUMO_HOME"], "tools")
    sys.path.append(tools)
else:
    sys.exit("Please declare the environment variable 'SUMO_HOME'")
import numpy as np
import traci
from stable_baselines3.dqn.dqn import DQN

from sumo_rl import SumoEnvironment


In [None]:
# set up SUMO environment

env = SumoEnvironment(
    net_file="4x4.net.xml",
    single_agent=False,
    route_file="4x4c1c2c1c2.rou.xml",
    out_csv_name="outputs/big-intersection/dqn",
    use_gui=True,
    num_seconds=1000,
    yellow_time=4,
    min_green=5,
    max_green=60,
)

agents = list(env.traffic_signals.keys())  # Get the list of traffic signal IDs

models = {agent: DQN(
    policy="MlpPolicy",
    env=env,
    learning_rate=0.001,
    learning_starts=0,
    buffer_size=50000,
    train_freq=1,
    target_update_interval=500,
    exploration_fraction=0.05,
    exploration_final_eps=0.01,
    verbose=0
) for agent in agents}

def pad_observation(obs, target_size):
    return np.pad(obs, (0, target_size - len(obs)), 'constant')

# Example target size (choose the max length observed)
#target_size = max(len(observation) for observation in obs.values())
target_size = 11

TIMESTEPS = 1000
for timestep in range(TIMESTEPS):
    obs = env.reset()  # Now returns a dict {agent_id: obs}

    done = {agent: False for agent in agents}

    while not all(done.values()):  # Continue until all agents finish
        print(1)

        obs = {agent: pad_observation(observation, target_size) for agent, observation in obs.items()}
        actions = {agent: models[agent].predict(obs[agent], deterministic=False)[0] for agent in agents}
        print(actions)
        print(env.traffic_signals[agents[0]].yellow_dict)
        
        next_obs, rewards, done, info = env.step(actions)
        print(2)
        for agent in agents:

            action = actions[agent]
            action = np.array(action, dtype=np.float32)

        if action.ndim == 0:  # Scalar case
            action = action.reshape(1, 1)
        elif action.ndim == 1:  # 1D array case
            action = action.reshape(1, -1)  # Ensure it's 2D

            # No need to reshape if it's already a scalar (float)
            
            models[agent].replay_buffer.add(obs[agent], action, rewards[agent], next_obs[agent], done[agent], infos=info)

        obs = next_obs

1
{'0': array(0, dtype=int64), '1': array(0, dtype=int64), '10': array(1, dtype=int64), '11': array(0, dtype=int64), '12': array(1, dtype=int64), '13': array(0, dtype=int64), '14': array(0, dtype=int64), '15': array(0, dtype=int64), '2': array(1, dtype=int64), '3': array(0, dtype=int64), '4': array(1, dtype=int64), '5': array(1, dtype=int64), '6': array(1, dtype=int64), '7': array(0, dtype=int64), '8': array(1, dtype=int64), '9': array(0, dtype=int64)}
{(0, 1): 2, (1, 0): 3}
2
1
{'0': array(0, dtype=int64), '1': array(0, dtype=int64), '10': array(1, dtype=int64), '11': array(0, dtype=int64), '12': array(1, dtype=int64), '13': array(0, dtype=int64), '14': array(0, dtype=int64), '15': array(0, dtype=int64), '2': array(1, dtype=int64), '3': array(0, dtype=int64), '4': array(1, dtype=int64), '5': array(1, dtype=int64), '6': array(1, dtype=int64), '7': array(0, dtype=int64), '8': array(1, dtype=int64), '9': array(0, dtype=int64)}
{(0, 1): 2, (1, 0): 3}
2
1
{'0': array(0, dtype=int64), '1': 

In [10]:
print(f"Action shape for agent {agent}: {actions[agent].shape}")


Action shape for agent 221237140: ()


In [11]:
obs

{'221237140': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '221237260': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '221237272': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '221239539': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '221241076': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '221251086': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '221261813': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 'cluster_221237147_7811951337_7811951340': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)}

In [3]:
print(env.observation_space)

Box(0.0, 1.0, (11,), float32)


In [4]:
obs

{'221237140': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '221237260': array([1., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '221237272': array([1., 0., 0., 0., 0., 0.], dtype=float32),
 '221239539': array([1., 0., 0., 0., 0., 0.], dtype=float32),
 '221241076': array([1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '221251086': array([1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '221261813': array([1., 0., 0., 0., 0., 0.], dtype=float32),
 'cluster_221237147_7811951337_7811951340': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)}

In [None]:
# DQN model

model = DQN(
    env=env,
    policy="MlpPolicy", # specifying multi-layer perceptron policy
    learning_rate=1e-3, # smaller learning rate is more stable
    learning_starts=0, # starts training immediately 
    buffer_size=50000, # stores agent's experiences (larger buffer is more diverse, but uses more memory)
    train_freq=1, # how often the agent is trained, in steps
    target_update_interval=500, # frequency (in timestamps) in which the network is updated
    exploration_fraction=0.05, # 
    exploration_final_eps=0.01, # probability of the action taken at the end of exploration
    verbose=1, # whether or not algorithmn output is printed
)
model.learn(total_timesteps=1000) # total number of timesteps for which the model will interact with the environment and learn

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


<stable_baselines3.dqn.dqn.DQN at 0x2bf18592250>