# PPO AGENT

In [7]:
# === 1) Imports ===
import yaml
import numpy as np
import torch
from env import create_env
from algorithms.ppo import PPOAgent

# If you want progress bars:
from tqdm import tqdm


  from pkg_resources import resource_stream, resource_exists


In [14]:
# === 2) Load configs ===
ENV_CONFIG   = "./configs/env.yaml"
MODEL_CONFIG   = "./configs/ppo.yaml"

with open(ENV_CONFIG, "r") as f_env:
    env_cfg = yaml.safe_load(f_env)

with open(MODEL_CONFIG, "r") as f_ppo:
    ppo_cfg = yaml.safe_load(f_ppo)

print("Loaded environment config:", env_cfg)
print("Loaded PPO config:", ppo_cfg)


Loaded environment config: {'env': 'intersection-v0', 'config': {'duration': 50, 'simulation_frequency': 15, 'policy_frequency': 2, 'destination': 'o1', 'initial_vehicle_count': 10, 'spawn_probability': 0.6, 'observation': {'type': 'Kinematics', 'vehicles_count': 10, 'features': ['presence', 'x', 'y', 'vx', 'vy', 'cos_h', 'sin_h'], 'features_range': {'x': [-100, 100], 'y': [-100, 100], 'vx': [-20, 20], 'vy': [-20, 20]}, 'absolute': False, 'sorted': True}, 'vehicle': {'acceleration': 3.0, 'steering': 0.4}, 'action': {'type': 'ContinuousAction'}, 'high_speed_reward': 0.0, 'collision_reward': -100.0, 'arrived_reward': 50.0, 'reward_speed_range': [0.0, 3.0], 'normalize_reward': False, 'collision_terminal': True, 'offroad_terminal': True}, 'wrapper_config': {'steer_factor': 2.0, 'speed_factor': 5.0, 'onroad_reward': 5.0, 'progress_reward': 5.0, 'wrongexit_penalty': 20.0, 'offroad_penalty': 20.0, 'collision_penalty': 100.0, 'collision_terminal': True, 'offroad_terminal': True}}
Loaded PPO co

In [15]:
# === 3) Create the Gym/Highway environment ===

env = create_env(config_filepath=ENV_CONFIG, render_mode=None)
# Some envs (like highway-env) ignore render_mode=None and only do rendering if you call env.render().

# (Optional) set seed
SEED = 42
env.reset(seed=SEED)


(array([[ 1.0000000e+00,  2.0000000e-02,  4.6569738e-01,  0.0000000e+00,
         -5.0000000e-01,  6.1232343e-17, -1.0000000e+00],
        [ 1.0000000e+00, -2.3217291e-01, -4.4569737e-01,  3.7858361e-01,
          5.0000000e-01,  1.0000000e+00, -2.0576134e-16],
        [ 1.0000000e+00, -4.8081377e-01, -4.4569737e-01,  4.0000001e-01,
          5.0000000e-01,  1.0000000e+00, -5.7731595e-17],
        [ 1.0000000e+00,  7.8728282e-01, -4.8569736e-01, -3.1105015e-01,
          5.0000000e-01, -1.0000000e+00,  1.2246469e-16],
        [ 1.0000000e+00,  5.0120562e-01, -4.8569736e-01, -4.3888959e-01,
          5.0000000e-01, -1.0000000e+00,  1.2246469e-16],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
          

In [16]:
# === 4) Figure out state_size and action_size exactly as you did for REINFORCE ===
# For continuous‐action highway-env, observation_space.shape might be something like (5,) or (84,84,3) if you chose image.
obs_shape  = env.observation_space.shape
state_size = int(np.prod(obs_shape))         # flatten everything
action_size= int(env.action_space.shape[0])  # # of continuous dims: e.g. [accel, steering]

print(f"State size: {state_size}, Action size: {action_size}")


State size: 70, Action size: 2


In [25]:
import numpy as np

print("Obs‐space:", env.observation_space)        # e.g. Box(low=…, high=…, shape=(6,), dtype=float32)
# Grab a single observation
sample_state = env.observation_space.sample()      # this is a NumPy array of shape (6,)
print("One sample state:", sample_state)
print("sample_state.shape →", np.array(sample_state).shape)



# obs_shape  = env.observation_space.shape      # → (10, 7)
# state_size = int(np.prod(obs_shape))          # → 10*7 = 70
# print("Using state_size =", state_size)       # should print “Using state_size = 70”



Obs‐space: Box(-inf, inf, (10, 7), float32)
One sample state: [[-0.8285017   1.5437019  -0.6186735   1.4928973   0.77766263 -0.03846077
  -0.75634974]
 [-0.76338345  0.8503617  -0.4950546   0.6391455  -1.7446786  -0.3492151
  -0.88039607]
 [-0.6594557   0.08536605 -0.6350263  -0.74557966 -0.8318835  -0.0448039
  -0.2021981 ]
 [-0.92255825 -0.75861406 -2.1740499   0.8802994  -0.67659503  0.46809664
  -0.74759203]
 [-0.38900954 -0.3746088  -0.24088493 -1.4239712  -0.01143532  0.7074346
   0.26056388]
 [-0.04506813  0.9396974  -0.7119521   0.5658825  -1.5593926  -1.871649
   0.8139259 ]
 [-1.2227253  -0.5659393   0.04407877  0.8774446  -0.1609159  -0.84062713
   1.0519067 ]
 [-1.3404357   1.0662632   0.273187    0.6322325   0.53295463 -1.4123174
   0.3152672 ]
 [-1.9568832  -0.7246894   0.19417413 -0.28590465  0.4958102  -0.09689894
  -0.7736633 ]
 [-0.34719574  1.6895012  -1.8053689   1.0755097  -0.46364075 -0.4984408
  -0.34496853]]
sample_state.shape → (10, 7)


In [26]:
# === 5) Instantiate PPOAgent ===
#    Match the names in your ppo.py __init__ signature.
agent = PPOAgent(
    state_size      = state_size,          
    hidden_size     = ppo_cfg["hidden_size"],
    action_size     = action_size,
    learning_rate   = ppo_cfg["learning_rate"],
    gamma           = ppo_cfg["gamma"],
    clip_epsilon    = ppo_cfg["clip_epsilon"],
    k_epochs        = ppo_cfg["k_epochs"],
    gae_lambda      = ppo_cfg.get("gae_lambda", 0.95),
    entropy_coef    = ppo_cfg["entropy_coef"],
    value_loss_coef = ppo_cfg["value_loss_coef"],
    max_grad_norm   = ppo_cfg.get("max_grad_norm", 0.5),
    update_timestep = ppo_cfg["update_timestep"],
    model_path      = ppo_cfg["model_path"]
)


# (Optional) If you have a pretrained PPO model you want to load:
# agent.load_model(ppo_cfg["model_path"])


In [27]:
# === 6) Call train(...) ===
#    Notice that PPOAgent.train takes an extra argument update_timestep.
num_episodes    = ppo_cfg["num_episodes"]
update_timestep = ppo_cfg["update_timestep"]
print_freq      = ppo_cfg["print_freq"]
save_freq       = ppo_cfg["save_freq"]

agent.train(
    env,
    num_episodes    = num_episodes,
    update_timestep = update_timestep,
    print_freq      = print_freq,
    save_freq       = save_freq
)


Training PPO Agent:   0%|          | 0/2000 [00:00<?, ?it/s]


ValueError: expected sequence of length 10 at dim 1 (got 6)

In [None]:
# === 7) (Optional) After training, close the env or save final model ===
agent.save_model(ppo_cfg["model_path"])
env.close()
