In [7]:
import torch
print(torch.cuda.is_available())

True


# REINFORCE (Policy Gradient)

In [1]:
### One-cell script to run evaluation
import yaml
import numpy as np
from env import create_env
from algorithms.reinforce import REINFORCEAgent

SEED = 42
ENV_CONFIG = './configs/env.yaml'
MODEL_CONFIG = './configs/reinforce.yaml'
MODEL_PATH = './models/models_o1/reinforce_best.pth'
# MODEL_PATH = './models_o1/reinforce_ep1000.pth'
# MODEL_PATH = './models_o1/reinforce_ep2000.pth'
# MODEL_PATH = './models_o1/reinforce_ep3000.pth'
# MODEL_PATH = './models_o2/reinforce_best.pth'
# MODEL_PATH = './models_o2/reinforce_ep1000.pth'

eval_env = create_env(
    config_filepath=ENV_CONFIG,
    render_mode='rgb_array',
)
eval_env.reset(seed=SEED)

# Set rendering parameters
# eval_env.config.update({
#     'offscreen_rendering': False,
#     'real_time_rendering': True,
#     'render_agent': True,
#     'show_trajectories': False
# })


# Display env configs
print("Environment configuration:")
for key in eval_env.config.keys():
    print(f'{key}: {eval_env.config[key]}')

with open(MODEL_CONFIG, 'r') as file:
    config = yaml.safe_load(file)
    print("Model configuration:")
    print(config)

state_size = np.prod(eval_env.observation_space.shape)
action_size = eval_env.action_space.shape[0]
print(f"State size: {state_size}, Action size: {action_size}")
agent = REINFORCEAgent(
    state_size=state_size,
    hidden_size=config['hidden_size'],
    action_size=action_size,
    learning_rate=config['learning_rate'],
    gamma=config['gamma'],
    model_path=MODEL_PATH,
)

agent.load_model(
    model_path=MODEL_PATH,
)

agent.evaluate(
    env=eval_env,
    num_episodes=config['num_episodes_eval'],
    top_k=config['top_k'],
)

  from pkg_resources import resource_stream, resource_exists


Environment configuration:
observation: {'type': 'Kinematics', 'vehicles_count': 10, 'features': ['presence', 'x', 'y', 'vx', 'vy', 'cos_h', 'sin_h'], 'features_range': {'x': [-100, 100], 'y': [-100, 100], 'vx': [-20, 20], 'vy': [-20, 20]}, 'absolute': False, 'sorted': True, 'normalize': True, 'include_road_info': True, 'include_vehicle_info': True, 'include_goal_info': True, 'history_length': 5}
action: {'type': 'ContinuousAction', 'continuous': True, 'normalize': True, 'clip_actions': True}
simulation_frequency: 10
policy_frequency: 10
other_vehicles_type: highway_env.vehicle.behavior.IDMVehicle
screen_width: 600
screen_height: 600
centering_position: [0.5, 0.6]
scaling: 7.15
show_trajectories: False
render_agent: True
offscreen_rendering: False
manual_control: False
real_time_rendering: False
duration: 100
destination: o1
controlled_vehicles: 1
initial_vehicle_count: 10
spawn_probability: 0.6
collision_reward: -20.0
high_speed_reward: 0.1
arrived_reward: 50.0
reward_speed_range: [0.

RuntimeError: Error(s) in loading state_dict for Sequential:
	size mismatch for 0.weight: copying a param with shape torch.Size([64, 70]) from checkpoint, the shape in current model is torch.Size([128, 70]).
	size mismatch for 0.bias: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([128]).
	size mismatch for 3.weight: copying a param with shape torch.Size([64, 64]) from checkpoint, the shape in current model is torch.Size([128, 128]).
	size mismatch for 3.bias: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([128]).
	size mismatch for 6.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([4, 128]).

In [2]:
import yaml
import numpy as np
from env import create_env
from algorithms.reinforce import REINFORCEAgent
%reload_ext autoreload
%autoreload 2

SEED = 42
ENV_CONFIG = './configs/env.yaml'
MODEL_CONFIG = './configs/reinforce.yaml'
MODEL_PATH = './models/models_o1/reinforce.pth'

In [3]:
env = create_env(
    config_filepath=ENV_CONFIG,
    render_mode=None,
)
env.reset(seed=SEED)

# Display env configs
for key in env.config.keys():
    print(f'{key}: {env.config[key]}')

observation: {'type': 'Kinematics', 'vehicles_count': 10, 'features': ['presence', 'x', 'y', 'vx', 'vy', 'cos_h', 'sin_h'], 'features_range': {'x': [-100, 100], 'y': [-100, 100], 'vx': [-20, 20], 'vy': [-20, 20]}, 'absolute': False, 'sorted': True, 'normalize': True, 'include_road_info': True, 'include_vehicle_info': True, 'include_goal_info': True, 'history_length': 5}
action: {'type': 'ContinuousAction', 'continuous': True, 'normalize': True, 'clip_actions': True}
simulation_frequency: 10
policy_frequency: 10
other_vehicles_type: highway_env.vehicle.behavior.IDMVehicle
screen_width: 600
screen_height: 600
centering_position: [0.5, 0.6]
scaling: 7.15
show_trajectories: False
render_agent: True
offscreen_rendering: False
manual_control: False
real_time_rendering: False
duration: 100
destination: o1
controlled_vehicles: 1
initial_vehicle_count: 10
spawn_probability: 0.6
collision_reward: -20.0
high_speed_reward: 0.1
arrived_reward: 50.0
reward_speed_range: [0.0, 3.0]
normalize_reward: T

## Load Model Configs

In [4]:
with open(MODEL_CONFIG, 'r') as file:
    config = yaml.safe_load(file)

for key in config.keys():
    print(f'{key}: {config[key]}')

hidden_size: 128
learning_rate: 0.001
gamma: 0.95
num_episodes_train: 3000
print_freq: 100
save_freq: 1000
num_episodes_eval: 100
top_k: 5


## Create Agent

In [5]:
state_size = np.prod(env.observation_space.shape)
action_size = env.action_space.shape[0]
print(f"State size: {state_size}, Action size: {action_size}")
agent = REINFORCEAgent(
    state_size=state_size,
    hidden_size=config['hidden_size'],
    action_size=action_size,
    learning_rate=config['learning_rate'],
    gamma=config['gamma'],
    model_path=MODEL_PATH,
)

State size: 70, Action size: 2
Using device: cuda


## Train Agent

In [6]:
agent.train(
    env=env,
    num_episodes=config['num_episodes_train'],
    print_freq=config['print_freq'],
    save_freq=config['save_freq'],
)

Training REINFORCE Agent:   0%|          | 1/3000 [00:00<30:17,  1.65it/s]

Max reward: 1.00 at episode 1


Training REINFORCE Agent:   1%|          | 34/3000 [00:19<1:08:06,  1.38s/it]

Max reward: 2.00 at episode 34


Training REINFORCE Agent:   2%|▏         | 45/3000 [00:26<32:57,  1.49it/s]  

Max reward: 9.00 at episode 45


Training REINFORCE Agent:   3%|▎         | 100/3000 [00:52<24:19,  1.99it/s]

New best recent average: 1.09 (consistency ratio: 0.12)
Model saved to ./models/models_o1/reinforce_best.pth
Episode 100/3000 | Max reward: 9.00 | Avg reward: 1.09 | Recent avg: 1.09 | Consistency ratio: 0.12 | Entropy coef: 0.050


Training REINFORCE Agent:   7%|▋         | 200/3000 [01:43<17:44,  2.63it/s]

Episode 200/3000 | Max reward: 9.00 | Avg reward: 1.08 | Recent avg: 1.07 | Consistency ratio: 0.13 | Entropy coef: 0.050


Training REINFORCE Agent:   8%|▊         | 241/3000 [02:03<46:20,  1.01s/it]

New best recent average: 1.14 (consistency ratio: 0.14)
Model saved to ./models/models_o1/reinforce_best.pth


Training REINFORCE Agent:  10%|█         | 300/3000 [02:33<24:33,  1.83it/s]

Episode 300/3000 | Max reward: 9.00 | Avg reward: 1.08 | Recent avg: 1.07 | Consistency ratio: 0.13 | Entropy coef: 0.050


Training REINFORCE Agent:  12%|█▏        | 370/3000 [03:18<1:21:00,  1.85s/it]

New best recent average: 1.74 (consistency ratio: 0.03)
Model saved to ./models/models_o1/reinforce_best.pth
Max reward: 68.00 at episode 370


Training REINFORCE Agent:  13%|█▎        | 379/3000 [03:25<1:11:45,  1.64s/it]

New best recent average: 2.40 (consistency ratio: 0.04)
Model saved to ./models/models_o1/reinforce_best.pth


Training REINFORCE Agent:  13%|█▎        | 400/3000 [03:34<17:42,  2.45it/s]  

Episode 400/3000 | Max reward: 68.00 | Avg reward: 1.41 | Recent avg: 2.40 | Consistency ratio: 0.04 | Entropy coef: 0.050


Training REINFORCE Agent:  17%|█▋        | 500/3000 [04:30<18:53,  2.20it/s]  

Episode 500/3000 | Max reward: 68.00 | Avg reward: 1.39 | Recent avg: 1.30 | Consistency ratio: 0.04 | Entropy coef: 0.050


Training REINFORCE Agent:  20%|██        | 601/3000 [05:26<19:07,  2.09it/s]  

Episode 600/3000 | Max reward: 68.00 | Avg reward: 1.32 | Recent avg: 1.00 | Consistency ratio: 1.00 | Entropy coef: 0.050


Training REINFORCE Agent:  23%|██▎       | 700/3000 [06:12<09:02,  4.24it/s]

Episode 700/3000 | Max reward: 68.00 | Avg reward: 1.37 | Recent avg: 1.65 | Consistency ratio: 0.02 | Entropy coef: 0.050


Training REINFORCE Agent:  27%|██▋       | 801/3000 [06:52<12:42,  2.88it/s]

Episode 800/3000 | Max reward: 68.00 | Avg reward: 1.33 | Recent avg: 1.05 | Consistency ratio: 0.18 | Entropy coef: 0.050


Training REINFORCE Agent:  27%|██▋       | 823/3000 [07:07<10:46,  3.37it/s]  

Max reward: 87.00 at episode 824


Training REINFORCE Agent:  29%|██▉       | 880/3000 [07:42<12:57,  2.72it/s]  

Reducing exploration due to stagnation (entropy coef: 0.025)


Training REINFORCE Agent:  30%|███       | 901/3000 [07:52<15:08,  2.31it/s]

Episode 900/3000 | Max reward: 87.00 | Avg reward: 1.44 | Recent avg: 2.35 | Consistency ratio: 0.03 | Entropy coef: 0.025


Training REINFORCE Agent:  33%|███▎      | 1001/3000 [08:32<07:14,  4.60it/s]

Episode 1000/3000 | Max reward: 87.00 | Avg reward: 1.41 | Recent avg: 1.15 | Consistency ratio: 0.10 | Entropy coef: 0.025
Model saved to ./models/models_o1/reinforce_ep1000.pth


Training REINFORCE Agent:  37%|███▋      | 1100/3000 [09:22<08:50,  3.58it/s]

Episode 1100/3000 | Max reward: 87.00 | Avg reward: 1.49 | Recent avg: 2.24 | Consistency ratio: 0.04 | Entropy coef: 0.025


Training REINFORCE Agent:  37%|███▋      | 1102/3000 [09:24<24:31,  1.29it/s]

New best recent average: 2.46 (consistency ratio: 0.04)
Model saved to ./models/models_o1/reinforce_best.pth


Training REINFORCE Agent:  38%|███▊      | 1129/3000 [09:39<38:27,  1.23s/it]

New best recent average: 2.93 (consistency ratio: 0.05)
Model saved to ./models/models_o1/reinforce_best.pth


Training REINFORCE Agent:  39%|███▉      | 1169/3000 [10:00<42:39,  1.40s/it]

New best recent average: 3.36 (consistency ratio: 0.05)
Model saved to ./models/models_o1/reinforce_best.pth


Training REINFORCE Agent:  40%|████      | 1200/3000 [10:12<11:50,  2.53it/s]

Episode 1200/3000 | Max reward: 87.00 | Avg reward: 1.56 | Recent avg: 2.36 | Consistency ratio: 0.04 | Entropy coef: 0.025


Training REINFORCE Agent:  43%|████▎     | 1300/3000 [11:01<14:00,  2.02it/s]

Episode 1300/3000 | Max reward: 87.00 | Avg reward: 1.62 | Recent avg: 2.33 | Consistency ratio: 0.06 | Entropy coef: 0.025


Training REINFORCE Agent:  47%|████▋     | 1400/3000 [11:44<08:06,  3.29it/s]

Episode 1400/3000 | Max reward: 87.00 | Avg reward: 1.59 | Recent avg: 1.25 | Consistency ratio: 0.05 | Entropy coef: 0.025


Training REINFORCE Agent:  50%|█████     | 1500/3000 [12:34<39:21,  1.57s/it]

Episode 1500/3000 | Max reward: 87.00 | Avg reward: 1.61 | Recent avg: 1.81 | Consistency ratio: 0.03 | Entropy coef: 0.025


Training REINFORCE Agent:  53%|█████▎    | 1600/3000 [13:25<07:20,  3.18it/s]

Episode 1600/3000 | Max reward: 87.00 | Avg reward: 1.63 | Recent avg: 1.99 | Consistency ratio: 0.03 | Entropy coef: 0.025


Training REINFORCE Agent:  56%|█████▌    | 1670/3000 [13:58<05:30,  4.03it/s]

Reducing exploration due to stagnation (entropy coef: 0.013)


Training REINFORCE Agent:  57%|█████▋    | 1700/3000 [14:12<07:44,  2.80it/s]

Episode 1700/3000 | Max reward: 87.00 | Avg reward: 1.61 | Recent avg: 1.33 | Consistency ratio: 0.04 | Entropy coef: 0.013


Training REINFORCE Agent:  57%|█████▋    | 1711/3000 [14:21<42:30,  1.98s/it]

Max reward: 94.00 at episode 1711


Training REINFORCE Agent:  59%|█████▉    | 1769/3000 [15:01<45:58,  2.24s/it]

New best recent average: 3.41 (consistency ratio: 0.04)
Model saved to ./models/models_o1/reinforce_best.pth


Training REINFORCE Agent:  60%|█████▉    | 1785/3000 [15:10<25:40,  1.27s/it]

New best recent average: 3.57 (consistency ratio: 0.04)
Model saved to ./models/models_o1/reinforce_best.pth


Training REINFORCE Agent:  60%|█████▉    | 1789/3000 [15:17<37:08,  1.84s/it]

New best recent average: 4.09 (consistency ratio: 0.04)
Model saved to ./models/models_o1/reinforce_best.pth


Training REINFORCE Agent:  60%|██████    | 1800/3000 [15:25<13:33,  1.47it/s]

Episode 1800/3000 | Max reward: 94.00 | Avg reward: 1.75 | Recent avg: 4.09 | Consistency ratio: 0.04 | Entropy coef: 0.013


Training REINFORCE Agent:  63%|██████▎   | 1900/3000 [16:12<07:46,  2.36it/s]

Episode 1900/3000 | Max reward: 94.00 | Avg reward: 1.74 | Recent avg: 1.49 | Consistency ratio: 0.06 | Entropy coef: 0.013


Training REINFORCE Agent:  67%|██████▋   | 2001/3000 [16:54<04:47,  3.47it/s]

Episode 2000/3000 | Max reward: 94.00 | Avg reward: 1.72 | Recent avg: 1.35 | Consistency ratio: 0.04 | Entropy coef: 0.013
Model saved to ./models/models_o1/reinforce_ep2000.pth


Training REINFORCE Agent:  70%|███████   | 2100/3000 [17:35<05:36,  2.68it/s]

Episode 2100/3000 | Max reward: 94.00 | Avg reward: 1.68 | Recent avg: 1.00 | Consistency ratio: 1.00 | Entropy coef: 0.013


Training REINFORCE Agent:  73%|███████▎  | 2200/3000 [18:28<04:31,  2.94it/s]

Episode 2200/3000 | Max reward: 94.00 | Avg reward: 1.72 | Recent avg: 2.41 | Consistency ratio: 0.03 | Entropy coef: 0.013


Training REINFORCE Agent:  76%|███████▋  | 2291/3000 [19:15<04:38,  2.55it/s]

Reducing exploration due to stagnation (entropy coef: 0.010)


Training REINFORCE Agent:  77%|███████▋  | 2300/3000 [19:18<04:07,  2.83it/s]

Episode 2300/3000 | Max reward: 94.00 | Avg reward: 1.72 | Recent avg: 1.81 | Consistency ratio: 0.02 | Entropy coef: 0.010


Training REINFORCE Agent:  78%|███████▊  | 2333/3000 [19:41<07:06,  1.56it/s]

New best recent average: 4.59 (consistency ratio: 0.02)
Model saved to ./models/models_o1/reinforce_best.pth
Max reward: 259.00 at episode 2334


Training REINFORCE Agent:  80%|████████  | 2400/3000 [20:48<03:11,  3.14it/s]  

Episode 2400/3000 | Max reward: 259.00 | Avg reward: 1.83 | Recent avg: 4.29 | Consistency ratio: 0.02 | Entropy coef: 0.010


Training REINFORCE Agent:  80%|████████  | 2405/3000 [20:54<17:04,  1.72s/it]

New best recent average: 4.94 (consistency ratio: 0.02)
Model saved to ./models/models_o1/reinforce_best.pth


Training REINFORCE Agent:  80%|████████  | 2413/3000 [21:02<16:55,  1.73s/it]

New best recent average: 5.73 (consistency ratio: 0.02)
Model saved to ./models/models_o1/reinforce_best.pth


Training REINFORCE Agent:  83%|████████▎ | 2501/3000 [21:41<09:03,  1.09s/it]

Episode 2500/3000 | Max reward: 259.00 | Avg reward: 1.89 | Recent avg: 3.37 | Consistency ratio: 0.04 | Entropy coef: 0.010


Training REINFORCE Agent:  87%|████████▋ | 2601/3000 [22:32<01:35,  4.16it/s]

Episode 2600/3000 | Max reward: 259.00 | Avg reward: 1.92 | Recent avg: 2.69 | Consistency ratio: 0.03 | Entropy coef: 0.010


Training REINFORCE Agent:  90%|█████████ | 2700/3000 [23:19<02:14,  2.23it/s]

Episode 2700/3000 | Max reward: 259.00 | Avg reward: 1.94 | Recent avg: 2.39 | Consistency ratio: 0.03 | Entropy coef: 0.010


Training REINFORCE Agent:  93%|█████████▎| 2800/3000 [24:08<01:16,  2.61it/s]

Episode 2800/3000 | Max reward: 259.00 | Avg reward: 1.91 | Recent avg: 1.28 | Consistency ratio: 0.04 | Entropy coef: 0.010


Training REINFORCE Agent:  94%|█████████▎| 2808/3000 [24:10<01:06,  2.90it/s]

Max reward: 289.00 at episode 2809


Training REINFORCE Agent:  97%|█████████▋| 2900/3000 [25:37<00:40,  2.44it/s]

Episode 2900/3000 | Max reward: 289.00 | Avg reward: 1.98 | Recent avg: 3.88 | Consistency ratio: 0.01 | Entropy coef: 0.010


Training REINFORCE Agent:  97%|█████████▋| 2914/3000 [25:42<00:38,  2.22it/s]

Reducing exploration due to stagnation (entropy coef: 0.010)


Training REINFORCE Agent: 100%|██████████| 3000/3000 [26:36<00:00,  1.88it/s]

Episode 3000/3000 | Max reward: 289.00 | Avg reward: 2.02 | Recent avg: 3.26 | Consistency ratio: 0.03 | Entropy coef: 0.010
Model saved to ./models/models_o1/reinforce_ep3000.pth
Training completed. Avg reward: 2.02





[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 9,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 8,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 8,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


## Save Model Weights if Desired
#### Highest reward runs during training are automatically saved

In [None]:
agent.save_model(
    model_path=MODEL_PATH,
)

## Evaluate Agent Performance

In [27]:
eval_env = create_env(
    config_filepath=ENV_CONFIG,
    render_mode='rgb_array',
)
eval_env.reset(seed=SEED)

# Display env configs
for key in eval_env.config.keys():
    print(f'{key}: {eval_env.config[key]}')

observation: {'type': 'Kinematics', 'vehicles_count': 10, 'features': ['presence', 'x', 'y', 'vx', 'vy', 'cos_h', 'sin_h'], 'features_range': {'x': [-100, 100], 'y': [-100, 100], 'vx': [-20, 20], 'vy': [-20, 20]}, 'absolute': False, 'sorted': True, 'normalize': True, 'include_road_info': True, 'include_vehicle_info': True, 'include_goal_info': True, 'history_length': 5}
action: {'type': 'ContinuousAction', 'continuous': True, 'normalize': True, 'clip_actions': True}
simulation_frequency: 10
policy_frequency: 10
other_vehicles_type: highway_env.vehicle.behavior.IDMVehicle
screen_width: 600
screen_height: 600
centering_position: [0.5, 0.6]
scaling: 7.15
show_trajectories: False
render_agent: True
offscreen_rendering: False
manual_control: False
real_time_rendering: False
duration: 100
destination: o1
controlled_vehicles: 1
initial_vehicle_count: 10
spawn_probability: 0.6
collision_reward: -20.0
high_speed_reward: 0.1
arrived_reward: 50.0
reward_speed_range: [0.0, 3.0]
normalize_reward: T

In [28]:
agent.load_model(
    model_path='./models/models_simple/reinforce_best.pth',
)

Model loaded from ./models/models_simple/reinforce_best.pth


True

In [30]:
agent.evaluate(
    env=eval_env,
    num_episodes=10,
    top_k=5,
)

Evaluating REINFORCE Agent:   0%|          | 0/10 [00:00<?, ?it/s]

Unexpected error during evaluation: too many values to unpack (expected 2)





[]