# A2C (Advantage Actor-Critic)

In [None]:
### One-cell script to run evaluation
import yaml
import numpy as np
from env import create_env
from algorithms.a2c import A2CAgent

# Seed handed to the environment reset below.
SEED = 42

# YAML files describing the environment and the A2C settings.
ENV_CONFIG, MODEL_CONFIG = './configs/env.yaml', './configs/a2c.yaml'

# Checkpoint to evaluate ('./models/a2c_ep1000.pth' is an alternative).
MODEL_PATH = './models/a2c.pth'

# Build the evaluation environment from its YAML config, requesting
# 'rgb_array' rendering, and seed its first reset.
eval_env = create_env(
    config_filepath=ENV_CONFIG,
    render_mode='rgb_array',
)
eval_env.reset(seed=SEED)

# Display env configs
print("Environment configuration:")
for key, value in eval_env.config.items():
    print(f'{key}: {value}')

# Load the model/evaluation settings; the file is closed before printing.
with open(MODEL_CONFIG, 'r') as file:
    config = yaml.safe_load(file)
print("Model configuration:")
print(config)

# np.prod returns a NumPy scalar (a float for an empty shape), so cast to a
# plain Python int before handing the size to the agent.
state_size = int(np.prod(eval_env.observation_space.shape))
action_size = eval_env.action_space.shape[0]
print(f"State size: {state_size}, Action size: {action_size}")
agent = A2CAgent(
    state_size=state_size,
    hidden_size=config['hidden_size'],
    action_size=action_size,
    learning_rate=config['learning_rate'],
    gamma=config['gamma'],
    model_path=MODEL_PATH,
)

# Restore the saved checkpoint into the freshly constructed agent.
agent.load_model(
    model_path=MODEL_PATH,
)

# Run the evaluation episodes; kept as the cell's last expression so any
# returned value is still displayed.
agent.evaluate(
    env=eval_env,
    num_episodes=config['num_episodes_eval'],
    top_k=config['top_k'],
)

  from pkg_resources import resource_stream, resource_exists


Environment configuration:
observation: {'type': 'Kinematics', 'vehicles_count': 10, 'features': ['presence', 'x', 'y', 'vx', 'vy', 'cos_h', 'sin_h'], 'features_range': {'x': [-100, 100], 'y': [-100, 100], 'vx': [-20, 20], 'vy': [-20, 20]}, 'absolute': False, 'sorted': True}
action: {'type': 'ContinuousAction'}
simulation_frequency: 15
policy_frequency: 2
other_vehicles_type: highway_env.vehicle.behavior.IDMVehicle
screen_width: 600
screen_height: 600
centering_position: [0.5, 0.6]
scaling: 7.15
show_trajectories: False
render_agent: True
offscreen_rendering: False
manual_control: False
real_time_rendering: False
duration: 50
destination: o1
controlled_vehicles: 1
initial_vehicle_count: 10
spawn_probability: 0.6
collision_reward: -100.0
high_speed_reward: 0.0
arrived_reward: 50.0
reward_speed_range: [0.0, 3.0]
normalize_reward: False
offroad_terminal: True
vehicle: {'acceleration': 3.0, 'steering': 0.4}
collision_terminal: True
Model configuration:
{'hidden_size': 128, 'learning_rate':

Evaluating A2C Agent: 100%|██████████| 100/100 [00:35<00:00,  2.86it/s]


: 

In [1]:
import yaml
import numpy as np
from env import create_env
from algorithms.a2c import A2CAgent
# Load (or reload) IPython's autoreload extension and switch it to mode 2.
# NOTE(review): presumably so edits to the project modules imported above are
# picked up without restarting the kernel — confirm.
%reload_ext autoreload
%autoreload 2

# Shared notebook settings.
SEED = 42                            # passed to env.reset(seed=...)
MODEL_PATH = './models/a2c.pth'      # where the agent's model is saved
MODEL_CONFIG = './configs/a2c.yaml'  # A2C settings file
ENV_CONFIG = './configs/env.yaml'    # environment settings file

  from pkg_resources import resource_stream, resource_exists


In [2]:
# Training environment: built from the env YAML with rendering switched off.
env = create_env(config_filepath=ENV_CONFIG, render_mode=None)
env.reset(seed=SEED)

# Print every environment setting as one "key: value" line.
for key, value in env.config.items():
    print(f'{key}: {value}')

observation: {'type': 'Kinematics', 'vehicles_count': 10, 'features': ['presence', 'x', 'y', 'vx', 'vy', 'cos_h', 'sin_h'], 'features_range': {'x': [-100, 100], 'y': [-100, 100], 'vx': [-20, 20], 'vy': [-20, 20]}, 'absolute': False, 'sorted': True}
action: {'type': 'ContinuousAction'}
simulation_frequency: 15
policy_frequency: 2
other_vehicles_type: highway_env.vehicle.behavior.IDMVehicle
screen_width: 600
screen_height: 600
centering_position: [0.5, 0.6]
scaling: 7.15
show_trajectories: False
render_agent: True
offscreen_rendering: False
manual_control: False
real_time_rendering: False
duration: 50
destination: o1
controlled_vehicles: 1
initial_vehicle_count: 10
spawn_probability: 0.6
collision_reward: -100.0
high_speed_reward: 0.0
arrived_reward: 50.0
reward_speed_range: [0.0, 3.0]
normalize_reward: False
offroad_terminal: True
vehicle: {'acceleration': 3.0, 'steering': 0.4}
collision_terminal: True


## Load Model Configs

In [3]:
# Parse the model YAML file into `config`.
with open(MODEL_CONFIG, 'r') as file:
    config = yaml.safe_load(file)

# Echo each setting on its own line.
for key, value in config.items():
    print(f'{key}: {value}')

hidden_size: 128
learning_rate: 0.005
gamma: 0.95
num_episodes_train: 3000
print_freq: 100
save_freq: 1000
num_episodes_eval: 100
top_k: 5


## Create Agent

In [4]:
# Flattened observation size. np.prod returns a NumPy scalar (a float for an
# empty shape), so cast to a plain Python int before handing it to the agent.
state_size = int(np.prod(env.observation_space.shape))
# Number of action components, read from the first axis of the action space.
action_size = env.action_space.shape[0]
print(f"State size: {state_size}, Action size: {action_size}")

# Build the agent from the loaded model config.
agent = A2CAgent(
    state_size=state_size,
    hidden_size=config['hidden_size'],
    action_size=action_size,
    learning_rate=config['learning_rate'],
    gamma=config['gamma'],
    model_path=MODEL_PATH,
)

State size: 70, Action size: 2


## Train Agent

In [5]:
# Train on `env`; the episode count and the print/save frequencies all come
# from the loaded model config.
agent.train(
    env=env,
    num_episodes=config['num_episodes_train'],
    print_freq=config['print_freq'],
    save_freq=config['save_freq'],
)

Training A2C Agent:   0%|          | 1/3000 [00:00<05:21,  9.32it/s]

Max reward: 1.00 at episode 1


Training A2C Agent:   3%|▎         | 100/3000 [00:22<09:05,  5.32it/s]

Episode 100/3000 | Max reward: 1.00 | Avg reward: 1.00


Training A2C Agent:   7%|▋         | 200/3000 [00:37<06:39,  7.00it/s]

Episode 200/3000 | Max reward: 1.00 | Avg reward: 1.00


Training A2C Agent:  10%|█         | 301/3000 [00:59<06:29,  6.94it/s]

Episode 300/3000 | Max reward: 1.00 | Avg reward: 1.00


Training A2C Agent:  13%|█▎        | 401/3000 [01:18<06:49,  6.35it/s]

Episode 400/3000 | Max reward: 1.00 | Avg reward: 1.00


Training A2C Agent:  17%|█▋        | 501/3000 [01:36<07:25,  5.61it/s]

Episode 500/3000 | Max reward: 1.00 | Avg reward: 1.00


Training A2C Agent:  20%|██        | 601/3000 [01:56<06:43,  5.95it/s]

Episode 600/3000 | Max reward: 1.00 | Avg reward: 1.00


Training A2C Agent:  21%|██        | 626/3000 [02:02<07:02,  5.62it/s]

Max reward: 2.00 at episode 625


Training A2C Agent:  23%|██▎       | 701/3000 [02:15<07:19,  5.23it/s]

Episode 700/3000 | Max reward: 2.00 | Avg reward: 1.00


Training A2C Agent:  25%|██▍       | 737/3000 [02:21<04:54,  7.70it/s]

Max reward: 3.00 at episode 736


Training A2C Agent:  27%|██▋       | 800/3000 [02:33<06:38,  5.52it/s]

Episode 800/3000 | Max reward: 3.00 | Avg reward: 1.01


Training A2C Agent:  30%|███       | 901/3000 [03:00<08:51,  3.95it/s]

Episode 900/3000 | Max reward: 3.00 | Avg reward: 1.01


Training A2C Agent:  33%|███▎      | 1001/3000 [03:20<06:42,  4.97it/s]

Episode 1000/3000 | Max reward: 3.00 | Avg reward: 1.01
Model saved to ./models/a2c_ep1000.pth


Training A2C Agent:  37%|███▋      | 1101/3000 [03:43<05:03,  6.27it/s]

Episode 1100/3000 | Max reward: 3.00 | Avg reward: 1.01


Training A2C Agent:  40%|███▉      | 1192/3000 [04:10<09:02,  3.33it/s]

Max reward: 5.00 at episode 1192
Model saved to ./models/a2c.pth


Training A2C Agent:  40%|████      | 1201/3000 [04:13<07:27,  4.02it/s]

Episode 1200/3000 | Max reward: 5.00 | Avg reward: 1.01


Training A2C Agent:  43%|████▎     | 1300/3000 [04:42<13:09,  2.15it/s]

Episode 1300/3000 | Max reward: 5.00 | Avg reward: 1.02


Training A2C Agent:  47%|████▋     | 1401/3000 [05:10<04:51,  5.48it/s]

Episode 1400/3000 | Max reward: 5.00 | Avg reward: 1.02


Training A2C Agent:  50%|█████     | 1500/3000 [05:27<04:23,  5.69it/s]

Episode 1500/3000 | Max reward: 5.00 | Avg reward: 1.03


Training A2C Agent:  53%|█████▎    | 1602/3000 [05:45<02:57,  7.86it/s]

Episode 1600/3000 | Max reward: 5.00 | Avg reward: 1.03


Training A2C Agent:  57%|█████▋    | 1701/3000 [06:00<02:47,  7.76it/s]

Episode 1700/3000 | Max reward: 5.00 | Avg reward: 1.04


Training A2C Agent:  60%|██████    | 1801/3000 [06:16<02:56,  6.80it/s]

Episode 1800/3000 | Max reward: 5.00 | Avg reward: 1.04


Training A2C Agent:  63%|██████▎   | 1901/3000 [06:32<02:29,  7.36it/s]

Episode 1900/3000 | Max reward: 5.00 | Avg reward: 1.04


Training A2C Agent:  67%|██████▋   | 2000/3000 [06:54<02:11,  7.61it/s]

Episode 2000/3000 | Max reward: 5.00 | Avg reward: 1.04
Model saved to ./models/a2c_ep2000.pth


Training A2C Agent:  70%|███████   | 2101/3000 [07:14<02:54,  5.16it/s]

Episode 2100/3000 | Max reward: 5.00 | Avg reward: 1.04


Training A2C Agent:  73%|███████▎  | 2200/3000 [08:08<06:03,  2.20it/s]

Episode 2200/3000 | Max reward: 5.00 | Avg reward: 1.04


Training A2C Agent:  77%|███████▋  | 2301/3000 [08:22<01:12,  9.65it/s]

Episode 2300/3000 | Max reward: 5.00 | Avg reward: 1.04


Training A2C Agent:  80%|████████  | 2401/3000 [08:34<01:10,  8.51it/s]

Episode 2400/3000 | Max reward: 5.00 | Avg reward: 1.04


Training A2C Agent:  83%|████████▎ | 2501/3000 [08:45<00:48, 10.31it/s]

Episode 2500/3000 | Max reward: 5.00 | Avg reward: 1.03


Training A2C Agent:  87%|████████▋ | 2601/3000 [08:55<00:40,  9.94it/s]

Episode 2600/3000 | Max reward: 5.00 | Avg reward: 1.03


Training A2C Agent:  90%|█████████ | 2701/3000 [09:05<00:36,  8.26it/s]

Episode 2700/3000 | Max reward: 5.00 | Avg reward: 1.03


Training A2C Agent:  93%|█████████▎| 2801/3000 [09:16<00:21,  9.23it/s]

Episode 2800/3000 | Max reward: 5.00 | Avg reward: 1.03


Training A2C Agent:  97%|█████████▋| 2899/3000 [09:26<00:11,  8.99it/s]

Episode 2900/3000 | Max reward: 5.00 | Avg reward: 1.03


Training A2C Agent: 100%|██████████| 3000/3000 [09:48<00:00,  5.10it/s]

Episode 3000/3000 | Max reward: 5.00 | Avg reward: 1.03
Model saved to ./models/a2c_ep3000.pth
Training completed. Avg reward: 1.03





## Save Model Weights if Desired
#### Highest-reward runs during training are saved automatically

In [7]:
# Save the agent's model to MODEL_PATH.
# NOTE(review): the training log shows a checkpoint written to this same path
# when a new max reward was reached, so this call presumably overwrites that
# checkpoint with the agent's current state — confirm that is intended.
agent.save_model(
    model_path=MODEL_PATH,
)

Model saved to ./models/a2c.pth


## Evaluate Agent Performance

In [8]:
# Evaluation environment: same env YAML, but with 'rgb_array' rendering.
eval_env = create_env(config_filepath=ENV_CONFIG, render_mode='rgb_array')
eval_env.reset(seed=SEED)

# Print every environment setting as one "key: value" line.
for key, value in eval_env.config.items():
    print(f'{key}: {value}')

observation: {'type': 'Kinematics', 'vehicles_count': 10, 'features': ['presence', 'x', 'y', 'vx', 'vy', 'cos_h', 'sin_h'], 'features_range': {'x': [-100, 100], 'y': [-100, 100], 'vx': [-20, 20], 'vy': [-20, 20]}, 'absolute': False, 'sorted': True}
action: {'type': 'ContinuousAction'}
simulation_frequency: 15
policy_frequency: 2
other_vehicles_type: highway_env.vehicle.behavior.IDMVehicle
screen_width: 600
screen_height: 600
centering_position: [0.5, 0.6]
scaling: 7.15
show_trajectories: False
render_agent: True
offscreen_rendering: False
manual_control: False
real_time_rendering: False
duration: 50
destination: o1
controlled_vehicles: 1
initial_vehicle_count: 10
spawn_probability: 0.6
collision_reward: -100.0
high_speed_reward: 0.0
arrived_reward: 50.0
reward_speed_range: [0.0, 3.0]
normalize_reward: False
offroad_terminal: True
vehicle: {'acceleration': 3.0, 'steering': 0.4}
collision_terminal: True


In [9]:
# Evaluate the agent on the rendered evaluation environment; the episode
# count and `top_k` come from the loaded model config.
# NOTE(review): `top_k` presumably selects how many of the best episodes are
# kept or reported — confirm against A2CAgent.evaluate.
agent.evaluate(
    env=eval_env,
    num_episodes=config['num_episodes_eval'],
    top_k=config['top_k'],
)

Evaluating A2C Agent: 100%|██████████| 100/100 [00:27<00:00,  3.67it/s]


## FLOPs Measurement

In [14]:
from fvcore.nn import FlopCountAnalysis
import torch

a2c_agent = agent

# Put the dummy input on the device the model's weights actually live on.
# Choosing "cuda" merely because it is available fails with a device mismatch
# when the agent keeps its model on the CPU.
model_param = next(a2c_agent.model.parameters(), None)
device = model_param.device if model_param is not None else torch.device("cpu")

# One example observation: a batch of 1 with `state_size` features.
inputs = (torch.randn(1, int(state_size), device=device),)  # Example input shape

# fvcore may warn about operators it cannot count; those contribute nothing
# to the totals printed below.
flops = FlopCountAnalysis(a2c_agent.model, inputs)
print(f"FLOPs: {flops.total()}")  # Total FLOPs
print(flops.by_operator())        # FLOPs by operator (optional)

Unsupported operator aten::tanh encountered 2 time(s)


FLOPs: 25984
Counter({'linear': 25984})
