# REINFORCE (Policy Gradient)

In [None]:
### One-cell script to run evaluation
import yaml
import numpy as np
from env import create_env
from algorithms.reinforce import REINFORCEAgent

# --- Settings for this standalone evaluation run ---
MODEL_PATH = './models/reinforce_ep3000.pth'
MODEL_CONFIG = './configs/reinforce.yaml'
ENV_CONFIG = './configs/env.yaml'
SEED = 42

# Build the evaluation environment (frames rendered as RGB arrays) and
# pass the seed to its first reset.
eval_env = create_env(config_filepath=ENV_CONFIG, render_mode='rgb_array')
eval_env.reset(seed=SEED)

# Echo every environment setting, one "key: value" pair per line.
print("Environment configuration:")
for name, value in eval_env.config.items():
    print(f'{name}: {value}')

# Read the agent settings from the YAML file, then show them as a single dict.
with open(MODEL_CONFIG, 'r') as config_file:
    config = yaml.safe_load(config_file)
print("Model configuration:")
print(config)

# State size is the product of the observation-space shape; action size is
# the first dimension of the action-space shape.
obs_shape = eval_env.observation_space.shape
state_size = np.prod(obs_shape)
action_size = eval_env.action_space.shape[0]
print(f"State size: {state_size}, Action size: {action_size}")

# Hyperparameters forwarded unchanged from the config to the agent.
hyperparams = {
    name: config[name]
    for name in ('hidden_size', 'learning_rate', 'gamma')
}
agent = REINFORCEAgent(
    state_size=state_size,
    action_size=action_size,
    model_path=MODEL_PATH,
    **hyperparams,
)

# The same checkpoint path is handed to load_model before evaluating.
agent.load_model(model_path=MODEL_PATH)

agent.evaluate(
    env=eval_env,
    num_episodes=config['num_episodes_eval'],
    top_k=config['top_k'],
)

In [1]:
import yaml
import numpy as np
from env import create_env
from algorithms.reinforce import REINFORCEAgent
%reload_ext autoreload
%autoreload 2

# File locations used by the cells below.
ENV_CONFIG: str = './configs/env.yaml'          # given to create_env as config_filepath
MODEL_CONFIG: str = './configs/reinforce.yaml'  # parsed with yaml.safe_load
MODEL_PATH: str = './models/reinforce.pth'      # given to the agent as model_path

# Seed passed to the environments' reset() calls.
SEED: int = 42

In [2]:
# Environment handed to agent.train(); created with render_mode=None.
env = create_env(config_filepath=ENV_CONFIG, render_mode=None)
env.reset(seed=SEED)

# Print each environment setting as "key: value".
for name, value in env.config.items():
    print(f'{name}: {value}')

observation: {'type': 'Kinematics', 'vehicles_count': 10, 'features': ['presence', 'x', 'y', 'vx', 'vy', 'cos_h', 'sin_h'], 'features_range': {'x': [-100, 100], 'y': [-100, 100], 'vx': [-20, 20], 'vy': [-20, 20]}, 'absolute': False, 'sorted': True}
action: {'type': 'ContinuousAction'}
simulation_frequency: 15
policy_frequency: 2
other_vehicles_type: highway_env.vehicle.behavior.IDMVehicle
screen_width: 600
screen_height: 600
centering_position: [0.5, 0.6]
scaling: 7.15
show_trajectories: False
render_agent: True
offscreen_rendering: False
manual_control: False
real_time_rendering: False
duration: 50
destination: o1
controlled_vehicles: 1
initial_vehicle_count: 10
spawn_probability: 0.6
collision_reward: -100.0
high_speed_reward: 0.0
arrived_reward: 50.0
reward_speed_range: [0.0, 3.0]
normalize_reward: False
offroad_terminal: True
vehicle: {'acceleration': 3.0, 'steering': 0.4}
collision_terminal: True


## Load Model Configs

In [3]:
# Parse the agent's YAML settings into `config`.
with open(MODEL_CONFIG, 'r') as config_file:
    config = yaml.safe_load(config_file)

# List each setting as "key: value".
for name, value in config.items():
    print(f'{name}: {value}')

hidden_size: 64
learning_rate: 0.001
gamma: 0.8
num_episodes_train: 3000
print_freq: 100
save_freq: 1000
num_episodes_eval: 100
top_k: 5


## Create Agent

In [4]:
# State size is the product of the observation-space shape; action size is
# the first dimension of the action-space shape.
obs_shape = env.observation_space.shape
state_size = np.prod(obs_shape)
action_size = env.action_space.shape[0]
print(f"State size: {state_size}, Action size: {action_size}")

# Hyperparameters forwarded unchanged from the YAML config to the agent.
hyperparams = {
    name: config[name]
    for name in ('hidden_size', 'learning_rate', 'gamma')
}
agent = REINFORCEAgent(
    state_size=state_size,
    action_size=action_size,
    model_path=MODEL_PATH,
    **hyperparams,
)

State size: 70, Action size: 2


## Train Agent

In [5]:
# Episode count and print/save frequencies all come from the YAML config.
train_schedule = {
    'num_episodes': config['num_episodes_train'],
    'print_freq': config['print_freq'],
    'save_freq': config['save_freq'],
}
agent.train(env=env, **train_schedule)

Training REINFORCE Agent:   0%|          | 1/3000 [00:00<08:23,  5.95it/s]

Max reward: 2.00 at episode 1
Model saved to ./models/reinforce.pth


Training REINFORCE Agent:   1%|▏         | 38/3000 [00:04<05:46,  8.55it/s]

Max reward: 3.00 at episode 37
Model saved to ./models/reinforce.pth


Training REINFORCE Agent:   3%|▎         | 101/3000 [00:11<06:31,  7.41it/s]

Episode 100/3000 | Max reward: 3.00 | Avg reward: 1.07


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:   5%|▍         | 143/3000 [00:17<12:40,  3.76it/s]

Max reward: 4.00 at episode 143
Model saved to ./models/reinforce.pth


Training REINFORCE Agent:   5%|▍         | 145/3000 [00:19<23:43,  2.00it/s]

Max reward: 19.00 at episode 145
Model saved to ./models/reinforce.pth


Training REINFORCE Agent:   7%|▋         | 200/3000 [00:54<32:12,  1.45it/s]

Episode 200/3000 | Max reward: 19.00 | Avg reward: 2.71


Training REINFORCE Agent:   7%|▋         | 213/3000 [01:02<26:06,  1.78it/s]

Max reward: 20.00 at episode 212
Model saved to ./models/reinforce.pth


Training REINFORCE Agent:  10%|▉         | 289/3000 [01:56<44:51,  1.01it/s]  

Max reward: 21.00 at episode 289
Model saved to ./models/reinforce.pth


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  10%|█         | 300/3000 [02:04<25:31,  1.76it/s]

Episode 300/3000 | Max reward: 21.00 | Avg reward: 4.24


Training REINFORCE Agent:  13%|█▎        | 400/3000 [03:15<30:41,  1.41it/s]

Episode 400/3000 | Max reward: 21.00 | Avg reward: 5.12


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  17%|█▋        | 500/3000 [04:22<21:27,  1.94it/s]

Episode 500/3000 | Max reward: 21.00 | Avg reward: 5.44


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  20%|█▉        | 593/3000 [05:30<22:25,  1.79it/s]

Max reward: 22.00 at episode 592
Model saved to ./models/reinforce.pth


Training REINFORCE Agent:  20%|██        | 600/3000 [05:34<19:51,  2.01it/s]

Episode 600/3000 | Max reward: 22.00 | Avg reward: 5.82


Training REINFORCE Agent:  23%|██▎       | 700/3000 [06:38<26:16,  1.46it/s]

Episode 700/3000 | Max reward: 22.00 | Avg reward: 5.87


Training REINFORCE Agent:  27%|██▋       | 800/3000 [07:45<31:18,  1.17it/s]

Episode 800/3000 | Max reward: 22.00 | Avg reward: 6.11


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  30%|███       | 900/3000 [08:50<17:33,  1.99it/s]

Episode 900/3000 | Max reward: 22.00 | Avg reward: 6.25


Training REINFORCE Agent:  33%|███▎      | 1000/3000 [09:49<24:56,  1.34it/s]

Episode 1000/3000 | Max reward: 22.00 | Avg reward: 6.25
Model saved to ./models/reinforce_ep1000.pth


Training REINFORCE Agent:  37%|███▋      | 1100/3000 [10:52<21:01,  1.51it/s]

Episode 1100/3000 | Max reward: 22.00 | Avg reward: 6.30


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  40%|████      | 1200/3000 [11:55<16:57,  1.77it/s]

Episode 1200/3000 | Max reward: 22.00 | Avg reward: 6.34


Training REINFORCE Agent:  43%|████▎     | 1300/3000 [13:07<16:33,  1.71it/s]

Episode 1300/3000 | Max reward: 22.00 | Avg reward: 6.49


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  47%|████▋     | 1400/3000 [14:19<14:34,  1.83it/s]

Episode 1400/3000 | Max reward: 22.00 | Avg reward: 6.60


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  50%|█████     | 1500/3000 [15:25<15:20,  1.63it/s]

Episode 1500/3000 | Max reward: 22.00 | Avg reward: 6.71


Training REINFORCE Agent:  53%|█████▎    | 1600/3000 [16:29<23:36,  1.01s/it]

Episode 1600/3000 | Max reward: 22.00 | Avg reward: 6.73


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  57%|█████▋    | 1700/3000 [17:35<10:59,  1.97it/s]

Episode 1700/3000 | Max reward: 22.00 | Avg reward: 6.75


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  60%|██████    | 1800/3000 [18:37<11:10,  1.79it/s]

Episode 1800/3000 | Max reward: 22.00 | Avg reward: 6.74


Training REINFORCE Agent:  63%|██████▎   | 1900/3000 [19:46<15:34,  1.18it/s]

Episode 1900/3000 | Max reward: 22.00 | Avg reward: 6.82


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  67%|██████▋   | 2000/3000 [20:57<16:07,  1.03it/s]

Episode 2000/3000 | Max reward: 22.00 | Avg reward: 6.89
Model saved to ./models/reinforce_ep2000.pth


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  70%|███████   | 2100/3000 [22:04<11:00,  1.36it/s]

Episode 2100/3000 | Max reward: 22.00 | Avg reward: 6.92


Training REINFORCE Agent:  73%|███████▎  | 2200/3000 [23:09<05:36,  2.38it/s]

Episode 2200/3000 | Max reward: 22.00 | Avg reward: 6.94


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  77%|███████▋  | 2300/3000 [24:08<06:22,  1.83it/s]

Episode 2300/3000 | Max reward: 22.00 | Avg reward: 6.91


Training REINFORCE Agent:  80%|████████  | 2400/3000 [25:14<07:35,  1.32it/s]

Episode 2400/3000 | Max reward: 22.00 | Avg reward: 6.92


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  83%|████████▎ | 2500/3000 [26:21<05:58,  1.39it/s]

Episode 2500/3000 | Max reward: 22.00 | Avg reward: 6.97


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  87%|████████▋ | 2600/3000 [27:28<05:20,  1.25it/s]

Episode 2600/3000 | Max reward: 22.00 | Avg reward: 7.01


Training REINFORCE Agent:  90%|█████████ | 2700/3000 [28:30<02:48,  1.78it/s]

Episode 2700/3000 | Max reward: 22.00 | Avg reward: 7.02


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent:  93%|█████████▎| 2800/3000 [29:34<02:14,  1.49it/s]

Episode 2800/3000 | Max reward: 22.00 | Avg reward: 7.01


Training REINFORCE Agent:  97%|█████████▋| 2900/3000 [30:48<00:59,  1.68it/s]

Episode 2900/3000 | Max reward: 22.00 | Avg reward: 7.04


  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
  discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
Training REINFORCE Agent: 100%|██████████| 3000/3000 [31:59<00:00,  1.56it/s]

Episode 3000/3000 | Max reward: 22.00 | Avg reward: 7.07
Model saved to ./models/reinforce_ep3000.pth
Training completed. Avg reward: 7.07





## Save Model Weights if Desired
#### The highest-reward runs during training are saved automatically to `MODEL_PATH`; the cell below writes to that same path

In [None]:
# NOTE: during training the agent already writes its best-reward weights to
# MODEL_PATH (see the "Max reward ... / Model saved to ./models/reinforce.pth"
# lines in the training log). Saving here targets the same file, so it
# replaces that checkpoint with whatever the agent holds now (presumably the
# end-of-training weights).
#
# The save is therefore opt-in: set the flag to True to perform it. Leaving
# it False keeps "Restart & Run All" from silently overwriting the
# best-reward checkpoint.
SAVE_CURRENT_WEIGHTS = False

if SAVE_CURRENT_WEIGHTS:
    agent.save_model(
        model_path=MODEL_PATH,
    )

## Evaluate Agent Performance

In [6]:
# Separate environment for evaluation, created with render_mode='rgb_array'.
eval_env = create_env(config_filepath=ENV_CONFIG, render_mode='rgb_array')
eval_env.reset(seed=SEED)

# Print each environment setting as "key: value".
for name, value in eval_env.config.items():
    print(f'{name}: {value}')

observation: {'type': 'Kinematics', 'vehicles_count': 10, 'features': ['presence', 'x', 'y', 'vx', 'vy', 'cos_h', 'sin_h'], 'features_range': {'x': [-100, 100], 'y': [-100, 100], 'vx': [-20, 20], 'vy': [-20, 20]}, 'absolute': False, 'sorted': True}
action: {'type': 'ContinuousAction'}
simulation_frequency: 15
policy_frequency: 2
other_vehicles_type: highway_env.vehicle.behavior.IDMVehicle
screen_width: 600
screen_height: 600
centering_position: [0.5, 0.6]
scaling: 7.15
show_trajectories: False
render_agent: True
offscreen_rendering: False
manual_control: False
real_time_rendering: False
duration: 50
destination: o1
controlled_vehicles: 1
initial_vehicle_count: 10
spawn_probability: 0.6
collision_reward: -100.0
high_speed_reward: 0.0
arrived_reward: 50.0
reward_speed_range: [0.0, 3.0]
normalize_reward: False
offroad_terminal: True
vehicle: {'acceleration': 3.0, 'steering': 0.4}
collision_terminal: True


In [None]:
# Evaluation settings are read from the YAML config.
n_eval_episodes = config['num_episodes_eval']
eval_top_k = config['top_k']
agent.evaluate(env=eval_env, num_episodes=n_eval_episodes, top_k=eval_top_k)

Evaluating REINFORCE Agent: 100%|██████████| 100/100 [01:10<00:00,  1.42it/s]


: 