In [45]:
%load_ext autoreload
%autoreload 2

import torch

from rl4co.envs import CVRPTWEnv, EVRPTWEnv 
from rl4co.models import AttentionModelPolicy, REINFORCE, SymNCO, PPO, POMO
from rl4co.utils.trainer import RL4COTrainer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
# Generator settings used by the EVRPTW environment when sampling instances.
evrptw_generator_params = dict(
    num_loc=10,
    num_station=3,
    vehicle_limit=3,
    vehicle_speed=5,
    max_time=0.9,
    horizon=1.2,
    fuel_consumption_rate=0.2,
    inverse_recharge_rate=0.2,
)
env = EVRPTWEnv(generator_params=evrptw_generator_params)

In [47]:
# Policy network: attention-based encoder-decoder, built for the env created above.
policy_kwargs = {
    "env_name": env.name,
    "embed_dim": 256,
    "num_encoder_layers": 3,
    "num_heads": 8,
}
policy = AttentionModelPolicy(**policy_kwargs)

# Alternative RL models (disabled), kept for reference.
# REINFORCE with a greedy rollout baseline:
# model = REINFORCE(env,
#                   policy,
#                   baseline="rollout",
#                   batch_size=512,
#                   train_data_size=500_000,
#                   val_data_size=100_000,
#                   optimizer_kwargs={"lr": 1e-4},
#                   )

# PPO:
# model = PPO(env,
#             policy,
#             # baseline="rollout",
#             batch_size=512,
#             train_data_size=200_000,
#             val_data_size=100_000,
#             optimizer_kwargs={"lr": 1e-4},
#             )

# Active RL model: POMO wrapping the policy defined above.
model = POMO(
    env,
    policy,
    # baseline="rollout",
    batch_size=512,
    train_data_size=100_000,
    val_data_size=10_000,
    optimizer_kwargs={"lr": 1e-4},
)

In [48]:
# Greedy rollouts over the untrained policy; the results are kept as the
# baseline for the trained-vs-untrained comparison later in the notebook.

# Use the GPU when one is present, otherwise fall back to CPU so this cell
# does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
td_init = env.reset(batch_size=[100]).to(device)

policy = policy.to(device)
# Pure inference: skip building an autograd graph for the rollout.
with torch.no_grad():
    out = policy(td_init.clone(), env=env, phase="test", decode_type="greedy", return_actions=True)

actions_untrained = out['actions'].cpu().numpy()
rewards_untrained = out['reward'].cpu().numpy()

for i in range(td_init.batch_size[0]):
    # Cost is the negated reward; `.3f` prints three decimals, matching the
    # comparison printout after training.
    print(f"Problem {i+1} | Cost: {-rewards_untrained[i]:.3f}")
    # env.render(td_init[i], actions_untrained[i])
    print(actions_untrained[i])

Problem 1 | Cost: 4.829708
[ 2  7  1 10  6  5  9  0  3  8  4  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0]
Problem 2 | Cost: 15.444407
[10  2  6  5  3  0  7  1  9  8  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0]
Problem 3 | Cost: 6.450588
[ 6  7  9  5  1  0  2  3  4  0  8  0 10  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0]
Problem 4 | Cost: 27.571224
[ 4  1  2  0  6  5  3  9  0 11 10  8  0 11  0 11  0 11  0 11  0 11  0 11
  0 11  0 11  0 11  0 11  0 11  0 11]
Problem 5 | Cost: 5.629992
[ 6  8  1  9  7  0  4  5  3 10  2  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0]
Problem 6 | Cost: 5.914991
[ 4  8  9  6  1 10  7  0  2  3  0  5  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0]
Problem 7 | Cost: 4.970345
[ 5  6  1  3  7  2  0  8  9  4  0 10  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0]
Problem 8 | Cost: 6.100320
[ 2  

In [49]:
# Trainer configuration: 30 epochs on a single GPU device, with `logger=None`.
trainer_config = {
    "max_epochs": 30,
    "accelerator": "gpu",
    "devices": 1,
    "logger": None,
}
trainer = RL4COTrainer(**trainer_config)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [50]:
# Train the model; the run ends once the trainer's `max_epochs` limit is reached.
trainer.fit(model)

val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params
--------------------------------------------------
0 | env      | EVRPTWEnv            | 0     
1 | policy   | AttentionModelPolicy | 2.0 M 
2 | baseline | SharedBaseline       | 0     
--------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.914     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.


In [54]:
# Greedy rollouts over the trained policy on the same instances (`td_init`)
# that were used for the untrained baseline.
policy = model.policy.to(device)
# Pass `env` explicitly, exactly as the untrained rollout does, so both runs
# are evaluated against the environment configured in this notebook.
# NOTE(review): without `env`, the policy presumably falls back to an
# environment built from its `env_name` with default generator parameters —
# confirm against the rl4co policy `forward` implementation.
with torch.no_grad():
    out = policy(td_init.clone(), env=env, phase="test", decode_type="greedy", return_actions=True)
actions_trained = out['actions'].cpu().detach()
rewards_trained = out['reward'].cpu().detach()
# Plotting
import matplotlib.pyplot as plt
for i, td in enumerate(td_init):
    # fig, axs = plt.subplots(1,2, figsize=(11,5))
    # Costs are negated rewards; print before/after side by side per instance.
    print(f"Problem {i+1} | Untrained Cost: {-rewards_untrained[i]:.3f} | Trained Cost: {-rewards_trained[i]:.3f}")

    # env.render(td, actions_untrained[i], ax=axs[0])
    # env.render(td, actions_trained[i], ax=axs[1])
    # axs[0].set_title(f"Untrained | Cost = {-rewards_untrained[i].item():.3f}")
    # axs[1].set_title(r"Trained $\pi_\theta$" + f"| Cost = {-out['reward'][i].item():.3f}")

Problem 1 | Untrained Cost: 4.830 | Trained Cost: 3.592
Problem 2 | Untrained Cost: 15.444 | Trained Cost: 14.028
Problem 3 | Untrained Cost: 6.451 | Trained Cost: 4.617
Problem 4 | Untrained Cost: 27.571 | Trained Cost: 4.936
Problem 5 | Untrained Cost: 5.630 | Trained Cost: 4.304
Problem 6 | Untrained Cost: 5.915 | Trained Cost: 4.404
Problem 7 | Untrained Cost: 4.970 | Trained Cost: 3.891
Problem 8 | Untrained Cost: 6.100 | Trained Cost: 3.867
Problem 9 | Untrained Cost: 5.044 | Trained Cost: 3.904
Problem 10 | Untrained Cost: 8.841 | Trained Cost: 6.116
Problem 11 | Untrained Cost: 6.001 | Trained Cost: 4.208
Problem 12 | Untrained Cost: 4.815 | Trained Cost: 3.078
Problem 13 | Untrained Cost: 5.127 | Trained Cost: 4.073
Problem 14 | Untrained Cost: 6.687 | Trained Cost: 4.430
Problem 15 | Untrained Cost: 21.255 | Trained Cost: 15.939
Problem 16 | Untrained Cost: 7.447 | Trained Cost: 3.725
Problem 17 | Untrained Cost: 22.003 | Trained Cost: 3.648
Problem 18 | Untrained Cost: 36.93

In [None]:
# Display the action sequences decoded greedily by the trained policy.
actions_trained