In [4]:
%load_ext autoreload
%autoreload 2

import random

import numpy as np
import torch

from rl4co.envs import CVRPTWEnv, EVRPTWEnv 
from rl4co.models import AttentionModelPolicy, REINFORCE, SymNCO, PPO, POMO
from rl4co.utils.trainer import RL4COTrainer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
# Shared setup for the experiment: target device, seeding helper, environments and fixed test batches
device = torch.device("cuda")
import numpy as np

def enforce_reproducibility(seed):
    """Seed every random number generator this notebook may draw from.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and,
    when CUDA is available, every CUDA device), and puts cuDNN into
    deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python's own RNG was previously left unseeded, so any library code that
    # relies on `random` stayed non-deterministic between runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # Trade cuDNN autotuning for repeatable kernel selection.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
# Seed all generators before any environment or test instance is created.
enforce_reproducibility(0)


def _evrptw_generator_params(num_loc, num_station, vehicle_limit, horizon):
    """Return EVRPTW generator settings; only the four arguments vary per scale."""
    return {
        'num_loc': num_loc,
        'num_station': num_station,
        'vehicle_limit': vehicle_limit,
        'vehicle_speed': 5,
        'max_time': 0.9,
        'horizon': horizon,
        'fuel_consumption_rate': 0.2,
        'inverse_recharge_rate': 0.2,
    }


# One environment per problem scale (num_loc, num_station, vehicle_limit, horizon).
env_10 = EVRPTWEnv(generator_params=_evrptw_generator_params(10, 3, 3, 1.1))

env_20 = EVRPTWEnv(generator_params=_evrptw_generator_params(20, 3, 3, 1.1))

env_50 = EVRPTWEnv(generator_params=_evrptw_generator_params(50, 6, 6, 1.2))

env_100 = EVRPTWEnv(generator_params=_evrptw_generator_params(100, 12, 12, 1.2))

# Fixed batches of 100 instances per scale, reused by every evaluation cell below.
td_10_TEST = env_10.reset(batch_size=[100]).to(device)
td_20_TEST = env_20.reset(batch_size=[100]).to(device)
td_50_TEST = env_50.reset(batch_size=[100]).to(device)
td_100_TEST = env_100.reset(batch_size=[100]).to(device)

NameError: name 'torch' is not defined

In [6]:
# Policy: encoder-decoder neural network built for the 10-location environment
policy = AttentionModelPolicy(
    env_name=env_10.name, embed_dim=256, num_encoder_layers=3, num_heads=8
)

# Training wrapper: POMO on env_10; dataset sizes are expressed in batches of 512.
samples_per_batch = 512
optimizer_settings = {"lr": 1e-4, "weight_decay": 1e-6}
model_10 = POMO(
    env_10,
    policy,
    batch_size=samples_per_batch,
    train_data_size=samples_per_batch * 200,
    val_data_size=samples_per_batch * 50,
    optimizer_kwargs=optimizer_settings,
)

C:\Users\shanh\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\utilities\parsing.py:199: Attribute 'env' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['env'])`.
C:\Users\shanh\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\utilities\parsing.py:199: Attribute 'policy' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['policy'])`.


In [7]:
# Greedy rollouts over untrained policy (baseline before any training)
device = torch.device("cuda")

policy = policy.to(device)

# Evaluation only needs forward passes: disabling autograd avoids keeping the
# computation graphs of all four rollouts alive on the GPU.
# NOTE(review): the policy keeps whatever train/eval mode it currently has; if it
# contains mode-dependent layers (dropout, batch norm) consider switching to
# eval mode here and restoring the previous mode afterwards — confirm.
with torch.no_grad():
    out_10 = policy(td_10_TEST.clone(), env=env_10, phase="test", decode_type="greedy", return_actions=True)
    out_20 = policy(td_20_TEST.clone(), env=env_20, phase="test", decode_type="greedy", return_actions=True)
    out_50 = policy(td_50_TEST.clone(), env=env_50, phase="test", decode_type="greedy", return_actions=True)
    out_100 = policy(td_100_TEST.clone(), env=env_100, phase="test", decode_type="greedy", return_actions=True)

rewards_untrained_10 = out_10['reward'].cpu().numpy()
rewards_untrained_20 = out_20['reward'].cpu().numpy()
rewards_untrained_50 = out_50['reward'].cpu().numpy()
rewards_untrained_100 = out_100['reward'].cpu().numpy()
rewards = [rewards_untrained_10, rewards_untrained_20, rewards_untrained_50, rewards_untrained_100]

scale = [10, 20, 50, 100]
# Cost is the negated reward. ".6f" spells out the six decimals that the former
# "3f" spec (minimum width 3, default precision) printed anyway.
for num_loc, reward in zip(scale, rewards):
    print(f"Scale: {num_loc} | Mean Untrained Test Cost: {-reward.mean():.6f}")

  out = self.sdpa_fn(


Scale: 10 | Mean Untrained Test Cost: 13.126696
Scale: 20 | Mean Untrained Test Cost: 27.022886
Scale: 50 | Mean Untrained Test Cost: 52.758595
Scale: 100 | Mean Untrained Test Cost: 94.450882


In [8]:
# Drop the rollout outputs and release cached, unused CUDA memory before training starts.
del out_10, out_20, out_50, out_100
torch.cuda.empty_cache()

In [9]:
# Training stage 1: fit model_10 for 10 epochs on one GPU (logger left at None).
trainer = RL4COTrainer(max_epochs=10, accelerator="gpu", devices=1, logger=None)
trainer.fit(model_10)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params
--------------------------------------------------
0 | env      | EVRPTWEnv            | 0     
1 | policy   | AttentionModelPolicy | 2.0 M 
2 | baseline | SharedBaseline       | 0     
--------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.914     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

C:\Users\shanh\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
C:\Users\shanh\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [10]:
# Greedy rollouts over the policy after the first training stage
device = torch.device("cuda")

policy = policy.to(device)

# Evaluation only needs forward passes: disabling autograd avoids keeping the
# computation graphs of all four rollouts alive on the GPU.
# NOTE(review): the policy keeps whatever train/eval mode the trainer left it in;
# if it contains mode-dependent layers (dropout, batch norm) consider switching
# to eval mode here and restoring the previous mode afterwards — confirm.
with torch.no_grad():
    out_10 = policy(td_10_TEST.clone(), env=env_10, phase="test", decode_type="greedy", return_actions=True)
    out_20 = policy(td_20_TEST.clone(), env=env_20, phase="test", decode_type="greedy", return_actions=True)
    out_50 = policy(td_50_TEST.clone(), env=env_50, phase="test", decode_type="greedy", return_actions=True)
    out_100 = policy(td_100_TEST.clone(), env=env_100, phase="test", decode_type="greedy", return_actions=True)

rewards_trained_10 = out_10['reward'].cpu().numpy()
rewards_trained_20 = out_20['reward'].cpu().numpy()
rewards_trained_50 = out_50['reward'].cpu().numpy()
rewards_trained_100 = out_100['reward'].cpu().numpy()
rewards_trained = [rewards_trained_10, rewards_trained_20, rewards_trained_50, rewards_trained_100]

scale = [10, 20, 50, 100]
print("Trained 10 Epochs with Environment of C=10, S=3, EV=3")
# Cost is the negated reward. ".6f" spells out the six decimals that the former
# "3f" spec (minimum width 3, default precision) printed anyway.
for num_loc, reward in zip(scale, rewards_trained):
    print(f"Scale: {num_loc} | Mean Trained Test Cost: {-reward.mean():.6f}")

Trained 10 Epochs with Environment of C=10, S=3, EV=3
Scale: 10 | Mean Trained Test Cost: 5.365315
Scale: 20 | Mean Trained Test Cost: 9.085166
Scale: 50 | Mean Trained Test Cost: 20.450037
Scale: 100 | Mean Trained Test Cost: 38.277969


In [11]:
# Drop the rollout outputs and release cached, unused CUDA memory before the next training stage.
del out_10, out_20, out_50, out_100
torch.cuda.empty_cache()

In [12]:
# Training stage 2: a fresh trainer continues fitting the same model_10 for 10 more epochs.
trainer_STEP2 = RL4COTrainer(max_epochs=10, accelerator="gpu", devices=1, logger=None)
trainer_STEP2.fit(model_10)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params
--------------------------------------------------
0 | env      | EVRPTWEnv            | 0     
1 | policy   | AttentionModelPolicy | 2.0 M 
2 | baseline | SharedBaseline       | 0     
--------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.914     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [13]:
# Greedy rollouts over the policy after the second training stage
device = torch.device("cuda")

policy = policy.to(device)

# Evaluation only needs forward passes: disabling autograd avoids keeping the
# computation graphs of all four rollouts alive on the GPU.
# NOTE(review): the policy keeps whatever train/eval mode the trainer left it in;
# if it contains mode-dependent layers (dropout, batch norm) consider switching
# to eval mode here and restoring the previous mode afterwards — confirm.
with torch.no_grad():
    out_10 = policy(td_10_TEST.clone(), env=env_10, phase="test", decode_type="greedy", return_actions=True)
    out_20 = policy(td_20_TEST.clone(), env=env_20, phase="test", decode_type="greedy", return_actions=True)
    out_50 = policy(td_50_TEST.clone(), env=env_50, phase="test", decode_type="greedy", return_actions=True)
    out_100 = policy(td_100_TEST.clone(), env=env_100, phase="test", decode_type="greedy", return_actions=True)

rewards_trained_10 = out_10['reward'].cpu().numpy()
rewards_trained_20 = out_20['reward'].cpu().numpy()
rewards_trained_50 = out_50['reward'].cpu().numpy()
rewards_trained_100 = out_100['reward'].cpu().numpy()
rewards_trained = [rewards_trained_10, rewards_trained_20, rewards_trained_50, rewards_trained_100]

scale = [10, 20, 50, 100]
print("Trained 20 Epochs with Environment of C=10, S=3, EV=3")
# Cost is the negated reward. ".6f" spells out the six decimals that the former
# "3f" spec (minimum width 3, default precision) printed anyway.
for num_loc, reward in zip(scale, rewards_trained):
    print(f"Scale: {num_loc} | Mean Trained Test Cost: {-reward.mean():.6f}")

Trained 20 Epochs with Environment of C=10, S=3, EV=3
Scale: 10 | Mean Trained Test Cost: 5.322504
Scale: 20 | Mean Trained Test Cost: 9.078220
Scale: 50 | Mean Trained Test Cost: 20.457903
Scale: 100 | Mean Trained Test Cost: 38.042450


In [14]:
# Drop the rollout outputs and release cached, unused CUDA memory before the next training stage.
del out_10, out_20, out_50, out_100
torch.cuda.empty_cache()

In [15]:
# Training stage 3: a fresh trainer continues fitting the same model_10 for 10 more epochs.
trainer_STEP3 = RL4COTrainer(max_epochs=10, accelerator="gpu", devices=1, logger=None)
trainer_STEP3.fit(model_10)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params
--------------------------------------------------
0 | env      | EVRPTWEnv            | 0     
1 | policy   | AttentionModelPolicy | 2.0 M 
2 | baseline | SharedBaseline       | 0     
--------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.914     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [18]:
# Greedy rollouts over the policy after the third training stage
device = torch.device("cuda")

policy = policy.to(device)

# Evaluation only needs forward passes: disabling autograd avoids keeping the
# computation graphs of all four rollouts alive on the GPU.
# NOTE(review): the policy keeps whatever train/eval mode the trainer left it in;
# if it contains mode-dependent layers (dropout, batch norm) consider switching
# to eval mode here and restoring the previous mode afterwards — confirm.
with torch.no_grad():
    out_10 = policy(td_10_TEST.clone(), env=env_10, phase="test", decode_type="greedy", return_actions=True)
    out_20 = policy(td_20_TEST.clone(), env=env_20, phase="test", decode_type="greedy", return_actions=True)
    out_50 = policy(td_50_TEST.clone(), env=env_50, phase="test", decode_type="greedy", return_actions=True)
    out_100 = policy(td_100_TEST.clone(), env=env_100, phase="test", decode_type="greedy", return_actions=True)

rewards_trained_10 = out_10['reward'].cpu().numpy()
rewards_trained_20 = out_20['reward'].cpu().numpy()
rewards_trained_50 = out_50['reward'].cpu().numpy()
rewards_trained_100 = out_100['reward'].cpu().numpy()
rewards_trained = [rewards_trained_10, rewards_trained_20, rewards_trained_50, rewards_trained_100]

scale = [10, 20, 50, 100]
print("Trained 30 Epochs with Environment of C=10, S=3, EV=3")
# Cost is the negated reward. ".6f" spells out the six decimals that the former
# "3f" spec (minimum width 3, default precision) printed anyway.
for num_loc, reward in zip(scale, rewards_trained):
    print(f"Scale: {num_loc} | Mean Trained Test Cost: {-reward.mean():.6f}")

Trained 30 Epochs with Environment of C=10, S=3, EV=3
Scale: 10 | Mean Trained Test Cost: 5.310370
Scale: 20 | Mean Trained Test Cost: 8.988823
Scale: 50 | Mean Trained Test Cost: 20.321991
Scale: 100 | Mean Trained Test Cost: 37.930580


In [19]:
# Drop the rollout outputs and release cached, unused CUDA memory before the next training stage.
del out_10, out_20, out_50, out_100
torch.cuda.empty_cache()

In [20]:
# Training stage 4: a fresh trainer continues fitting the same model_10 for up to 100 more epochs.
trainer_STEP4 = RL4COTrainer(max_epochs=100, accelerator="gpu", devices=1, logger=None)
trainer_STEP4.fit(model_10)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params
--------------------------------------------------
0 | env      | EVRPTWEnv            | 0     
1 | policy   | AttentionModelPolicy | 2.0 M 
2 | baseline | SharedBaseline       | 0     
--------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.914     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

C:\Users\shanh\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [21]:
# Greedy rollouts over the policy after the fourth (manually interrupted) training stage
device = torch.device("cuda")

policy = policy.to(device)

# Evaluation only needs forward passes: disabling autograd avoids keeping the
# computation graphs of all four rollouts alive on the GPU.
# NOTE(review): the policy keeps whatever train/eval mode the trainer left it in;
# if it contains mode-dependent layers (dropout, batch norm) consider switching
# to eval mode here and restoring the previous mode afterwards — confirm.
with torch.no_grad():
    out_10 = policy(td_10_TEST.clone(), env=env_10, phase="test", decode_type="greedy", return_actions=True)
    out_20 = policy(td_20_TEST.clone(), env=env_20, phase="test", decode_type="greedy", return_actions=True)
    out_50 = policy(td_50_TEST.clone(), env=env_50, phase="test", decode_type="greedy", return_actions=True)
    out_100 = policy(td_100_TEST.clone(), env=env_100, phase="test", decode_type="greedy", return_actions=True)

rewards_trained_10 = out_10['reward'].cpu().numpy()
rewards_trained_20 = out_20['reward'].cpu().numpy()
rewards_trained_50 = out_50['reward'].cpu().numpy()
rewards_trained_100 = out_100['reward'].cpu().numpy()
rewards_trained = [rewards_trained_10, rewards_trained_20, rewards_trained_50, rewards_trained_100]

scale = [10, 20, 50, 100]
# NOTE(review): the epoch count in this banner is hard-coded; the preceding fit was
# stopped with a KeyboardInterrupt, so verify the number against the actual run.
print("Trained 45 Epochs with Environment of C=10, S=3, EV=3")
# Cost is the negated reward. ".6f" spells out the six decimals that the former
# "3f" spec (minimum width 3, default precision) printed anyway.
for num_loc, reward in zip(scale, rewards_trained):
    print(f"Scale: {num_loc} | Mean Trained Test Cost: {-reward.mean():.6f}")

Trained 45 Epochs with Environment of C=10, S=3, EV=3
Scale: 10 | Mean Trained Test Cost: 5.273592
Scale: 20 | Mean Trained Test Cost: 9.000785
Scale: 50 | Mean Trained Test Cost: 20.040207
Scale: 100 | Mean Trained Test Cost: 37.399662
