In [1]:
%load_ext autoreload
%autoreload 2

import torch
import numpy as np

from rl4co.envs import CVRPTWEnv, EVRPTWEnv 
from rl4co.models import AttentionModelPolicy, REINFORCE, SymNCO, PPO, POMO, RewardConstrainedPOMO
from rl4co.utils.trainer import RL4COTrainer
from rl4co.utils.callbacks.reward_check import RewardLoggingCallback, get_reward_and_check

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import rl4co
print(rl4co.__file__)

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\rl4co\__init__.py


In [3]:
import sys
# sys.path.remove(r"c:\users\hyosi\onedrive\ut\2024 fall\mie1666\project\code\rl4evrptw\rl4co")

for path in sys.path:
    print(path)

c:\Users\hyosi\anaconda3\envs\rl4co\python311.zip
c:\Users\hyosi\anaconda3\envs\rl4co\DLLs
c:\Users\hyosi\anaconda3\envs\rl4co\Lib
c:\Users\hyosi\anaconda3\envs\rl4co

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\win32
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\win32\lib
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\Pythonwin
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\setuptools\_vendor


In [4]:
device = torch.device("cuda")
def enforce_reproducibility(seed):
    import random
    import os 
    os.environ['PYTHONHASHSEED'] = str(seed)
    # NVIDIA's CUDA Basic Linear Algebra Subroutines library
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        
vehicle_capacity = 1.25

# [num_loc, num_station, num_ev]
settings =[[10, 3, 3], [20, 3, 3], [50, 6, 6], [100, 12,12]]
hard_envs = []
td_tests = []   # Hard env setting for test (cf. get_action_mask() is different)
for num_loc, num_station, num_ev in settings:
    enforce_reproducibility(0)
    env = EVRPTWEnv(generator_params={'num_loc': num_loc, 
                                        'num_station': num_station,
                                        'vehicle_limit': num_ev,
                                        'vehicle_speed': 5,
                                        'vehicle_capacity': vehicle_capacity,
                                        'max_time': 1,
                                        'horizon': 1,
                                        'fuel_consumption_rate': 0.25,
                                        'inverse_recharge_rate': 0.25})
    hard_envs.append(env)
    td_init = env.reset(batch_size=[100]).to(device)
    td_tests.append(td_init)

soft_envs = []
for num_loc, num_station, num_ev in settings:
    enforce_reproducibility(0)
    env = EVRPTWEnv(generator_params={'num_loc': num_loc, 
                                        'num_station': num_station,
                                        'vehicle_limit': num_ev,
                                        'vehicle_speed': 5,
                                        'vehicle_capacity': vehicle_capacity,
                                        'max_time': 1,
                                        'horizon': 1,
                                        'fuel_consumption_rate': 0.25,
                                        'inverse_recharge_rate': 0.25})
    env.soft = True ## Soft setting
    soft_envs.append(env)




In [5]:
MAX_EPOCH = 50
BATCH_SIZE = 512
TRAIN_DATA_SIZE = BATCH_SIZE * 200
VAL_DATA_SIZE = BATCH_SIZE * 50
# MAX_EPOCH = 2
# BATCH_SIZE = 10
# TRAIN_DATA_SIZE = BATCH_SIZE * 1
# VAL_DATA_SIZE = BATCH_SIZE * 1

# POMO
policy1 = AttentionModelPolicy(env_name=soft_envs[0].name,
                              embed_dim=256,
                              num_encoder_layers=6,
                              num_heads=8,)

policy2 = AttentionModelPolicy(env_name=soft_envs[1].name,
                              embed_dim=256,
                              num_encoder_layers=6,
                              num_heads=8,)

policy5 = AttentionModelPolicy(env_name=soft_envs[2].name,
                              embed_dim=256,
                              num_encoder_layers=6,
                              num_heads=8,)

model_10 = POMO(soft_envs[0],
                policy1,
                 # baseline="rollout",
                batch_size=BATCH_SIZE,
                train_data_size=TRAIN_DATA_SIZE,
                val_data_size=VAL_DATA_SIZE,
                optimizer_kwargs={"lr": 1e-4, 
                                  "weight_decay": 1e-6})

model_20 = POMO(soft_envs[1],
                policy2,
                # baseline="rollout",
                batch_size=BATCH_SIZE,
                train_data_size=TRAIN_DATA_SIZE,
                val_data_size=VAL_DATA_SIZE,
                optimizer_kwargs={"lr": 1e-4, 
                                "weight_decay": 1e-6})

model_50 = POMO(soft_envs[2],
                policy5,
                # baseline="rollout",
                batch_size=BATCH_SIZE,
                train_data_size=TRAIN_DATA_SIZE,
                val_data_size=VAL_DATA_SIZE,
                optimizer_kwargs={"lr": 1e-4, 
                                "weight_decay": 1e-6})


# RCPOMO
policy_c1 = AttentionModelPolicy(env_name=soft_envs[0].name,
                              embed_dim=256,
                              num_encoder_layers=6,
                              num_heads=8,)

policy_c2 = AttentionModelPolicy(env_name=soft_envs[1].name,
                              embed_dim=256,
                              num_encoder_layers=6,
                              num_heads=8,)

policy_c5 = AttentionModelPolicy(env_name=soft_envs[2].name,
                              embed_dim=256,
                              num_encoder_layers=6,
                              num_heads=8,)

model_c10 = RewardConstrainedPOMO(soft_envs[0],
                policy_c1,
                 # baseline="rollout",
                batch_size=BATCH_SIZE,
                train_data_size=TRAIN_DATA_SIZE,
                val_data_size=VAL_DATA_SIZE,
                optimizer_kwargs={"lr": 1e-4, 
                                  "weight_decay": 1e-6})

model_c20 = RewardConstrainedPOMO(soft_envs[1],
                policy_c2,
                # baseline="rollout",
                batch_size=BATCH_SIZE,
                train_data_size=TRAIN_DATA_SIZE,
                val_data_size=VAL_DATA_SIZE,
                optimizer_kwargs={"lr": 1e-4, 
                                "weight_decay": 1e-6})

model_c50 = RewardConstrainedPOMO(soft_envs[2],
                policy_c5,
                # baseline="rollout",
                batch_size=BATCH_SIZE,
                train_data_size=TRAIN_DATA_SIZE,
                val_data_size=VAL_DATA_SIZE,
                optimizer_kwargs={"lr": 1e-4, 
                                "weight_decay": 1e-6})

# RINFORCE
policy_r1 = AttentionModelPolicy(env_name=soft_envs[0].name,
                              embed_dim=256,
                              num_encoder_layers=6,
                              num_heads=8,)

policy_r2 = AttentionModelPolicy(env_name=soft_envs[1].name,
                              embed_dim=256,
                              num_encoder_layers=6,
                              num_heads=8,)

policy_r5 = AttentionModelPolicy(env_name=soft_envs[2].name,
                              embed_dim=256,
                              num_encoder_layers=6,
                              num_heads=8,)

model_r10 = REINFORCE(soft_envs[0],
                policy_r1,
                baseline="rollout",
                batch_size=BATCH_SIZE,
                train_data_size=TRAIN_DATA_SIZE,
                val_data_size=VAL_DATA_SIZE,
                optimizer_kwargs={"lr": 1e-4, 
                                  "weight_decay": 1e-6})

model_r20 = REINFORCE(soft_envs[1],
                policy_r2,
                baseline="rollout",
                batch_size=BATCH_SIZE,
                train_data_size=TRAIN_DATA_SIZE,
                val_data_size=VAL_DATA_SIZE,
                optimizer_kwargs={"lr": 1e-4, 
                                "weight_decay": 1e-6})

model_r50 = REINFORCE(soft_envs[2],
                policy_r5,
                baseline="rollout",
                batch_size=BATCH_SIZE,
                train_data_size=TRAIN_DATA_SIZE,
                val_data_size=VAL_DATA_SIZE,
                optimizer_kwargs={"lr": 1e-4, 
                                "weight_decay": 1e-6})


c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'env' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['env'])`.
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'policy' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['policy'])`.


In [6]:
scale = [10, 20, 50, 100]

# POMO
trainer_STEP1 = RL4COTrainer(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
    callbacks=[
        RewardLoggingCallback(
            policy=policy1.to(device),
            test_data=td_tests,
            env_scale=hard_envs,
            scale = scale,
            log_dir="logs",  # Need to set the logs folder or else
            file_name="SOFT_POMO_C10"
        )
    ]
)
trainer_STEP1.fit(model_10)

# RCPOMO
trainer_C_STEP1 = RL4COTrainer(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c1.to(device),
            test_data=td_tests,
            env_scale=hard_envs,
            scale = scale,
            log_dir="logs",  # Need to set the logs folder or else
            file_name="SOFT_RCPOMO_C10"
        )
    ]
)
trainer_C_STEP1.fit(model_c10)

# REINFORCE
trainer_R_STEP1 = RL4COTrainer(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
    callbacks=[
        RewardLoggingCallback(
            policy=policy_r1.to(device),
            test_data=td_tests,
            env_scale=hard_envs,
            scale = scale,
            log_dir="logs",  # Need to set the logs folder or else
            file_name="SOFT_REINFORCE_C10"
        )
    ]
)
trainer_R_STEP1.fit(model_r10)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


                                                                           

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 200/200 [00:22<00:00,  8.72it/s, v_num=540, train/reward=-3.41, train/loss=-0.17, val/reward=-3.34]Callback is called
Callback is finished
Epoch 1: 100%|██████████| 200/200 [00:19<00:00, 10.14it/s, v_num=540, train/reward=-3.33, train/loss=-0.12, val/reward=-3.30] Callback is called
Callback is finished
Epoch 2: 100%|██████████| 200/200 [00:19<00:00, 10.27it/s, v_num=540, train/reward=-3.30, train/loss=-0.10, val/reward=-3.29]  Callback is called
Callback is finished
Epoch 2: 100%|██████████| 200/200 [00:20<00:00,  9.87it/s, v_num=540, train/reward=-3.30, train/loss=-0.10, val/reward=-3.29]

  epoch_data[f"C{s}_mean_reward"] = -rewards_trained[i].mean()
  ret = ret.dtype.type(ret / rcount)


Epoch 3: 100%|██████████| 200/200 [00:19<00:00, 10.36it/s, v_num=540, train/reward=-3.31, train/loss=-0.0815, val/reward=-3.29]Callback is called
Callback is finished
Epoch 4: 100%|██████████| 200/200 [00:19<00:00, 10.38it/s, v_num=540, train/reward=-3.31, train/loss=-0.0955, val/reward=-3.28]Callback is called
Callback is finished
Epoch 5: 100%|██████████| 200/200 [00:19<00:00, 10.38it/s, v_num=540, train/reward=-3.31, train/loss=-0.0768, val/reward=-3.28]Callback is called
Callback is finished
Epoch 6: 100%|██████████| 200/200 [00:19<00:00, 10.35it/s, v_num=540, train/reward=-3.28, train/loss=-0.0702, val/reward=-3.28]Callback is called
Callback is finished
Epoch 7: 100%|██████████| 200/200 [00:19<00:00, 10.36it/s, v_num=540, train/reward=-3.30, train/loss=-0.0661, val/reward=-3.28]Callback is called
Callback is finished
Epoch 8: 100%|██████████| 200/200 [00:19<00:00, 10.35it/s, v_num=540, train/reward=-3.30, train/loss=-0.0707, val/reward=-3.28]Callback is called
Callback is finishe

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [00:19<00:00, 10.31it/s, v_num=540, train/reward=-3.25, train/loss=-0.0362, val/reward=-3.26]

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs





val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [00:24<00:00,  8.25it/s, v_num=541, train/reward=-3.83, train/loss=-0.369, val/reward=-3.66]Callback is called
Callback is finished
Epoch 1: 100%|██████████| 200/200 [00:23<00:00,  8.47it/s, v_num=541, train/reward=-3.92, train/loss=-0.312, val/reward=-3.83]Callback is called
Callback is finished
Epoch 2: 100%|██████████| 200/200 [00:23<00:00,  8.59it/s, v_num=541, train/reward=-4.30, train/loss=-1.37, val/reward=-3.77] Callback is called
Callback is finished
Epoch 3: 100%|██████████| 200/200 [00:25<00:00,  8.00it/s, v_num=541, train/reward=-4.12, train/loss=-1.31, val/reward=-3.77] Callback is called
Callback is finished
Epoch 4: 100%|██████████| 200/200 [00:24<00:00,  8.24it/s, v_num=541, train/reward=-3.70, train/loss=-0.223, val/reward=-3.68]Callback is called
Callback is finished
Epoch 5: 100%|██████████| 200/200 [00:22<00:00,  8.73it/s, v_num=541, train/reward=-3.95, train/loss=-0.667, val/reward=-3.63]Callback is called
Callback is finished
Epoc

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [00:21<00:00,  9.38it/s, v_num=541, train/reward=-3.55, train/loss=-0.0907, val/reward=-3.48]

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs





val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | WarmupBaseline       | 3.6 M  | train
----------------------------------------------------------
7.1 M     Trainable params
0         Non-trainable params
7.1 M     Total params
28.482    Total estimated model params size (MB)
128       Modules in train mode
124       Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [00:18<00:00, 10.93it/s, v_num=542, train/reward=-3.26, train/loss=-0.144, val/reward=-3.15]Callback is called
Callback is finished
Epoch 1: 100%|██████████| 200/200 [00:15<00:00, 13.32it/s, v_num=542, train/reward=-3.11, train/loss=-0.00957, val/reward=-3.07]Callback is called
Callback is finished
Epoch 2: 100%|██████████| 200/200 [00:15<00:00, 12.97it/s, v_num=542, train/reward=-3.05, train/loss=-0.116, val/reward=-3.06]  Callback is called
Callback is finished
Epoch 3: 100%|██████████| 200/200 [00:15<00:00, 13.07it/s, v_num=542, train/reward=-3.01, train/loss=-0.0328, val/reward=-3.04]Callback is called
Callback is finished
Epoch 4: 100%|██████████| 200/200 [00:15<00:00, 13.10it/s, v_num=542, train/reward=-3.01, train/loss=-0.0396, val/reward=-3.04]Callback is called
Callback is finished
Epoch 5: 100%|██████████| 200/200 [00:14<00:00, 14.21it/s, v_num=542, train/reward=-3.07, train/loss=-0.0815, val/reward=-3.04]Callback is called
Callback is finish

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [00:16<00:00, 12.35it/s, v_num=542, train/reward=-3.00, train/loss=-0.0387, val/reward=-3.01]


In [7]:
policy1 = policy1.to(device)
rewards_trained, num_valid = get_reward_and_check(policy1, td_tests, hard_envs)
# print(rewards_trained)
print("POMO: Trained with Environment of C=10, S=3, EV=3")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_valid[i]} | Mean Trained Test Cost: {-rewards_trained[i].mean():3f}")

policy_c1 = policy_c1.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c1, td_tests, hard_envs)
print("\nRCPOMO: Trained with Environment of C=10, S=3, EV=3")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_c_valid[i]} | Mean Trained Test Cost: {-rewards_c_trained[i].mean():3f}")
    
policy_r1 = policy_r1.to(device)
rewards_r_trained, num_r_valid = get_reward_and_check(policy_r1, td_tests, hard_envs)
print("\nREINFORCE: Trained with Environment of C=10, S=3, EV=3")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_r_valid[i]} | Mean Trained Test Cost: {-rewards_r_trained[i].mean():3f}")

POMO: Trained with Environment of C=10, S=3, EV=3
Scale: 10 | FeasibleCounts: 81 | Mean Trained Test Cost: 5.261562
Scale: 20 | FeasibleCounts: 19 | Mean Trained Test Cost: 7.248013
Scale: 50 | FeasibleCounts: 12 | Mean Trained Test Cost: 14.381671
Scale: 100 | FeasibleCounts: 42 | Mean Trained Test Cost: 27.189011

RCPOMO: Trained with Environment of C=10, S=3, EV=3
Scale: 10 | FeasibleCounts: 88 | Mean Trained Test Cost: 6.049044
Scale: 20 | FeasibleCounts: 33 | Mean Trained Test Cost: 7.983972
Scale: 50 | FeasibleCounts: 36 | Mean Trained Test Cost: 14.360809
Scale: 100 | FeasibleCounts: 59 | Mean Trained Test Cost: 27.460426

REINFORCE: Trained with Environment of C=10, S=3, EV=3
Scale: 10 | FeasibleCounts: 89 | Mean Trained Test Cost: 4.886317
Scale: 20 | FeasibleCounts: 32 | Mean Trained Test Cost: 6.861988
Scale: 50 | FeasibleCounts: 23 | Mean Trained Test Cost: 13.610734
Scale: 100 | FeasibleCounts: 59 | Mean Trained Test Cost: 24.440115


In [8]:
del trainer_STEP1, trainer_C_STEP1, trainer_R_STEP1
del rewards_trained, rewards_c_trained, rewards_r_trained, num_valid, num_c_valid, num_r_valid
torch.cuda.empty_cache()

In [9]:

# POMO
trainer_STEP2 = RL4COTrainer(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
    callbacks=[
        RewardLoggingCallback(
            policy=policy2.to(device),
            test_data=td_tests,
            env_scale=hard_envs,
            scale = scale,
            log_dir="logs",  # Need to set the logs folder or else
            file_name="SOFT_POMO_C20"
        )
    ]
)
trainer_STEP2.fit(model_20)

# RCPOMO
trainer_C_STEP2 = RL4COTrainer(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c2.to(device),
            test_data=td_tests,
            env_scale=hard_envs,
            scale = scale,
            log_dir="logs",  # Need to set the logs folder or else
            file_name="SOFT_RCPOMO_C20"
        )
    ]
)
trainer_C_STEP2.fit(model_c20)

# REINFORCE
trainer_R_STEP2 = RL4COTrainer(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
    callbacks=[
        RewardLoggingCallback(
            policy=policy_r2.to(device),
            test_data=td_tests,
            env_scale=hard_envs,
            scale = scale,
            log_dir="logs",  # Need to set the logs folder or else
            file_name="SOFT_REINFORCE_C20"
        )
    ]
)
trainer_R_STEP2.fit(model_r20)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [00:37<00:00,  5.34it/s, v_num=543, train/reward=-4.70, train/loss=-0.54, val/reward=-4.56]Callback is called
Callback is finished
Epoch 1: 100%|██████████| 200/200 [00:33<00:00,  6.04it/s, v_num=543, train/reward=-4.54, train/loss=-0.312, val/reward=-4.46]Callback is called
Callback is finished
Epoch 2: 100%|██████████| 200/200 [00:32<00:00,  6.09it/s, v_num=543, train/reward=-4.47, train/loss=-0.267, val/reward=-4.43]Callback is called
Callback is finished
Epoch 3: 100%|██████████| 200/200 [00:32<00:00,  6.15it/s, v_num=543, train/reward=-4.45, train/loss=-0.251, val/reward=-4.42]Callback is called
Callback is finished
Epoch 4: 100%|██████████| 200/200 [00:32<00:00,  6.08it/s, v_num=543, train/reward=-4.46, train/loss=-0.257, val/reward=-4.41]Callback is called
Callback is finished
Epoch 5: 100%|██████████| 200/200 [00:32<00:00,  6.16it/s, v_num=543, train/reward=-4.41, train/loss=-0.204, val/reward=-4.40]Callback is called
Callback is finished
Epoch

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [00:33<00:00,  6.06it/s, v_num=543, train/reward=-4.36, train/loss=-0.121, val/reward=-4.33]

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True





TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [00:37<00:00,  5.35it/s, v_num=544, train/reward=-6.77, train/loss=-3.09, val/reward=-6.25]Callback is called
Callback is finished
Epoch 1: 100%|██████████| 200/200 [00:36<00:00,  5.44it/s, v_num=544, train/reward=-6.73, train/loss=-2.51, val/reward=-6.11]Callback is called
Callback is finished
Epoch 2: 100%|██████████| 200/200 [00:34<00:00,  5.78it/s, v_num=544, train/reward=-6.32, train/loss=-1.53, val/reward=-5.92] Callback is called
Callback is finished
Epoch 3: 100%|██████████| 200/200 [00:34<00:00,  5.86it/s, v_num=544, train/reward=-6.07, train/loss=-0.952, val/reward=-5.85]Callback is called
Callback is finished
Epoch 4: 100%|██████████| 200/200 [00:34<00:00,  5.86it/s, v_num=544, train/reward=-5.93, train/loss=-0.824, val/reward=-5.66]Callback is called
Callback is finished
Epoch 5: 100%|██████████| 200/200 [00:34<00:00,  5.82it/s, v_num=544, train/reward=-5.90, train/loss=-0.867, val/reward=-5.70]Callback is called
Callback is finished
Epoch 

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [00:35<00:00,  5.64it/s, v_num=544, train/reward=-5.35, train/loss=-0.312, val/reward=-5.28]

Using 16bit Automatic Mixed Precision (AMP)





GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | WarmupBaseline       | 3.6 M  | train
----------------------------------------------------------
7.1 M     Trainable params
0         Non-trainable params
7.1 M     Total params
28.482    Total estimated model params size (MB)
128       Modules in train mode
124       Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [00:26<00:00,  7.42it/s, v_num=545, train/reward=-4.99, train/loss=19.50, val/reward=-4.64]Callback is called
Callback is finished
Epoch 1: 100%|██████████| 200/200 [00:24<00:00,  8.26it/s, v_num=545, train/reward=-4.54, train/loss=-0.127, val/reward=-4.35]  Callback is called
Callback is finished
Epoch 2: 100%|██████████| 200/200 [00:23<00:00,  8.38it/s, v_num=545, train/reward=-4.36, train/loss=1.300, val/reward=-4.24]     Callback is called
Callback is finished
Epoch 3: 100%|██████████| 200/200 [00:23<00:00,  8.58it/s, v_num=545, train/reward=-4.28, train/loss=-0.559, val/reward=-4.16]  Callback is called
Callback is finished
Epoch 4: 100%|██████████| 200/200 [00:23<00:00,  8.59it/s, v_num=545, train/reward=-4.20, train/loss=-0.336, val/reward=-4.14]  Callback is called
Callback is finished
Epoch 5: 100%|██████████| 200/200 [00:23<00:00,  8.59it/s, v_num=545, train/reward=-4.16, train/loss=-0.368, val/reward=-4.12] Callback is called
Callback is fin

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [00:27<00:00,  7.41it/s, v_num=545, train/reward=-4.05, train/loss=0.0999, val/reward=-4.01]


In [17]:
policy2 = policy2.to(device)
rewards_trained, num_valid = get_reward_and_check(policy2, td_tests, hard_envs)
# print(rewards_trained)
print("POMO: Trained with Environment of C=20, S=3, EV=3")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_valid[i]} | Mean Trained Test Cost: {-rewards_trained[i].mean():3f}")

policy_c2 = policy_c2.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c2, td_tests, hard_envs)
print("\nRCPOMO: Trained with Environment of C=20, S=3, EV=3")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_c_valid[i]} | Mean Trained Test Cost: {-rewards_c_trained[i].mean():3f}")
    
policy_r2 = policy_r2.to(device)
rewards_r_trained, num_r_valid = get_reward_and_check(policy_r2, td_tests, hard_envs)
print("\nREINFORCE: Trained with Environment of C=20, S=3, EV=3")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_r_valid[i]} | Mean Trained Test Cost: {-rewards_r_trained[i].mean():3f}")

POMO: Trained with Environment of C=20, S=3, EV=3
Scale: 10 | FeasibleCounts: 67 | Mean Trained Test Cost: 5.930059
Scale: 20 | FeasibleCounts: 10 | Mean Trained Test Cost: 8.054955
Scale: 50 | FeasibleCounts: 1 | Mean Trained Test Cost: 14.269993
Scale: 100 | FeasibleCounts: 4 | Mean Trained Test Cost: 31.162970

RCPOMO: Trained with Environment of C=20, S=3, EV=3
Scale: 10 | FeasibleCounts: 52 | Mean Trained Test Cost: 7.404771
Scale: 20 | FeasibleCounts: 11 | Mean Trained Test Cost: 8.119093
Scale: 50 | FeasibleCounts: 5 | Mean Trained Test Cost: 14.787048
Scale: 100 | FeasibleCounts: 13 | Mean Trained Test Cost: 27.750549

REINFORCE: Trained with Environment of C=20, S=3, EV=3
Scale: 10 | FeasibleCounts: 90 | Mean Trained Test Cost: 4.901956
Scale: 20 | FeasibleCounts: 29 | Mean Trained Test Cost: 6.734668
Scale: 50 | FeasibleCounts: 25 | Mean Trained Test Cost: 13.449342
Scale: 100 | FeasibleCounts: 71 | Mean Trained Test Cost: 24.103729


In [11]:
del trainer_STEP2, trainer_C_STEP2, trainer_R_STEP2
del rewards_trained, rewards_c_trained, rewards_r_trained, num_valid, num_c_valid, num_r_valid
torch.cuda.empty_cache()

In [12]:

# POMO
trainer_STEP5 = RL4COTrainer(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
    callbacks=[
        RewardLoggingCallback(
            policy=policy5.to(device),
            test_data=td_tests,
            env_scale=hard_envs,
            scale = scale,
            log_dir="logs",  # Need to set the logs folder or else
            file_name="SOFT_POMO_C50"
        )
    ]
)
trainer_STEP5.fit(model_50)

# RCPOMO
trainer_C_STEP5 = RL4COTrainer(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c5.to(device),
            test_data=td_tests,
            env_scale=hard_envs,
            scale = scale,
            log_dir="logs",  # Need to set the logs folder or else
            file_name="SOFT_RCPOMO_C50"
        )
    ]
)
trainer_C_STEP5.fit(model_c50)

# REINFORCE
trainer_R_STEP5 = RL4COTrainer(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
    callbacks=[
        RewardLoggingCallback(
            policy=policy_r5.to(device),
            test_data=td_tests,
            env_scale=hard_envs,
            scale = scale,
            log_dir="logs",  # Need to set the logs folder or else
            file_name="SOFT_REINFORCE_C50"
        )
    ]
)
trainer_R_STEP5.fit(model_r50)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [01:48<00:00,  1.84it/s, v_num=546, train/reward=-7.85, train/loss=-1.48, val/reward=-7.37]Callback is called
Callback is finished
Epoch 1: 100%|██████████| 200/200 [01:41<00:00,  1.98it/s, v_num=546, train/reward=-7.36, train/loss=-1.56, val/reward=-6.99]Callback is called
Callback is finished
Epoch 2: 100%|██████████| 200/200 [01:46<00:00,  1.87it/s, v_num=546, train/reward=-7.11, train/loss=-0.977, val/reward=-6.84]Callback is called
Callback is finished
Epoch 3: 100%|██████████| 200/200 [01:44<00:00,  1.91it/s, v_num=546, train/reward=-7.04, train/loss=-0.99, val/reward=-6.80] Callback is called
Callback is finished
Epoch 4: 100%|██████████| 200/200 [01:45<00:00,  1.89it/s, v_num=546, train/reward=-7.07, train/loss=-1.31, val/reward=-6.80] Callback is called
Callback is finished
Epoch 5: 100%|██████████| 200/200 [01:41<00:00,  1.97it/s, v_num=546, train/reward=-7.25, train/loss=-1.68, val/reward=-6.86]Callback is called
Callback is finished
Epoch 6

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [01:39<00:00,  2.00it/s, v_num=546, train/reward=-8.21, train/loss=-0.973, val/reward=-7.90]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [01:53<00:00,  1.77it/s, v_num=547, train/reward=-15.0, train/loss=-5.55, val/reward=-12.9]Callback is called
Callback is finished
Epoch 1: 100%|██████████| 200/200 [01:45<00:00,  1.89it/s, v_num=547, train/reward=-12.7, train/loss=-3.45, val/reward=-11.9]Callback is called
Callback is finished
Epoch 2: 100%|██████████| 200/200 [01:48<00:00,  1.84it/s, v_num=547, train/reward=-12.3, train/loss=-3.05, val/reward=-11.7]Callback is called
Callback is finished
Epoch 3: 100%|██████████| 200/200 [01:46<00:00,  1.88it/s, v_num=547, train/reward=-12.0, train/loss=-2.52, val/reward=-11.5]Callback is called
Callback is finished
Epoch 4: 100%|██████████| 200/200 [01:46<00:00,  1.88it/s, v_num=547, train/reward=-12.3, train/loss=-4.04, val/reward=-11.5]Callback is called
Callback is finished
Epoch 5: 100%|██████████| 200/200 [01:45<00:00,  1.89it/s, v_num=547, train/reward=-12.3, train/loss=-5.68, val/reward=-11.3]Callback is called
Callback is finished
Epoch 6: 1

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [01:47<00:00,  1.87it/s, v_num=547, train/reward=-11.1, train/loss=-1.84, val/reward=-10.8]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | WarmupBaseline       | 3.6 M  | train
----------------------------------------------------------
7.1 M     Trainable params
0         Non-trainable params
7.1 M     Total params
28.482    Total estimated model params size (MB)
128       Modules in train mode
124       Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [01:07<00:00,  2.98it/s, v_num=548, train/reward=-8.95, train/loss=-1.93, val/reward=-7.95]Callback is called
Callback is finished
Epoch 1: 100%|██████████| 200/200 [00:51<00:00,  3.87it/s, v_num=548, train/reward=-8.47, train/loss=106.0, val/reward=-7.82]  Callback is called
Callback is finished
Epoch 2: 100%|██████████| 200/200 [00:51<00:00,  3.88it/s, v_num=548, train/reward=-7.56, train/loss=5.960, val/reward=-7.28]  Callback is called
Callback is finished
Epoch 3: 100%|██████████| 200/200 [00:52<00:00,  3.83it/s, v_num=548, train/reward=-7.52, train/loss=6.470, val/reward=-7.26]   Callback is called
Callback is finished
Epoch 4: 100%|██████████| 200/200 [00:51<00:00,  3.87it/s, v_num=548, train/reward=-8.21, train/loss=-9.10, val/reward=-7.45]   Callback is called
Callback is finished
Epoch 5: 100%|██████████| 200/200 [00:51<00:00,  3.89it/s, v_num=548, train/reward=-8.34, train/loss=-11.2, val/reward=-7.57]   Callback is called
Callback is finish

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [00:58<00:00,  3.42it/s, v_num=548, train/reward=-27.1, train/loss=-2.88e+3, val/reward=-26.6]


In [13]:
policy5 = policy5.to(device)
rewards_trained, num_valid = get_reward_and_check(policy5, td_tests, hard_envs)
# print(rewards_trained)
print("POMO: Trained with Environment of C=50, S=6, EV=6")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_valid[i]} | Mean Trained Test Cost: {-rewards_trained[i].mean():3f}")

policy_c5 = policy_c5.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c5, td_tests, hard_envs)
print("\nRCPOMO: Trained with Environment of C=50, S=6, EV=6")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_c_valid[i]} | Mean Trained Test Cost: {-rewards_c_trained[i].mean():3f}")
    
policy_r5 = policy_r5.to(device)
rewards_r_trained, num_r_valid = get_reward_and_check(policy_r5, td_tests, hard_envs)
print("\nREINFORCE: Trained with Environment of C=50, S=6, EV=6")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_r_valid[i]} | Mean Trained Test Cost: {-rewards_r_trained[i].mean():3f}")

POMO: Trained with Environment of C=50, S=6, EV=6
Scale: 10 | FeasibleCounts: 86 | Mean Trained Test Cost: 5.334347
Scale: 20 | FeasibleCounts: 31 | Mean Trained Test Cost: 7.353879
Scale: 50 | FeasibleCounts: 46 | Mean Trained Test Cost: 15.097116
Scale: 100 | FeasibleCounts: 86 | Mean Trained Test Cost: 25.306658

RCPOMO: Trained with Environment of C=50, S=6, EV=6
Scale: 10 | FeasibleCounts: 94 | Mean Trained Test Cost: 5.148530
Scale: 20 | FeasibleCounts: 61 | Mean Trained Test Cost: 6.982335
Scale: 50 | FeasibleCounts: 60 | Mean Trained Test Cost: 13.203002
Scale: 100 | FeasibleCounts: 60 | Mean Trained Test Cost: 26.687469

REINFORCE: Trained with Environment of C=50, S=6, EV=6
Scale: 10 | FeasibleCounts: 75 | Mean Trained Test Cost: 6.496563
Scale: 20 | FeasibleCounts: 4 | Mean Trained Test Cost: 9.495222
Scale: 50 | FeasibleCounts: 0 | Mean Trained Test Cost: nan
Scale: 100 | FeasibleCounts: 0 | Mean Trained Test Cost: nan


  print(f"Scale: {s} | FeasibleCounts: {num_r_valid[i]} | Mean Trained Test Cost: {-rewards_r_trained[i].mean():3f}")


In [14]:
MAX_EPOCH = 50
BATCH_SIZE_100 = 128
TRAIN_DATA_SIZE_100 = BATCH_SIZE_100 * 200
VAL_DATA_SIZE_100 = BATCH_SIZE_100 * 50

# POMO
policy100 = AttentionModelPolicy(env_name=soft_envs[3].name,
                              embed_dim=256,
                              num_encoder_layers=6,
                              num_heads=8,)

model_100 = POMO(soft_envs[3],
                policy100,
                # baseline="rollout",
                batch_size=BATCH_SIZE_100,
                train_data_size=TRAIN_DATA_SIZE_100,
                val_data_size=VAL_DATA_SIZE_100,
                optimizer_kwargs={"lr": 1e-4, 
                                "weight_decay": 1e-6})


# RCPOMO
policy_c100 = AttentionModelPolicy(env_name=soft_envs[3].name,
                              embed_dim=256,
                              num_encoder_layers=6,
                              num_heads=8,)

model_c100 = RewardConstrainedPOMO(soft_envs[3],
                policy_c100,
                # baseline="rollout",
                batch_size=BATCH_SIZE_100,
                train_data_size=TRAIN_DATA_SIZE_100,
                val_data_size=VAL_DATA_SIZE_100,
                optimizer_kwargs={"lr": 1e-4, 
                                "weight_decay": 1e-6})


# REINFORCE
policy_r100 = AttentionModelPolicy(env_name=soft_envs[3].name,
                              embed_dim=256,
                              num_encoder_layers=6,
                              num_heads=8,)

model_r100 = REINFORCE(soft_envs[3],
                policy_r100,
                baseline="rollout",
                batch_size=BATCH_SIZE_100,
                train_data_size=TRAIN_DATA_SIZE_100,
                val_data_size=VAL_DATA_SIZE_100,
                optimizer_kwargs={"lr": 1e-4, 
                                "weight_decay": 1e-6})

In [15]:
# POMO
trainer_STEP100 = RL4COTrainer(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
    callbacks=[
        RewardLoggingCallback(
            policy=policy100.to(device),
            test_data=td_tests,
            env_scale=hard_envs,
            scale = scale,
            log_dir="logs",  # Need to set the logs folder or else
            file_name="SOFT_POMO_C100"
        )
    ]
)
trainer_STEP100.fit(model_100)

# RCPOMO
trainer_C_STEP100 = RL4COTrainer(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c100.to(device),
            test_data=td_tests,
            env_scale=hard_envs,
            scale = scale,
            log_dir="logs",  # Need to set the logs folder or else
            file_name="SOFT_RCPOMO_C100"
        )
    ]
)
trainer_C_STEP100.fit(model_c100)

# REINFORCE
trainer_R_STEP100 = RL4COTrainer(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
    callbacks=[
        RewardLoggingCallback(
            policy=policy_r100.to(device),
            test_data=td_tests,
            env_scale=hard_envs,
            scale = scale,
            log_dir="logs",  # Need to set the logs folder or else
            file_name="SOFT_REINFORCE_C100"
        )
    ]
)
trainer_R_STEP100.fit(model_r100)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [02:52<00:00,  1.16it/s, v_num=549, train/reward=-11.5, train/loss=-2.93, val/reward=-10.8]Callback is called
Callback is finished
Epoch 1: 100%|██████████| 200/200 [02:52<00:00,  1.16it/s, v_num=549, train/reward=-11.1, train/loss=-1.99, val/reward=-10.5]Callback is called
Callback is finished
Epoch 2: 100%|██████████| 200/200 [03:00<00:00,  1.11it/s, v_num=549, train/reward=-10.7, train/loss=-2.37, val/reward=-10.3]Callback is called
Callback is finished
Epoch 3: 100%|██████████| 200/200 [03:10<00:00,  1.05it/s, v_num=549, train/reward=-10.5, train/loss=-1.49, val/reward=-10.1]Callback is called
Callback is finished
Epoch 4: 100%|██████████| 200/200 [03:12<00:00,  1.04it/s, v_num=549, train/reward=-10.3, train/loss=-1.30, val/reward=-10.0]Callback is called
Callback is finished
Epoch 5: 100%|██████████| 200/200 [03:10<00:00,  1.05it/s, v_num=549, train/reward=-10.1, train/loss=-1.14, val/reward=-9.88]Callback is called
Callback is finished
Epoch 6: 1

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [03:13<00:00,  1.03it/s, v_num=549, train/reward=-9.04, train/loss=-0.663, val/reward=-8.94]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [03:25<00:00,  0.97it/s, v_num=550, train/reward=-25.0, train/loss=-8.07, val/reward=-22.6]Callback is called
Callback is finished
Epoch 1: 100%|██████████| 200/200 [03:18<00:00,  1.01it/s, v_num=550, train/reward=-22.6, train/loss=-6.39, val/reward=-20.9]Callback is called
Callback is finished
Epoch 2: 100%|██████████| 200/200 [03:16<00:00,  1.02it/s, v_num=550, train/reward=-23.0, train/loss=-6.26, val/reward=-21.9]Callback is called
Callback is finished
Epoch 3: 100%|██████████| 200/200 [03:17<00:00,  1.01it/s, v_num=550, train/reward=-21.4, train/loss=-4.23, val/reward=-20.6]Callback is called
Callback is finished
Epoch 4: 100%|██████████| 200/200 [03:18<00:00,  1.01it/s, v_num=550, train/reward=-20.5, train/loss=-3.92, val/reward=-20.1]Callback is called
Callback is finished
Epoch 5: 100%|██████████| 200/200 [03:16<00:00,  1.02it/s, v_num=550, train/reward=-20.7, train/loss=-2.97, val/reward=-20.1]Callback is called
Callback is finished
Epoch 6: 1

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [03:16<00:00,  1.02it/s, v_num=550, train/reward=-17.8, train/loss=-1.42, val/reward=-17.7]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | WarmupBaseline       | 3.6 M  | train
----------------------------------------------------------
7.1 M     Trainable params
0         Non-trainable params
7.1 M     Total params
28.482    Total estimated model params size (MB)
128       Modules in train mode
124       Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [01:44<00:00,  1.92it/s, v_num=551, train/reward=-14.4, train/loss=20.70, val/reward=-11.9]Callback is called
Callback is finished
Epoch 1: 100%|██████████| 200/200 [01:56<00:00,  1.71it/s, v_num=551, train/reward=-76.2, train/loss=-3.22e+4, val/reward=-1.04]  Callback is called
Callback is finished
Epoch 2: 100%|██████████| 200/200 [02:02<00:00,  1.63it/s, v_num=551, train/reward=-69.0, train/loss=-2.83e+4, val/reward=-1.04]Callback is called
Callback is finished
Epoch 3: 100%|██████████| 200/200 [02:04<00:00,  1.61it/s, v_num=551, train/reward=-92.5, train/loss=-4e+4, val/reward=-1.04]   Callback is called
Callback is finished
Epoch 4: 100%|██████████| 200/200 [02:02<00:00,  1.63it/s, v_num=551, train/reward=-162., train/loss=-7.64e+4, val/reward=-1.04]Callback is called
Callback is finished
Epoch 5: 100%|██████████| 200/200 [02:03<00:00,  1.62it/s, v_num=551, train/reward=-123., train/loss=-5.66e+4, val/reward=-1.04]Callback is called
Callback is fi

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [02:06<00:00,  1.59it/s, v_num=551, train/reward=-84.1, train/loss=-3.59e+4, val/reward=-1.04]


In [16]:
policy100 = policy100.to(device)
rewards_trained, num_valid = get_reward_and_check(policy100, td_tests, hard_envs)
# print(rewards_trained)
print("POMO: Trained with Environment of C=100, S=12, EV=12")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_valid[i]} | Mean Trained Test Cost: {-rewards_trained[i].mean():3f}")

policy_c100 = policy_c100.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c100, td_tests, hard_envs)
print("\nRCPOMO: Trained with Environment of C=100, S=12, EV=12")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_c_valid[i]} | Mean Trained Test Cost: {-rewards_c_trained[i].mean():3f}")
    
policy_r100 = policy_r100.to(device)
rewards_r_trained, num_r_valid = get_reward_and_check(policy_r100, td_tests, hard_envs)
print("\nREINFORCE: Trained with Environment of C=100, S=12, EV=12")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_r_valid[i]} | Mean Trained Test Cost: {-rewards_r_trained[i].mean():3f}")

POMO: Trained with Environment of C=100, S=12, EV=12
Scale: 10 | FeasibleCounts: 45 | Mean Trained Test Cost: 7.064821
Scale: 20 | FeasibleCounts: 1 | Mean Trained Test Cost: 6.989510
Scale: 50 | FeasibleCounts: 0 | Mean Trained Test Cost: nan
Scale: 100 | FeasibleCounts: 0 | Mean Trained Test Cost: nan


  print(f"Scale: {s} | FeasibleCounts: {num_valid[i]} | Mean Trained Test Cost: {-rewards_trained[i].mean():3f}")



RCPOMO: Trained with Environment of C=100, S=12, EV=12
Scale: 10 | FeasibleCounts: 44 | Mean Trained Test Cost: 7.245407
Scale: 20 | FeasibleCounts: 1 | Mean Trained Test Cost: 9.372268
Scale: 50 | FeasibleCounts: 1 | Mean Trained Test Cost: 17.735788
Scale: 100 | FeasibleCounts: 0 | Mean Trained Test Cost: nan


  print(f"Scale: {s} | FeasibleCounts: {num_c_valid[i]} | Mean Trained Test Cost: {-rewards_c_trained[i].mean():3f}")



REINFORCE: Trained with Environment of C=100, S=12, EV=12
Scale: 10 | FeasibleCounts: 75 | Mean Trained Test Cost: 6.496563
Scale: 20 | FeasibleCounts: 4 | Mean Trained Test Cost: 9.495222
Scale: 50 | FeasibleCounts: 0 | Mean Trained Test Cost: nan
Scale: 100 | FeasibleCounts: 0 | Mean Trained Test Cost: nan


  print(f"Scale: {s} | FeasibleCounts: {num_r_valid[i]} | Mean Trained Test Cost: {-rewards_r_trained[i].mean():3f}")
