In [1]:
%load_ext autoreload
%autoreload 2

import torch
import numpy as np

from rl4co.envs import CVRPTWEnv, EVRPTWEnv 
from rl4co.models import AttentionModelPolicy, REINFORCE, SymNCO, PPO, POMO, RewardConstrainedPOMO
from rl4co.utils.trainer import RL4COTrainer
from rl4co.utils.callbacks.reward_check import RewardLoggingCallback, get_reward_and_check

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sanity check: print the file path of the rl4co package this kernel imported,
# to confirm which installation is in use.
import rl4co
print(rl4co.__file__)

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\rl4co\__init__.py


In [3]:
import sys

# Disabled: removal of a local rl4co checkout from the import search path.
# sys.path.remove(r"c:\users\hyosi\onedrive\ut\2024 fall\mie1666\project\code\rl4evrptw\rl4co")

# Show every import search location, one per line, in lookup order.
for search_dir in sys.path:
    print(search_dir)

c:\Users\hyosi\anaconda3\envs\rl4co\python311.zip
c:\Users\hyosi\anaconda3\envs\rl4co\DLLs
c:\Users\hyosi\anaconda3\envs\rl4co\Lib
c:\Users\hyosi\anaconda3\envs\rl4co

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\win32
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\win32\lib
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\Pythonwin
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\setuptools\_vendor


In [4]:
# Device that the test batches and the evaluated policies are moved to.
device = torch.device("cuda")


def enforce_reproducibility(seed):
    """Seed every random number generator this notebook relies on.

    Exports ``PYTHONHASHSEED`` and ``CUBLAS_WORKSPACE_CONFIG``, seeds Python's
    ``random`` module, NumPy's global generator and PyTorch, and, when CUDA is
    available, seeds all CUDA devices and puts cuDNN into deterministic,
    non-benchmarking mode.

    Args:
        seed: Seed value handed to every generator.

    Returns:
        None.
    """
    import os
    import random

    # NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter
    # start-up, so setting it here would affect child processes only - confirm.
    # CUBLAS_WORKSPACE_CONFIG configures NVIDIA's cuBLAS (CUDA Basic Linear
    # Algebra Subroutines) workspace.
    os.environ.update(
        {
            "PYTHONHASHSEED": str(seed),
            "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
        }
    )

    # Python, NumPy and PyTorch generators, in that order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
        
vehicle_capacity = 1.25

# Number of instances in each fixed test batch.
TEST_BATCH_SIZE = 100

# One row per problem scale: [num_loc, num_station, num_ev]
settings = [[10, 3, 3], [20, 3, 3], [50, 6, 6], [100, 12, 12]]


def make_evrptw_env(num_loc, num_station, num_ev, soft=False):
    """Build one EVRPTW environment for a given problem scale.

    The random state is reset with ``enforce_reproducibility(0)`` right before
    the environment is constructed, so hard and soft environments of the same
    scale are created from the same random state.

    Args:
        num_loc: Passed to the generator as ``num_loc``.
        num_station: Passed to the generator as ``num_station``.
        num_ev: Passed to the generator as ``vehicle_limit``.
        soft: When true, ``env.soft`` is set to ``True`` after construction;
            otherwise the attribute is left at whatever the constructor set.

    Returns:
        The constructed ``EVRPTWEnv``.
    """
    enforce_reproducibility(0)
    env = EVRPTWEnv(
        generator_params={
            "num_loc": num_loc,
            "num_station": num_station,
            "vehicle_limit": num_ev,
            "vehicle_speed": 5,
            "vehicle_capacity": vehicle_capacity,
            "max_time": 1,
            "horizon": 1,
            "fuel_consumption_rate": 0.25,
            "inverse_recharge_rate": 0.25,
        }
    )
    if soft:
        env.soft = True  ## Soft setting
    return env


hard_envs = []
td_tests = []   # Hard env setting for test (cf. get_action_mask() is different)
for num_loc, num_station, num_ev in settings:
    env = make_evrptw_env(num_loc, num_station, num_ev)
    hard_envs.append(env)
    # The test batch is drawn right after construction, before any other use of
    # the random generators.
    td_init = env.reset(batch_size=[TEST_BATCH_SIZE]).to(device)
    td_tests.append(td_init)

soft_envs = []
for num_loc, num_station, num_ev in settings:
    env = make_evrptw_env(num_loc, num_station, num_ev, soft=True)
    soft_envs.append(env)




In [5]:
MAX_EPOCH = 25
BATCH_SIZE = 512
TRAIN_DATA_SIZE = BATCH_SIZE * 200
VAL_DATA_SIZE = BATCH_SIZE * 50
# Quick smoke-test alternative: MAX_EPOCH = 2, BATCH_SIZE = 10,
# TRAIN_DATA_SIZE = BATCH_SIZE * 1, VAL_DATA_SIZE = BATCH_SIZE * 1

# Optimizer settings shared by every model; each model receives its own copy.
OPTIMIZER_KWARGS = {"lr": 1e-4, "weight_decay": 1e-6}


def build_policy(env):
    """Create an attention-model policy for the given environment.

    Args:
        env: Environment whose ``name`` is passed as ``env_name``.

    Returns:
        A new ``AttentionModelPolicy`` with ``embed_dim=256``,
        ``num_encoder_layers=6`` and ``num_heads=8``.
    """
    return AttentionModelPolicy(
        env_name=env.name,
        embed_dim=256,
        num_encoder_layers=6,
        num_heads=8,
    )


def build_rcpomo(
    env,
    policy,
    batch_size=BATCH_SIZE,
    train_data_size=TRAIN_DATA_SIZE,
    val_data_size=VAL_DATA_SIZE,
):
    """Wrap ``policy`` in a ``RewardConstrainedPOMO`` model for ``env``.

    No ``baseline`` argument is passed (``baseline="rollout"`` was left
    disabled), so the model's default baseline is used.

    Args:
        env: Environment the model trains on.
        policy: Policy to train; it is passed through as-is, not copied.
        batch_size: Batch size; defaults to ``BATCH_SIZE`` as it was when this
            cell ran.
        train_data_size: Training set size; defaults to ``TRAIN_DATA_SIZE``.
        val_data_size: Validation set size; defaults to ``VAL_DATA_SIZE``.

    Returns:
        The constructed ``RewardConstrainedPOMO`` model.
    """
    return RewardConstrainedPOMO(
        env,
        policy,
        batch_size=batch_size,
        train_data_size=train_data_size,
        val_data_size=val_data_size,
        optimizer_kwargs=dict(OPTIMIZER_KWARGS),
    )


# RCPOMO
# Policies are created first and in scale order, so their random weight
# initialisation happens in a fixed sequence.
policy_c1 = build_policy(soft_envs[0])
policy_c2 = build_policy(soft_envs[1])
policy_c5 = build_policy(soft_envs[2])

# The soft and hard model of one scale share the same policy object, so
# whichever of the two is fitted second continues from the weights left by the
# first.
soft_model_c10 = build_rcpomo(soft_envs[0], policy_c1)
soft_model_c20 = build_rcpomo(soft_envs[1], policy_c2)
soft_model_c50 = build_rcpomo(soft_envs[2], policy_c5)

hard_model_c10 = build_rcpomo(hard_envs[0], policy_c1)
hard_model_c20 = build_rcpomo(hard_envs[1], policy_c2)
hard_model_c50 = build_rcpomo(hard_envs[2], policy_c5)


c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'env' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['env'])`.
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'policy' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['policy'])`.


In [6]:
# Customer counts of the test scales; these match the first column of `settings`.
scale = [10, 20, 50, 100]

# Settings common to both trainers of this cell.
trainer_args = dict(max_epochs=MAX_EPOCH, accelerator="gpu", devices=1, logger=None)
# Settings common to both reward-logging callbacks: the fixed test batches, the
# hard environments, and an explicit log folder (original note: "Need to set
# the logs folder or else").
reward_log_args = dict(
    test_data=td_tests,
    env_scale=hard_envs,
    scale=scale,
    log_dir="logs",
)

# RCPOMO
# Soft-constraint run first. The policy is moved to `device` when each callback
# is built, i.e. immediately before the corresponding fit.
soft_trainer_C_STEP = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c1.to(device),
            file_name="25_SOFT_RCPOMO_C10",
            **reward_log_args,
        )
    ],
    **trainer_args,
)
soft_trainer_C_STEP.fit(soft_model_c10)

# Hard-constraint run second, on a model that wraps the same policy object.
hard_trainer_C_STEP = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c1.to(device),
            file_name="25_HARD_RCPOMO_C10",
            **reward_log_args,
        )
    ],
    **trainer_args,
)
hard_trainer_C_STEP.fit(hard_model_c10)


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


                                                                           

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 200/200 [00:25<00:00,  7.96it/s, v_num=552, train/reward=-6.69, train/loss=-8.60, val/reward=-3.88]Callback is finished
Epoch 1: 100%|██████████| 200/200 [00:25<00:00,  7.79it/s, v_num=552, train/reward=-3.90, train/loss=-0.351, val/reward=-3.79]Callback is finished
Epoch 2: 100%|██████████| 200/200 [00:26<00:00,  7.46it/s, v_num=552, train/reward=-3.80, train/loss=-0.297, val/reward=-3.74]Callback is finished
Epoch 3: 100%|██████████| 200/200 [00:26<00:00,  7.47it/s, v_num=552, train/reward=-3.92, train/loss=-0.541, val/reward=-3.73] Callback is finished
Epoch 3: 100%|██████████| 200/200 [00:27<00:00,  7.26it/s, v_num=552, train/reward=-3.92, train/loss=-0.541, val/reward=-3.73]

  epoch_data[f"C{s}_mean_reward"] = -rewards_trained[i].mean()
  ret = ret.dtype.type(ret / rcount)


Epoch 4: 100%|██████████| 200/200 [00:27<00:00,  7.33it/s, v_num=552, train/reward=-3.77, train/loss=-0.218, val/reward=-3.69]Callback is finished
Epoch 5: 100%|██████████| 200/200 [00:26<00:00,  7.54it/s, v_num=552, train/reward=-3.69, train/loss=-0.214, val/reward=-3.64]Callback is finished
Epoch 6: 100%|██████████| 200/200 [00:26<00:00,  7.62it/s, v_num=552, train/reward=-3.67, train/loss=-0.203, val/reward=-3.66]Callback is finished
Epoch 7: 100%|██████████| 200/200 [00:26<00:00,  7.67it/s, v_num=552, train/reward=-3.67, train/loss=-0.193, val/reward=-3.66]Callback is finished
Epoch 8: 100%|██████████| 200/200 [00:25<00:00,  7.89it/s, v_num=552, train/reward=-3.70, train/loss=-0.193, val/reward=-3.64] Callback is finished
Epoch 9: 100%|██████████| 200/200 [00:27<00:00,  7.34it/s, v_num=552, train/reward=-3.67, train/loss=-0.176, val/reward=-3.65]Callback is finished
Epoch 10: 100%|██████████| 200/200 [00:26<00:00,  7.51it/s, v_num=552, train/reward=-3.64, train/loss=-0.153, val/rew

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [00:26<00:00,  7.58it/s, v_num=552, train/reward=-3.58, train/loss=-0.121, val/reward=-3.55]

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs





val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [00:22<00:00,  8.73it/s, v_num=553, train/reward=-11.0, train/loss=-1.13, val/reward=-5.14]Callback is finished
Epoch 1: 100%|██████████| 200/200 [00:21<00:00,  9.15it/s, v_num=553, train/reward=-15.8, train/loss=-1.08, val/reward=-5.23]  Callback is finished
Epoch 2: 100%|██████████| 200/200 [00:21<00:00,  9.11it/s, v_num=553, train/reward=-6.95, train/loss=-0.172, val/reward=-5.08] Callback is finished
Epoch 3: 100%|██████████| 200/200 [00:21<00:00,  9.30it/s, v_num=553, train/reward=-7.62, train/loss=-0.072, val/reward=-5.09]  Callback is finished
Epoch 4: 100%|██████████| 200/200 [00:22<00:00,  8.78it/s, v_num=553, train/reward=-8.33, train/loss=-0.517, val/reward=-5.02] Callback is finished
Epoch 5: 100%|██████████| 200/200 [00:22<00:00,  8.86it/s, v_num=553, train/reward=-12.4, train/loss=-0.53, val/reward=-4.97]  Callback is finished
Epoch 6: 100%|██████████| 200/200 [00:21<00:00,  9.37it/s, v_num=553, train/reward=-8.83, train/loss=-0.513, val/

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [00:21<00:00,  9.40it/s, v_num=553, train/reward=-6.65, train/loss=-0.239, val/reward=-4.99]


In [7]:
# Put the policy on `device` before evaluating it on the fixed test batches.
# NOTE(review): presumably needed because fitting can leave the module on
# another device - confirm.
policy_c1 = policy_c1.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c1, td_tests, hard_envs)

print("\nRCPOMO: Trained with Environment of C=10, S=3, EV=3")
# NOTE(review): the spec `3f` sets a minimum width of 3 with the default six
# decimals; `.3f` (three decimals) may have been intended.
for idx, n_customers in enumerate(scale):
    mean_cost = -rewards_c_trained[idx].mean()
    print(f"Scale: {n_customers} | FeasibleCounts: {num_c_valid[idx]} | Mean Trained Test Cost: {mean_cost:3f}")


RCPOMO: Trained with Environment of C=10, S=3, EV=3
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 4.827217
Scale: 20 | FeasibleCounts: 95 | Mean Trained Test Cost: 7.735532
Scale: 50 | FeasibleCounts: 95 | Mean Trained Test Cost: 16.282394
Scale: 100 | FeasibleCounts: 99 | Mean Trained Test Cost: 31.068134


In [8]:
# Drop this round's trainers and evaluation results, then ask PyTorch to
# release its cached CUDA memory.
del soft_trainer_C_STEP
del hard_trainer_C_STEP
del rewards_c_trained
del num_c_valid
torch.cuda.empty_cache()

In [9]:

# RCPOMO
# Settings common to both trainers of this cell.
trainer_args = dict(max_epochs=MAX_EPOCH, accelerator="gpu", devices=1, logger=None)
# Settings common to both reward-logging callbacks: the fixed test batches, the
# hard environments, and an explicit log folder (original note: "Need to set
# the logs folder or else").
reward_log_args = dict(
    test_data=td_tests,
    env_scale=hard_envs,
    scale=scale,
    log_dir="logs",
)

# Soft-constraint run first. The policy is moved to `device` when each callback
# is built, i.e. immediately before the corresponding fit.
soft_trainer_C_STEP = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c2.to(device),
            file_name="25_SOFT_RCPOMO_C20",
            **reward_log_args,
        )
    ],
    **trainer_args,
)
soft_trainer_C_STEP.fit(soft_model_c20)

# Hard-constraint run second, on a model that wraps the same policy object.
hard_trainer_C_STEP = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c2.to(device),
            file_name="25_HARD_RCPOMO_C20",
            **reward_log_args,
        )
    ],
    **trainer_args,
)
hard_trainer_C_STEP.fit(hard_model_c20)


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [00:44<00:00,  4.54it/s, v_num=554, train/reward=-6.57, train/loss=-2.85, val/reward=-5.91]Callback is finished
Epoch 1: 100%|██████████| 200/200 [00:41<00:00,  4.81it/s, v_num=554, train/reward=-8.30, train/loss=-9.83, val/reward=-6.16]Callback is finished
Epoch 2: 100%|██████████| 200/200 [00:41<00:00,  4.79it/s, v_num=554, train/reward=-6.90, train/loss=-4.31, val/reward=-6.15] Callback is finished
Epoch 3: 100%|██████████| 200/200 [00:42<00:00,  4.70it/s, v_num=554, train/reward=-6.40, train/loss=-1.14, val/reward=-5.97] Callback is finished
Epoch 4: 100%|██████████| 200/200 [00:40<00:00,  5.00it/s, v_num=554, train/reward=-6.15, train/loss=-1.38, val/reward=-5.83] Callback is finished
Epoch 5: 100%|██████████| 200/200 [00:41<00:00,  4.87it/s, v_num=554, train/reward=-6.01, train/loss=-0.857, val/reward=-5.79]Callback is finished
Epoch 6: 100%|██████████| 200/200 [00:40<00:00,  4.93it/s, v_num=554, train/reward=-6.02, train/loss=-0.686, val/reward=

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [00:39<00:00,  5.02it/s, v_num=554, train/reward=-5.69, train/loss=-2.37, val/reward=-5.44]

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores





HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [00:35<00:00,  5.59it/s, v_num=555, train/reward=-88.8, train/loss=-30.3, val/reward=-8.03]Callback is finished
Epoch 1: 100%|██████████| 200/200 [00:36<00:00,  5.47it/s, v_num=555, train/reward=-75.7, train/loss=4.120, val/reward=-8.21] Callback is finished
Epoch 2: 100%|██████████| 200/200 [00:34<00:00,  5.84it/s, v_num=555, train/reward=-63.7, train/loss=-10.1, val/reward=-8.24]  Callback is finished
Epoch 3: 100%|██████████| 200/200 [00:35<00:00,  5.63it/s, v_num=555, train/reward=-60.9, train/loss=-4.87, val/reward=-8.15] Callback is finished
Epoch 4: 100%|██████████| 200/200 [00:35<00:00,  5.60it/s, v_num=555, train/reward=-60.8, train/loss=-4.06, val/reward=-8.02] Callback is finished
Epoch 5: 100%|██████████| 200/200 [00:37<00:00,  5.34it/s, v_num=555, train/reward=-60.5, train/loss=5.070, val/reward=-8.08] Callback is finished
Epoch 6: 100%|██████████| 200/200 [00:35<00:00,  5.56it/s, v_num=555, train/reward=-48.5, train/loss=-7.56, val/reward

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [00:37<00:00,  5.36it/s, v_num=555, train/reward=-49.7, train/loss=-8.21, val/reward=-8.01]


In [10]:
# Put the policy on `device` before evaluating it on the fixed test batches.
# NOTE(review): presumably needed because fitting can leave the module on
# another device - confirm.
policy_c2 = policy_c2.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c2, td_tests, hard_envs)

print("\nRCPOMO: Trained with Environment of C=20, S=3, EV=3")
# NOTE(review): the spec `3f` sets a minimum width of 3 with the default six
# decimals; `.3f` (three decimals) may have been intended.
for idx, n_customers in enumerate(scale):
    mean_cost = -rewards_c_trained[idx].mean()
    print(f"Scale: {n_customers} | FeasibleCounts: {num_c_valid[idx]} | Mean Trained Test Cost: {mean_cost:3f}")


RCPOMO: Trained with Environment of C=20, S=3, EV=3
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 4.863012
Scale: 20 | FeasibleCounts: 100 | Mean Trained Test Cost: 7.804768
Scale: 50 | FeasibleCounts: 100 | Mean Trained Test Cost: 16.240088
Scale: 100 | FeasibleCounts: 97 | Mean Trained Test Cost: 32.686348


In [11]:
# Drop this round's trainers and evaluation results, then ask PyTorch to
# release its cached CUDA memory.
del soft_trainer_C_STEP
del hard_trainer_C_STEP
del rewards_c_trained
del num_c_valid
torch.cuda.empty_cache()

In [12]:

# RCPOMO
# Settings common to both trainers of this cell.
trainer_args = dict(max_epochs=MAX_EPOCH, accelerator="gpu", devices=1, logger=None)
# Settings common to both reward-logging callbacks: the fixed test batches, the
# hard environments, and an explicit log folder (original note: "Need to set
# the logs folder or else").
reward_log_args = dict(
    test_data=td_tests,
    env_scale=hard_envs,
    scale=scale,
    log_dir="logs",
)

# Soft-constraint run first. The policy is moved to `device` when each callback
# is built, i.e. immediately before the corresponding fit.
soft_trainer_C_STEP = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c5.to(device),
            file_name="25_SOFT_RCPOMO_C50",
            **reward_log_args,
        )
    ],
    **trainer_args,
)
soft_trainer_C_STEP.fit(soft_model_c50)

# Hard-constraint run second, on a model that wraps the same policy object.
hard_trainer_C_STEP = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c5.to(device),
            file_name="25_HARD_RCPOMO_C50",
            **reward_log_args,
        )
    ],
    **trainer_args,
)
hard_trainer_C_STEP.fit(hard_model_c50)


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [02:00<00:00,  1.66it/s, v_num=556, train/reward=-14.1, train/loss=-6.77, val/reward=-12.5]Callback is finished
Epoch 1: 100%|██████████| 200/200 [01:53<00:00,  1.76it/s, v_num=556, train/reward=-13.0, train/loss=-3.17, val/reward=-12.1]Callback is finished
Epoch 2: 100%|██████████| 200/200 [01:55<00:00,  1.73it/s, v_num=556, train/reward=-12.6, train/loss=-2.56, val/reward=-11.9]Callback is finished
Epoch 3: 100%|██████████| 200/200 [01:55<00:00,  1.74it/s, v_num=556, train/reward=-12.0, train/loss=-2.55, val/reward=-11.4]Callback is finished
Epoch 4: 100%|██████████| 200/200 [01:54<00:00,  1.75it/s, v_num=556, train/reward=-11.9, train/loss=-2.62, val/reward=-11.4]Callback is finished
Epoch 5: 100%|██████████| 200/200 [01:55<00:00,  1.73it/s, v_num=556, train/reward=-11.8, train/loss=-2.07, val/reward=-11.3]Callback is finished
Epoch 6: 100%|██████████| 200/200 [01:54<00:00,  1.75it/s, v_num=556, train/reward=-11.5, train/loss=-2.41, val/reward=-11.0

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [01:55<00:00,  1.73it/s, v_num=556, train/reward=-10.7, train/loss=-1.91, val/reward=-10.5]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [01:35<00:00,  2.10it/s, v_num=557, train/reward=-27.6, train/loss=-15.7, val/reward=-16.0]Callback is finished
Epoch 1: 100%|██████████| 200/200 [01:34<00:00,  2.13it/s, v_num=557, train/reward=-25.8, train/loss=-8.98, val/reward=-15.8] Callback is finished
Epoch 2: 100%|██████████| 200/200 [01:33<00:00,  2.13it/s, v_num=557, train/reward=-24.0, train/loss=0.395, val/reward=-15.6] Callback is finished
Epoch 3: 100%|██████████| 200/200 [01:32<00:00,  2.15it/s, v_num=557, train/reward=-28.2, train/loss=-9.67, val/reward=-15.2] Callback is finished
Epoch 4: 100%|██████████| 200/200 [01:32<00:00,  2.17it/s, v_num=557, train/reward=-24.7, train/loss=-2.06, val/reward=-15.7] Callback is finished
Epoch 5: 100%|██████████| 200/200 [01:31<00:00,  2.19it/s, v_num=557, train/reward=-25.1, train/loss=-8.01, val/reward=-16.2] Callback is finished
Epoch 6: 100%|██████████| 200/200 [01:33<00:00,  2.15it/s, v_num=557, train/reward=-23.0, train/loss=-1.40, val/reward=

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [01:32<00:00,  2.16it/s, v_num=557, train/reward=-22.7, train/loss=-6.37, val/reward=-15.4]


In [13]:
# Put the policy on `device` before evaluating it on the fixed test batches.
# NOTE(review): presumably needed because fitting can leave the module on
# another device - confirm.
policy_c5 = policy_c5.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c5, td_tests, hard_envs)

print("\nRCPOMO: Trained with Environment of C=50, S=6, EV=6")
# NOTE(review): the spec `3f` sets a minimum width of 3 with the default six
# decimals; `.3f` (three decimals) may have been intended.
for idx, n_customers in enumerate(scale):
    mean_cost = -rewards_c_trained[idx].mean()
    print(f"Scale: {n_customers} | FeasibleCounts: {num_c_valid[idx]} | Mean Trained Test Cost: {mean_cost:3f}")


RCPOMO: Trained with Environment of C=50, S=6, EV=6
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 4.846461
Scale: 20 | FeasibleCounts: 98 | Mean Trained Test Cost: 7.474066
Scale: 50 | FeasibleCounts: 100 | Mean Trained Test Cost: 14.989614
Scale: 100 | FeasibleCounts: 100 | Mean Trained Test Cost: 26.575089


In [14]:
# Drop this round's trainers and evaluation results, then ask PyTorch to
# release its cached CUDA memory.
del soft_trainer_C_STEP
del hard_trainer_C_STEP
del rewards_c_trained
del num_c_valid
torch.cuda.empty_cache()

In [15]:
# Settings for the largest scale: a smaller batch size than the other scales,
# with the same 200 training and 50 validation batches per epoch.
MAX_EPOCH = 25
BATCH_SIZE_100 = 128
TRAIN_DATA_SIZE_100 = 200 * BATCH_SIZE_100
VAL_DATA_SIZE_100 = 50 * BATCH_SIZE_100

# RCPOMO
policy_c100 = AttentionModelPolicy(
    env_name=soft_envs[3].name,
    embed_dim=256,
    num_encoder_layers=6,
    num_heads=8,
)

# Size arguments common to the soft and hard model. No `baseline` argument is
# passed (`baseline="rollout"` was left disabled).
rcpomo_c100_sizes = dict(
    batch_size=BATCH_SIZE_100,
    train_data_size=TRAIN_DATA_SIZE_100,
    val_data_size=VAL_DATA_SIZE_100,
)

# Both models wrap the same policy object; each gets its own optimizer dict.
soft_model_c100 = RewardConstrainedPOMO(
    soft_envs[3],
    policy_c100,
    optimizer_kwargs={"lr": 1e-4, "weight_decay": 1e-6},
    **rcpomo_c100_sizes,
)

hard_model_c100 = RewardConstrainedPOMO(
    hard_envs[3],
    policy_c100,
    optimizer_kwargs={"lr": 1e-4, "weight_decay": 1e-6},
    **rcpomo_c100_sizes,
)

In [16]:

# RCPOMO
# Settings common to both trainers of this cell.
trainer_args = dict(max_epochs=MAX_EPOCH, accelerator="gpu", devices=1, logger=None)
# Settings common to both reward-logging callbacks: the fixed test batches, the
# hard environments, and an explicit log folder (original note: "Need to set
# the logs folder or else").
reward_log_args = dict(
    test_data=td_tests,
    env_scale=hard_envs,
    scale=scale,
    log_dir="logs",
)

# Soft-constraint run first. The policy is moved to `device` when each callback
# is built, i.e. immediately before the corresponding fit.
soft_trainer_C_STEP = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c100.to(device),
            file_name="25_SOFT_RCPOMO_C100",
            **reward_log_args,
        )
    ],
    **trainer_args,
)
soft_trainer_C_STEP.fit(soft_model_c100)

# Hard-constraint run second, on a model that wraps the same policy object.
hard_trainer_C_STEP = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c100.to(device),
            file_name="25_HARD_RCPOMO_C100",
            **reward_log_args,
        )
    ],
    **trainer_args,
)
hard_trainer_C_STEP.fit(hard_model_c100)


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [03:33<00:00,  0.93it/s, v_num=558, train/reward=-27.3, train/loss=-10.8, val/reward=-24.9]Callback is finished
Epoch 1: 100%|██████████| 200/200 [03:15<00:00,  1.02it/s, v_num=558, train/reward=-25.5, train/loss=-8.76, val/reward=-22.5]Callback is finished
Epoch 2: 100%|██████████| 200/200 [03:20<00:00,  1.00it/s, v_num=558, train/reward=-23.0, train/loss=-5.17, val/reward=-22.3]Callback is finished
Epoch 3: 100%|██████████| 200/200 [03:19<00:00,  1.00it/s, v_num=558, train/reward=-22.7, train/loss=-4.56, val/reward=-21.8]Callback is finished
Epoch 4: 100%|██████████| 200/200 [03:19<00:00,  1.00it/s, v_num=558, train/reward=-22.9, train/loss=-4.02, val/reward=-21.6]Callback is finished
Epoch 5: 100%|██████████| 200/200 [03:18<00:00,  1.01it/s, v_num=558, train/reward=-21.9, train/loss=-4.17, val/reward=-21.3]Callback is finished
Epoch 6: 100%|██████████| 200/200 [03:16<00:00,  1.02it/s, v_num=558, train/reward=-21.4, train/loss=-3.20, val/reward=-20.5

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [03:14<00:00,  1.03it/s, v_num=558, train/reward=-20.9, train/loss=-4.48, val/reward=-19.9]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 200/200 [02:46<00:00,  1.20it/s, v_num=559, train/reward=-28.9, train/loss=-6.45, val/reward=-26.5]Callback is finished
Epoch 1: 100%|██████████| 200/200 [02:38<00:00,  1.26it/s, v_num=559, train/reward=-25.8, train/loss=-2.28, val/reward=-24.7] Callback is finished
Epoch 2: 100%|██████████| 200/200 [02:37<00:00,  1.27it/s, v_num=559, train/reward=-24.8, train/loss=-2.05, val/reward=-23.9] Callback is finished
Epoch 3: 100%|██████████| 200/200 [02:36<00:00,  1.27it/s, v_num=559, train/reward=-24.4, train/loss=-2.34, val/reward=-24.0] Callback is finished
Epoch 4: 100%|██████████| 200/200 [02:40<00:00,  1.25it/s, v_num=559, train/reward=-24.6, train/loss=-3.07, val/reward=-23.3] Callback is finished
Epoch 5: 100%|██████████| 200/200 [02:37<00:00,  1.27it/s, v_num=559, train/reward=-24.3, train/loss=-2.01, val/reward=-23.3] Callback is finished
Epoch 6: 100%|██████████| 200/200 [02:39<00:00,  1.25it/s, v_num=559, train/reward=-23.5, train/loss=-3.10, val/reward=

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [02:40<00:00,  1.24it/s, v_num=559, train/reward=-21.9, train/loss=-0.458, val/reward=-21.3]


In [17]:
# Put the policy on `device` before evaluating it on the fixed test batches.
# NOTE(review): presumably needed because fitting can leave the module on
# another device - confirm.
policy_c100 = policy_c100.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c100, td_tests, hard_envs)

print("\nRCPOMO: Trained with Environment of C=100, S=12, EV=12")
# NOTE(review): the spec `3f` sets a minimum width of 3 with the default six
# decimals; `.3f` (three decimals) may have been intended.
for idx, n_customers in enumerate(scale):
    mean_cost = -rewards_c_trained[idx].mean()
    print(f"Scale: {n_customers} | FeasibleCounts: {num_c_valid[idx]} | Mean Trained Test Cost: {mean_cost:3f}")


RCPOMO: Trained with Environment of C=100, S=12, EV=12
Scale: 10 | FeasibleCounts: 94 | Mean Trained Test Cost: 6.650102
Scale: 20 | FeasibleCounts: 78 | Mean Trained Test Cost: 6.828812
Scale: 50 | FeasibleCounts: 93 | Mean Trained Test Cost: 12.858833
Scale: 100 | FeasibleCounts: 100 | Mean Trained Test Cost: 20.862005
