In [1]:
%load_ext autoreload
%autoreload 2

import torch
import numpy as np

from rl4co.envs import CVRPTWEnv, EVRPTWEnv 
from rl4co.models import AttentionModelPolicy, REINFORCE, SymNCO, PPO, POMO, RewardConstrainedPOMO
from rl4co.utils.trainer import RL4COTrainer
from rl4co.utils.callbacks.reward_check import RewardLoggingCallback, get_reward_and_check

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sanity check: print the file the kernel actually imports `rl4co` from, so it
# is clear which installed copy of the package is in use.
import rl4co
print(rl4co.__file__)

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\rl4co\__init__.py


In [3]:
import sys
# Disabled: removal of a machine-specific rl4co checkout from the import path.
# sys.path.remove(r"c:\users\hyosi\onedrive\ut\2024 fall\mie1666\project\code\rl4evrptw\rl4co")

# Print every import-path entry, one per line, in lookup order.
for path in sys.path:
    print(path)

c:\Users\hyosi\anaconda3\envs\rl4co\python311.zip
c:\Users\hyosi\anaconda3\envs\rl4co\DLLs
c:\Users\hyosi\anaconda3\envs\rl4co\Lib
c:\Users\hyosi\anaconda3\envs\rl4co

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\win32
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\win32\lib
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\Pythonwin
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\setuptools\_vendor


In [6]:
# Device handle used by the `.to(device)` calls in this notebook (test batches
# and policies); hard-coded to CUDA, so the notebook expects a GPU.
device = torch.device("cuda")
def enforce_reproducibility(seed):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Also writes ``PYTHONHASHSEED`` and ``CUBLAS_WORKSPACE_CONFIG`` into the
    process environment and, when CUDA is available, seeds every CUDA device
    and puts cuDNN into deterministic, non-benchmark mode.

    Args:
        seed: Value passed to each seeding call; stored in ``PYTHONHASHSEED``
            as ``str(seed)``.

    Returns:
        None.
    """
    import os
    import random

    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes -- confirm.
    # NOTE(review): CUBLAS_WORKSPACE_CONFIG is the setting PyTorch asks for when
    # deterministic algorithms are enabled; this function never enables them,
    # so verify whether that was intended.
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
    })

    # Same seed for the stdlib, NumPy and PyTorch generators, in that order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Everything below is GPU-specific; nothing more to do on a CPU-only host.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
        
vehicle_capacity = 1.25


def _make_test_env(num_loc, num_station, vehicle_limit, num_instances=100, seed=0):
    """Create one EVRPTW environment together with a fixed batch of test instances.

    The random generators are re-seeded immediately before the environment is
    built, so each problem size is generated from the same seed. The order
    (seed -> construct environment -> reset) matches the four hand-written
    blocks this helper replaces.

    Args:
        num_loc: Value for the generator's ``num_loc`` parameter.
        num_station: Value for the generator's ``num_station`` parameter.
        vehicle_limit: Value for the generator's ``vehicle_limit`` parameter.
        num_instances: Batch size passed to ``env.reset``.
        seed: Seed handed to ``enforce_reproducibility``.

    Returns:
        ``(env, td)``: the environment and the result of
        ``env.reset(batch_size=[num_instances])`` moved to ``device``.
    """
    enforce_reproducibility(seed)
    env = EVRPTWEnv(generator_params={
        'num_loc': num_loc,
        'num_station': num_station,
        'vehicle_limit': vehicle_limit,
        'vehicle_speed': 5,
        'vehicle_capacity': vehicle_capacity,
        'max_time': 1,
        'horizon': 1,
        'fuel_consumption_rate': 0.25,
        'inverse_recharge_rate': 0.25,
    })
    return env, env.reset(batch_size=[num_instances]).to(device)


# Only num_loc, num_station and vehicle_limit differ between the problem sizes;
# every other generator parameter is shared.
env_10, td_10_TEST = _make_test_env(num_loc=10, num_station=3, vehicle_limit=3)
env_20, td_20_TEST = _make_test_env(num_loc=20, num_station=3, vehicle_limit=3)
env_50, td_50_TEST = _make_test_env(num_loc=50, num_station=6, vehicle_limit=6)
env_100, td_100_TEST = _make_test_env(num_loc=100, num_station=12, vehicle_limit=12)


In [5]:
MAX_EPOCH = 50
BATCH_SIZE = 512
TRAIN_DATA_SIZE = BATCH_SIZE * 200
VAL_DATA_SIZE = BATCH_SIZE * 50
# Smaller alternative settings (presumably for a quick trial run):
# MAX_EPOCH = 2
# BATCH_SIZE = 512
# TRAIN_DATA_SIZE = BATCH_SIZE * 50
# VAL_DATA_SIZE = BATCH_SIZE * 10


def _new_policy(env):
    """Return a new ``AttentionModelPolicy`` for ``env`` with the notebook's fixed architecture.

    Args:
        env: Environment whose ``name`` is passed as ``env_name``.

    Returns:
        A new policy with ``embed_dim=256``, ``num_encoder_layers=6`` and
        ``num_heads=8``.
    """
    return AttentionModelPolicy(env_name=env.name,
                                embed_dim=256,
                                num_encoder_layers=6,
                                num_heads=8)


def _new_model(model_cls, env, policy, **model_kwargs):
    """Instantiate ``model_cls`` with the training configuration shared by all runs.

    Args:
        model_cls: Model class to build (``POMO``, ``RewardConstrainedPOMO`` or
            ``REINFORCE`` in this notebook).
        env: Environment passed as the first positional argument.
        policy: Policy passed as the second positional argument.
        **model_kwargs: Extra keyword arguments forwarded unchanged, e.g.
            ``baseline="rollout"`` for ``REINFORCE``.

    Returns:
        The constructed model.
    """
    return model_cls(env,
                     policy,
                     batch_size=BATCH_SIZE,
                     train_data_size=TRAIN_DATA_SIZE,
                     val_data_size=VAL_DATA_SIZE,
                     # A new dict per call, so no two models share one object.
                     optimizer_kwargs={"lr": 1e-4,
                                       "weight_decay": 1e-6},
                     **model_kwargs)


# The instantiation order below is kept exactly as in the original cell
# (three policies, then their three models, per algorithm) so that weight
# initialisation consumes the random stream in the same sequence.

# POMO
policy1 = _new_policy(env_10)
policy2 = _new_policy(env_20)
policy5 = _new_policy(env_50)

model_10 = _new_model(POMO, env_10, policy1)
model_20 = _new_model(POMO, env_20, policy2)
model_50 = _new_model(POMO, env_50, policy5)


# RCPOMO
policy_c1 = _new_policy(env_10)
policy_c2 = _new_policy(env_20)
policy_c5 = _new_policy(env_50)

model_c10 = _new_model(RewardConstrainedPOMO, env_10, policy_c1)
model_c20 = _new_model(RewardConstrainedPOMO, env_20, policy_c2)
model_c50 = _new_model(RewardConstrainedPOMO, env_50, policy_c5)

# REINFORCE (the only family that is given an explicit baseline)
policy_r1 = _new_policy(env_10)
policy_r2 = _new_policy(env_20)
policy_r5 = _new_policy(env_50)

model_r10 = _new_model(REINFORCE, env_10, policy_r1, baseline="rollout")
model_r20 = _new_model(REINFORCE, env_20, policy_r2, baseline="rollout")
model_r50 = _new_model(REINFORCE, env_50, policy_r5, baseline="rollout")


c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'env' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['env'])`.
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'policy' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['policy'])`.


In [6]:
# Index-aligned evaluation suite: one test batch, one environment and one
# customer count per problem size.
td_scale_test = [td_10_TEST, td_20_TEST, td_50_TEST, td_100_TEST]
env_scale = [env_10, env_20, env_50, env_100]
scale = [10, 20, 50, 100]

# Keyword arguments that are identical for the three runs in this cell.
trainer_options = dict(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
)
reward_log_options = dict(
    test_data=td_scale_test,
    env_scale=env_scale,
    scale=scale,
    log_dir="logs",  # the logs folder needs to be set explicitly
)

# POMO
trainer_STEP1 = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy1.to(device),
            file_name="HARD_POMO_C10",
            **reward_log_options,
        )
    ],
    **trainer_options,
)
trainer_STEP1.fit(model_10)

# RCPOMO
trainer_C_STEP1 = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c1.to(device),
            file_name="HARD_RCPOMO_C10",
            **reward_log_options,
        )
    ],
    **trainer_options,
)
trainer_C_STEP1.fit(model_c10)

# REINFORCE
trainer_R_STEP1 = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_r1.to(device),
            file_name="HARD_REINFORCE_C10",
            **reward_log_options,
        )
    ],
    **trainer_options,
)
trainer_R_STEP1.fit(model_r10)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Sanity Checking:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


                                                                           

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 49: 100%|██████████| 200/200 [00:19<00:00, 10.07it/s, v_num=513, train/reward=-10.3, train/loss=0.866, val/reward=-4.99]    

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [00:19<00:00, 10.00it/s, v_num=513, train/reward=-10.3, train/loss=0.866, val/reward=-4.99]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 49: 100%|██████████| 200/200 [00:19<00:00, 10.26it/s, v_num=514, train/reward=-10.8, train/loss=0.0159, val/reward=-4.88]   

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [00:19<00:00, 10.19it/s, v_num=514, train/reward=-10.8, train/loss=0.0159, val/reward=-4.88]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | WarmupBaseline       | 3.6 M  | train
----------------------------------------------------------
7.1 M     Trainable params
0         Non-trainable params
7.1 M     Total params
28.482    Total estimated model params size (MB)
128       Modules in train mode
124       Modules in eval mode


Epoch 49: 100%|██████████| 200/200 [00:35<00:00,  5.59it/s, v_num=515, train/reward=-4.14, train/loss=0.0199, val/reward=-4.14]   

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [00:50<00:00,  4.00it/s, v_num=515, train/reward=-4.14, train/loss=0.0199, val/reward=-4.14]


In [7]:
def _print_scale_report(title, rewards, feasible_counts):
    """Print ``title`` followed by one summary line per entry of the global ``scale``.

    Args:
        title: Header line printed first.
        rewards: Indexable by position in ``scale``; each item must provide
            ``.mean()``. The negated mean is reported as the cost.
        feasible_counts: Indexable by position in ``scale``; printed as-is.
    """
    print(title)
    for idx, n_customers in enumerate(scale):
        print(f"Scale: {n_customers} | FeasibleCounts: {feasible_counts[idx]} | Mean Trained Test Cost: {-rewards[idx].mean():3f}")


# Evaluate the three policies paired with the env_10 models on every test scale.
policy1 = policy1.to(device)
rewards_trained, num_valid = get_reward_and_check(policy1, td_scale_test, env_scale)
_print_scale_report("POMO: Trained with Environment of C=10, S=3, EV=3",
                    rewards_trained, num_valid)

policy_c1 = policy_c1.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c1, td_scale_test, env_scale)
_print_scale_report("\nRCPOMO: Trained with Environment of C=10, S=3, EV=3",
                    rewards_c_trained, num_c_valid)

policy_r1 = policy_r1.to(device)
rewards_r_trained, num_r_valid = get_reward_and_check(policy_r1, td_scale_test, env_scale)
_print_scale_report("\nREINFORCE: Trained with Environment of C=10, S=3, EV=3",
                    rewards_r_trained, num_r_valid)

POMO: Trained with Environment of C=10, S=3, EV=3
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 4.856230
Scale: 20 | FeasibleCounts: 97 | Mean Trained Test Cost: 8.015352
Scale: 50 | FeasibleCounts: 95 | Mean Trained Test Cost: 16.581690
Scale: 100 | FeasibleCounts: 99 | Mean Trained Test Cost: 31.866270

RCPOMO: Trained with Environment of C=10, S=3, EV=3
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 4.668255
Scale: 20 | FeasibleCounts: 98 | Mean Trained Test Cost: 7.802932
Scale: 50 | FeasibleCounts: 93 | Mean Trained Test Cost: 16.237083
Scale: 100 | FeasibleCounts: 100 | Mean Trained Test Cost: 30.556526

REINFORCE: Trained with Environment of C=10, S=3, EV=3
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 4.255553
Scale: 20 | FeasibleCounts: 90 | Mean Trained Test Cost: 6.736508
Scale: 50 | FeasibleCounts: 79 | Mean Trained Test Cost: 13.739327
Scale: 100 | FeasibleCounts: 100 | Mean Trained Test Cost: 24.965090


In [8]:
# Unbind the step-1 trainers and evaluation results so they can be garbage
# collected, then ask PyTorch to release its unused cached GPU memory.
del trainer_STEP1, trainer_C_STEP1, trainer_R_STEP1
del rewards_trained, rewards_c_trained, rewards_r_trained, num_valid, num_c_valid, num_r_valid
torch.cuda.empty_cache()

In [9]:

# Keyword arguments that are identical for the three runs in this cell.
trainer_options = dict(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
)
reward_log_options = dict(
    test_data=td_scale_test,
    env_scale=env_scale,
    scale=scale,
    log_dir="logs",  # the logs folder needs to be set explicitly
)

# POMO
trainer_STEP2 = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy2.to(device),
            file_name="HARD_POMO_C20",
            **reward_log_options,
        )
    ],
    **trainer_options,
)
trainer_STEP2.fit(model_20)

# RCPOMO
trainer_C_STEP2 = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c2.to(device),
            file_name="HARD_RCPOMO_C20",
            **reward_log_options,
        )
    ],
    **trainer_options,
)
trainer_C_STEP2.fit(model_c20)

# REINFORCE
trainer_R_STEP2 = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_r2.to(device),
            file_name="HARD_REINFORCE_C20",
            **reward_log_options,
        )
    ],
    **trainer_options,
)
trainer_R_STEP2.fit(model_r20)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 49: 100%|██████████| 200/200 [00:35<00:00,  5.71it/s, v_num=516, train/reward=-39.6, train/loss=-20.9, val/reward=-8.20]

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [00:35<00:00,  5.69it/s, v_num=516, train/reward=-39.6, train/loss=-20.9, val/reward=-8.20]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 49: 100%|██████████| 200/200 [01:33<00:00,  2.13it/s, v_num=517, train/reward=-51.2, train/loss=-22.1, val/reward=-8.50] 

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [01:34<00:00,  2.13it/s, v_num=517, train/reward=-51.2, train/loss=-22.1, val/reward=-8.50]

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True





TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | WarmupBaseline       | 3.6 M  | train
----------------------------------------------------------
7.1 M     Trainable params
0         Non-trainable params
7.1 M     Total params
28.482    Total estimated model params size (MB)
128       Modules in train mode
124       Modules in eval mode


Epoch 49: 100%|██████████| 200/200 [01:10<00:00,  2.84it/s, v_num=518, train/reward=-12.2, train/loss=16.00, val/reward=-8.18]  

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [01:26<00:00,  2.32it/s, v_num=518, train/reward=-12.2, train/loss=16.00, val/reward=-8.18]


In [10]:
# Evaluate the three policies paired with the env_20 models (20 customers,
# 3 stations, vehicle limit 3) on every test scale and print, per scale, the
# feasible count and the negated mean reward.
policy2 = policy2.to(device)
rewards_trained, num_valid = get_reward_and_check(policy2, td_scale_test, env_scale)
print("POMO: Trained with Environment of C=20, S=3, EV=3")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_valid[i]} | Mean Trained Test Cost: {-rewards_trained[i].mean():3f}")

policy_c2 = policy_c2.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c2, td_scale_test, env_scale)
print("\nRCPOMO: Trained with Environment of C=20, S=3, EV=3")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_c_valid[i]} | Mean Trained Test Cost: {-rewards_c_trained[i].mean():3f}")

policy_r2 = policy_r2.to(device)
rewards_r_trained, num_r_valid = get_reward_and_check(policy_r2, td_scale_test, env_scale)
# Label corrected: policy_r2 is the policy of model_r20, which is built on
# env_20, so this header must read C=20 (it was copy-pasted as C=10).
print("\nREINFORCE: Trained with Environment of C=20, S=3, EV=3")
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_r_valid[i]} | Mean Trained Test Cost: {-rewards_r_trained[i].mean():3f}")

POMO: Trained with Environment of C=20, S=3, EV=3
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 5.261318
Scale: 20 | FeasibleCounts: 100 | Mean Trained Test Cost: 8.114063
Scale: 50 | FeasibleCounts: 99 | Mean Trained Test Cost: 16.827742
Scale: 100 | FeasibleCounts: 99 | Mean Trained Test Cost: 33.060474

RCPOMO: Trained with Environment of C=20, S=3, EV=3
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 5.336650
Scale: 20 | FeasibleCounts: 99 | Mean Trained Test Cost: 8.125189
Scale: 50 | FeasibleCounts: 100 | Mean Trained Test Cost: 16.944166
Scale: 100 | FeasibleCounts: 100 | Mean Trained Test Cost: 33.477730

REINFORCE: Trained with Environment of C=10, S=3, EV=3
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 5.012807
Scale: 20 | FeasibleCounts: 99 | Mean Trained Test Cost: 8.085536
Scale: 50 | FeasibleCounts: 100 | Mean Trained Test Cost: 16.917492
Scale: 100 | FeasibleCounts: 100 | Mean Trained Test Cost: 31.081825


In [11]:
# Unbind the step-2 trainers and evaluation results so they can be garbage
# collected, then ask PyTorch to release its unused cached GPU memory.
del trainer_STEP2, trainer_C_STEP2, trainer_R_STEP2
del rewards_trained, rewards_c_trained, rewards_r_trained, num_valid, num_c_valid, num_r_valid
torch.cuda.empty_cache()

In [12]:

# Keyword arguments that are identical for the three runs in this cell.
trainer_options = dict(
    max_epochs=MAX_EPOCH,
    accelerator="gpu",
    devices=1,
    logger=None,
)
reward_log_options = dict(
    test_data=td_scale_test,
    env_scale=env_scale,
    scale=scale,
    log_dir="logs",  # the logs folder needs to be set explicitly
)

# POMO
trainer_STEP5 = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy5.to(device),
            file_name="HARD_POMO_C50",
            **reward_log_options,
        )
    ],
    **trainer_options,
)
trainer_STEP5.fit(model_50)

# RCPOMO
trainer_C_STEP5 = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_c5.to(device),
            file_name="HARD_RCPOMO_C50",
            **reward_log_options,
        )
    ],
    **trainer_options,
)
trainer_C_STEP5.fit(model_c50)

# REINFORCE
trainer_R_STEP5 = RL4COTrainer(
    callbacks=[
        RewardLoggingCallback(
            policy=policy_r5.to(device),
            file_name="HARD_REINFORCE_C50",
            **reward_log_options,
        )
    ],
    **trainer_options,
)
trainer_R_STEP5.fit(model_r50)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 49: 100%|██████████| 200/200 [01:31<00:00,  2.18it/s, v_num=519, train/reward=-20.8, train/loss=-6.02, val/reward=-15.4] 

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [01:31<00:00,  2.18it/s, v_num=519, train/reward=-20.8, train/loss=-6.02, val/reward=-15.4]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 49: 100%|██████████| 200/200 [01:33<00:00,  2.14it/s, v_num=520, train/reward=-22.8, train/loss=-6.94, val/reward=-15.3] 

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [01:33<00:00,  2.14it/s, v_num=520, train/reward=-22.8, train/loss=-6.94, val/reward=-15.3]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | WarmupBaseline       | 3.6 M  | train
----------------------------------------------------------
7.1 M     Trainable params
0         Non-trainable params
7.1 M     Total params
28.482    Total estimated model params size (MB)
128       Modules in train mode
124       Modules in eval mode


Epoch 49: 100%|██████████| 200/200 [00:55<00:00,  3.59it/s, v_num=521, train/reward=-21.3, train/loss=-145., val/reward=-17.0]   

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [01:04<00:00,  3.11it/s, v_num=521, train/reward=-21.3, train/loss=-145., val/reward=-17.0]


In [13]:
def _print_scale_report(title, rewards, feasible_counts):
    """Print ``title`` followed by one summary line per entry of the global ``scale``.

    Args:
        title: Header line printed first.
        rewards: Indexable by position in ``scale``; each item must provide
            ``.mean()``. The negated mean is reported as the cost.
        feasible_counts: Indexable by position in ``scale``; printed as-is.
    """
    print(title)
    for idx, n_customers in enumerate(scale):
        print(f"Scale: {n_customers} | FeasibleCounts: {feasible_counts[idx]} | Mean Trained Test Cost: {-rewards[idx].mean():3f}")


# Evaluate the three policies paired with the env_50 models on every test scale.
policy5 = policy5.to(device)
rewards_trained, num_valid = get_reward_and_check(policy5, td_scale_test, env_scale)
_print_scale_report("POMO: Trained with Environment of C=50, S=6, EV=6",
                    rewards_trained, num_valid)

policy_c5 = policy_c5.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c5, td_scale_test, env_scale)
_print_scale_report("\nRCPOMO: Trained with Environment of C=50, S=6, EV=6",
                    rewards_c_trained, num_c_valid)

policy_r5 = policy_r5.to(device)
rewards_r_trained, num_r_valid = get_reward_and_check(policy_r5, td_scale_test, env_scale)
_print_scale_report("\nREINFORCE: Trained with Environment of C=50, S=6, EV=6",
                    rewards_r_trained, num_r_valid)

POMO: Trained with Environment of C=50, S=6, EV=6
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 5.788095
Scale: 20 | FeasibleCounts: 100 | Mean Trained Test Cost: 8.282652
Scale: 50 | FeasibleCounts: 100 | Mean Trained Test Cost: 15.128916
Scale: 100 | FeasibleCounts: 100 | Mean Trained Test Cost: 26.151745

RCPOMO: Trained with Environment of C=50, S=6, EV=6
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 5.650996
Scale: 20 | FeasibleCounts: 99 | Mean Trained Test Cost: 7.790233
Scale: 50 | FeasibleCounts: 100 | Mean Trained Test Cost: 15.168052
Scale: 100 | FeasibleCounts: 100 | Mean Trained Test Cost: 26.645964

REINFORCE: Trained with Environment of C=50, S=6, EV=6
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 5.091000
Scale: 20 | FeasibleCounts: 97 | Mean Trained Test Cost: 8.217063
Scale: 50 | FeasibleCounts: 100 | Mean Trained Test Cost: 16.950056
Scale: 100 | FeasibleCounts: 100 | Mean Trained Test Cost: 30.425636


In [10]:
# Index-aligned evaluation suite: test batch, environment and customer count
# for each problem size. These are the same three assignments that open the
# first training cell; they are repeated here, presumably so this part of the
# notebook can be run without re-executing that cell -- confirm.
td_scale_test = [td_10_TEST, td_20_TEST, td_50_TEST, td_100_TEST]
env_scale = [env_10, env_20, env_50, env_100]
scale = [10, 20, 50, 100]

In [11]:
MAX_EPOCH = 50
BATCH_SIZE_100 = 128
TRAIN_DATA_SIZE_100 = BATCH_SIZE_100 * 200
VAL_DATA_SIZE_100 = BATCH_SIZE_100 * 50

# Architecture arguments shared by the three size-100 policies.
policy_kwargs_100 = dict(
    env_name=env_100.name,
    embed_dim=256,
    num_encoder_layers=6,
    num_heads=8,
)


def _model_kwargs_100():
    """Return new keyword arguments shared by the three size-100 models.

    A new dict (with its own ``optimizer_kwargs``) is built on every call so
    the models never share one mutable object.
    """
    return dict(
        batch_size=BATCH_SIZE_100,
        train_data_size=TRAIN_DATA_SIZE_100,
        val_data_size=VAL_DATA_SIZE_100,
        optimizer_kwargs={"lr": 1e-4, "weight_decay": 1e-6},
    )


# Each policy is created immediately before its model, as in the original cell.

# POMO
policy100 = AttentionModelPolicy(**policy_kwargs_100)
model_100 = POMO(env_100, policy100, **_model_kwargs_100())

# RCPOMO
policy_c100 = AttentionModelPolicy(**policy_kwargs_100)
model_c100 = RewardConstrainedPOMO(env_100, policy_c100, **_model_kwargs_100())

# REINFORCE (the only one that is given an explicit baseline)
policy_r100 = AttentionModelPolicy(**policy_kwargs_100)
model_r100 = REINFORCE(env_100, policy_r100, baseline="rollout", **_model_kwargs_100())

In [12]:
def _make_trainer_100(policy, file_name):
    """Build an RL4COTrainer whose only callback logs rewards for ``policy``.

    Parameters
    ----------
    policy
        Policy handed to the RewardLoggingCallback; it is moved to ``device``
        first.
    file_name : str
        ``file_name`` argument forwarded to the RewardLoggingCallback.

    Returns
    -------
    RL4COTrainer
        Trainer configured for ``MAX_EPOCH`` epochs on a single GPU. The caller
        is responsible for invoking ``fit``.
    """
    reward_logger = RewardLoggingCallback(
        policy=policy.to(device),
        test_data=td_scale_test,
        env_scale=env_scale,
        scale=scale,
        # NOTE(review): the original note says the logs folder must be set
        # explicitly; what happens when it is omitted is not shown here.
        log_dir="logs",
        file_name=file_name,
    )
    return RL4COTrainer(
        max_epochs=MAX_EPOCH,
        accelerator="gpu",
        devices=1,
        logger=None,
        callbacks=[reward_logger],
    )


# Each trainer is created and bound to its name before training starts, and
# the three runs execute one after another in this order.

# POMO
trainer_STEP100 = _make_trainer_100(policy100, "HARD_POMO_C100")
trainer_STEP100.fit(model_100)

# RCPOMO
trainer_C_STEP100 = _make_trainer_100(policy_c100, "HARD_RCPOMO_C100")
trainer_C_STEP100.fit(model_c100)

# REINFORCE
trainer_R_STEP100 = _make_trainer_100(policy_r100, "HARD_REINFORCE_C100")
trainer_R_STEP100.fit(model_r100)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


                                                                           

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 49: 100%|██████████| 200/200 [02:18<00:00,  1.44it/s, v_num=526, train/reward=-21.2, train/loss=-1.02, val/reward=-20.8]   

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [02:18<00:00,  1.44it/s, v_num=526, train/reward=-21.2, train/loss=-1.02, val/reward=-20.8]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 49: 100%|██████████| 200/200 [02:19<00:00,  1.44it/s, v_num=527, train/reward=-21.6, train/loss=-1.14, val/reward=-21.0]   

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [02:19<00:00,  1.44it/s, v_num=527, train/reward=-21.6, train/loss=-1.14, val/reward=-21.0]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | WarmupBaseline       | 3.6 M  | train
----------------------------------------------------------
7.1 M     Trainable params
0         Non-trainable params
7.1 M     Total params
28.482    Total estimated model params size (MB)
128       Modules in train mode
124       Modules in eval mode


Epoch 49: 100%|██████████| 200/200 [02:11<00:00,  1.52it/s, v_num=528, train/reward=-25.1, train/loss=-80.5, val/reward=-23.4]     

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 200/200 [02:26<00:00,  1.36it/s, v_num=528, train/reward=-25.1, train/loss=-80.5, val/reward=-23.4]


In [13]:
def _report_generalization(header, policy):
    """Evaluate ``policy`` on every test scale and print a per-scale summary.

    Parameters
    ----------
    header : str
        Line printed before the per-scale results.
    policy
        Trained policy; it is moved to ``device`` before evaluation.

    Returns
    -------
    tuple
        ``(policy, rewards, feasible_counts)``: the policy as returned by
        ``.to(device)`` followed by the two values returned by
        ``get_reward_and_check``, each indexed by position in ``scale``.
    """
    policy = policy.to(device)
    rewards, feasible_counts = get_reward_and_check(policy, td_scale_test, env_scale)
    print(header)
    for idx, size in enumerate(scale):
        # Reported cost is the negated mean reward for this scale.
        mean_cost = -rewards[idx].mean()
        # ".6f" states the precision explicitly; a bare "3f" would only set a
        # minimum field width and silently fall back to six decimals.
        print(f"Scale: {size} | FeasibleCounts: {feasible_counts[idx]} | Mean Trained Test Cost: {mean_cost:.6f}")
    return policy, rewards, feasible_counts


# Shared description of the training environment for the three runs below.
_TRAINED_ON_100 = "Trained with Environment of C=100, S=12, EV=12"

# The result names are kept at module level so other cells can still use them.
policy100, rewards_trained, num_valid = _report_generalization(
    f"POMO: {_TRAINED_ON_100}", policy100
)
policy_c100, rewards_c_trained, num_c_valid = _report_generalization(
    f"\nRCPOMO: {_TRAINED_ON_100}", policy_c100
)
policy_r100, rewards_r_trained, num_r_valid = _report_generalization(
    f"\nREINFORCE: {_TRAINED_ON_100}", policy_r100
)

POMO: Trained with Environment of C=100, S=12, EV=12
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 4.868367
Scale: 20 | FeasibleCounts: 70 | Mean Trained Test Cost: 6.940489
Scale: 50 | FeasibleCounts: 94 | Mean Trained Test Cost: 12.453379
Scale: 100 | FeasibleCounts: 100 | Mean Trained Test Cost: 20.401237

RCPOMO: Trained with Environment of C=100, S=12, EV=12
Scale: 10 | FeasibleCounts: 99 | Mean Trained Test Cost: 4.809740
Scale: 20 | FeasibleCounts: 77 | Mean Trained Test Cost: 6.937341
Scale: 50 | FeasibleCounts: 94 | Mean Trained Test Cost: 12.568416
Scale: 100 | FeasibleCounts: 100 | Mean Trained Test Cost: 20.684378

REINFORCE: Trained with Environment of C=100, S=12, EV=12
Scale: 10 | FeasibleCounts: 95 | Mean Trained Test Cost: 4.912994
Scale: 20 | FeasibleCounts: 50 | Mean Trained Test Cost: 7.151945
Scale: 50 | FeasibleCounts: 66 | Mean Trained Test Cost: 13.672751
Scale: 100 | FeasibleCounts: 99 | Mean Trained Test Cost: 23.472595
