In [1]:
%load_ext autoreload
%autoreload 2

import torch
import numpy as np

from rl4co.envs import CVRPTWEnv, EVRPTWEnv 
from rl4co.models import AttentionModelPolicy, REINFORCE, SymNCO, PPO, POMO, RewardConstrainedPOMO
from rl4co.utils.trainer import RL4COTrainer
from rl4co.utils.callbacks.reward_check import RewardLoggingCallback, get_reward_and_check

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sanity check: print the file location of the rl4co package this kernel imports.
import rl4co
print(rl4co.__file__)

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\rl4co\__init__.py


In [3]:
import sys

# Show every import search path, one entry per line.
# If a local checkout shadows the installed rl4co package, remove its entry
# from sys.path (sys.path.remove(...)) before importing.
print("".join(f"{entry}\n" for entry in sys.path), end="")

c:\Users\hyosi\anaconda3\envs\rl4co\python311.zip
c:\Users\hyosi\anaconda3\envs\rl4co\DLLs
c:\Users\hyosi\anaconda3\envs\rl4co\Lib
c:\Users\hyosi\anaconda3\envs\rl4co

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\win32
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\win32\lib
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\Pythonwin
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\setuptools\_vendor


In [4]:
# Device that the test batches and policies are moved to; hard-coded to CUDA.
device = torch.device("cuda")
def enforce_reproducibility(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Seed value handed to every random number generator; it is also
            exported (as a string) through the ``PYTHONHASHSEED`` variable.
    """
    import os
    import random

    os.environ.update(
        {
            "PYTHONHASHSEED": str(seed),
            # Workspace setting for cuBLAS (NVIDIA's CUDA Basic Linear
            # Algebra Subroutines library).
            "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
        }
    )

    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing GPU-specific to configure when CUDA is unavailable.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
        
vehicle_capacity = 1.25
ENV_SEED = 0            # seed re-applied before every environment is created
TEST_BATCH_SIZE = 100   # number of test instances generated per setting

# [num_loc, num_station, num_ev]
settings = [[10, 3, 3], [20, 3, 3], [50, 6, 6], [100, 12, 12]]


def _evrptw_generator_params(num_loc, num_station, num_ev):
    """Return the generator parameters shared by the hard and soft environments.

    Args:
        num_loc: Value passed as ``num_loc``.
        num_station: Value passed as ``num_station``.
        num_ev: Value passed as ``vehicle_limit``.

    Returns:
        A fresh dict suitable for ``EVRPTWEnv(generator_params=...)``.
    """
    return {
        'num_loc': num_loc,
        'num_station': num_station,
        'vehicle_limit': num_ev,
        'vehicle_speed': 5,
        'vehicle_capacity': vehicle_capacity,
        'max_time': 1,
        'horizon': 1,
        'fuel_consumption_rate': 0.25,
        'inverse_recharge_rate': 0.25,
    }


# Hard environments plus one fixed test batch per setting.
hard_envs = []
td_tests = []   # Hard env setting for test (cf. get_action_mask() is different)
for num_loc, num_station, num_ev in settings:
    # Re-seed before each environment so every setting starts from the same RNG state.
    enforce_reproducibility(ENV_SEED)
    env = EVRPTWEnv(generator_params=_evrptw_generator_params(num_loc, num_station, num_ev))
    hard_envs.append(env)
    td_init = env.reset(batch_size=[TEST_BATCH_SIZE]).to(device)
    td_tests.append(td_init)

# Soft environments: same generator parameters, with the `soft` flag switched on.
soft_envs = []
for num_loc, num_station, num_ev in settings:
    enforce_reproducibility(ENV_SEED)
    env = EVRPTWEnv(generator_params=_evrptw_generator_params(num_loc, num_station, num_ev))
    env.soft = True ## Soft setting
    soft_envs.append(env)




In [5]:
MAX_EPOCH = 25
BATCH_SIZE = 512
TRAIN_DATA_SIZE = BATCH_SIZE * 200
VAL_DATA_SIZE = BATCH_SIZE * 50
# Quick smoke-test configuration (uncomment to override the values above):
# MAX_EPOCH = 2
# BATCH_SIZE = 10
# TRAIN_DATA_SIZE = BATCH_SIZE * 1
# VAL_DATA_SIZE = BATCH_SIZE * 1


def _build_policy(env):
    """Create an AttentionModelPolicy for ``env`` with the architecture used by every run.

    Args:
        env: Environment whose ``name`` selects the policy's environment.

    Returns:
        A new ``AttentionModelPolicy`` (embed_dim=256, 6 encoder layers, 8 heads).
    """
    return AttentionModelPolicy(
        env_name=env.name,
        embed_dim=256,
        num_encoder_layers=6,
        num_heads=8,
    )


def _build_model(model_cls, env, policy):
    """Instantiate ``model_cls`` on ``env`` and ``policy`` with the shared training settings.

    Args:
        model_cls: Model class to instantiate (``RewardConstrainedPOMO`` or ``POMO``).
        env: Environment the model trains on.
        policy: Policy optimised by the model.

    Returns:
        The constructed model, using the module-level batch/data sizes and a
        fresh optimizer-kwargs dict for every call.
    """
    return model_cls(
        env,
        policy,
        # baseline="rollout",
        batch_size=BATCH_SIZE,
        train_data_size=TRAIN_DATA_SIZE,
        val_data_size=VAL_DATA_SIZE,
        optimizer_kwargs={"lr": 1e-4, "weight_decay": 1e-6},
    )


# One policy per training scale (C=10, C=20, C=50). Each policy is shared by the
# soft and the hard model of that scale.
policy_c1 = _build_policy(soft_envs[0])
policy_c2 = _build_policy(soft_envs[1])
policy_c5 = _build_policy(soft_envs[2])

# RCPOMO (soft environments)
soft_model_c10 = _build_model(RewardConstrainedPOMO, soft_envs[0], policy_c1)
soft_model_c20 = _build_model(RewardConstrainedPOMO, soft_envs[1], policy_c2)
soft_model_c50 = _build_model(RewardConstrainedPOMO, soft_envs[2], policy_c5)

# POMO (hard environments)
hard_model_c10 = _build_model(POMO, hard_envs[0], policy_c1)
hard_model_c20 = _build_model(POMO, hard_envs[1], policy_c2)
hard_model_c50 = _build_model(POMO, hard_envs[2], policy_c5)


c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'env' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['env'])`.
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'policy' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['policy'])`.


In [6]:
scale = [10, 20, 50, 100]

# Keyword arguments common to both training stages of this cell.
trainer_options = dict(max_epochs=MAX_EPOCH, accelerator="gpu", devices=1, logger=None)
reward_log_options = dict(
    test_data=td_tests,
    env_scale=hard_envs,
    scale=scale,
    log_dir="logs",  # the logs folder must be specified
)

# RCPOMO: train the C=10 policy with the soft model.
soft_reward_logger = RewardLoggingCallback(
    policy=policy_c1.to(device),
    file_name="25_SOFT_RCPOMO_C10",
    **reward_log_options,
)
soft_trainer_C_STEP = RL4COTrainer(callbacks=[soft_reward_logger], **trainer_options)
soft_trainer_C_STEP.fit(soft_model_c10)

# POMO: continue with the hard model, which uses the same policy object.
hard_reward_logger = RewardLoggingCallback(
    policy=policy_c1.to(device),
    file_name="25_HARD_POMO_C10",
    **reward_log_options,
)
hard_trainer_C_STEP = RL4COTrainer(callbacks=[hard_reward_logger], **trainer_options)
hard_trainer_C_STEP.fit(hard_model_c10)


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Sanity Checking:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


                                                                           

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 3: 100%|██████████| 200/200 [00:51<00:00,  3.86it/s, v_num=569, train/reward=-4.10, train/loss=-1.50, val/reward=-3.76]  

  epoch_data[f"C{s}_mean_reward"] = -rewards_trained[i].mean()
  ret = ret.dtype.type(ret / rcount)


Epoch 24: 100%|██████████| 200/200 [00:22<00:00,  8.97it/s, v_num=569, train/reward=-3.58, train/loss=-0.141, val/reward=-3.56] 

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [00:22<00:00,  8.92it/s, v_num=569, train/reward=-3.58, train/loss=-0.141, val/reward=-3.56]

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs





val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 200/200 [00:20<00:00,  9.68it/s, v_num=570, train/reward=-6.67, train/loss=-0.294, val/reward=-5.04]  

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [00:20<00:00,  9.63it/s, v_num=570, train/reward=-6.67, train/loss=-0.294, val/reward=-5.04]


In [7]:
# Put the policy on the evaluation device, then score it on the fixed test batches.
policy_c1 = policy_c1.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c1, td_tests, hard_envs)
print("\n25Soft_RCPOMO + 25Hard_POMO: Trained with Environment of C=10, S=3, EV=3")
# The mean reward is negated so the printed value reads as a cost. `.3f` prints
# three decimals; the former `3f` spec only set a minimum field width of 3.
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_c_valid[i]} | Mean Trained Test Cost: {-rewards_c_trained[i].mean():.3f}")


25Soft_RCPOMO + 25Hard_POMO: Trained with Environment of C=10, S=3, EV=3
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 4.858532
Scale: 20 | FeasibleCounts: 98 | Mean Trained Test Cost: 7.898353
Scale: 50 | FeasibleCounts: 98 | Mean Trained Test Cost: 16.292616
Scale: 100 | FeasibleCounts: 98 | Mean Trained Test Cost: 31.698280


In [8]:
# Drop the trainers and the evaluation results, then release cached CUDA memory.
del soft_trainer_C_STEP, hard_trainer_C_STEP, rewards_c_trained, num_c_valid
torch.cuda.empty_cache()

In [9]:

# Keyword arguments common to both training stages of this cell.
trainer_options = dict(max_epochs=MAX_EPOCH, accelerator="gpu", devices=1, logger=None)
reward_log_options = dict(
    test_data=td_tests,
    env_scale=hard_envs,
    scale=scale,
    log_dir="logs",  # the logs folder must be specified
)

# RCPOMO: train the C=20 policy with the soft model.
soft_reward_logger = RewardLoggingCallback(
    policy=policy_c2.to(device),
    file_name="25_SOFT_RCPOMO_C20",
    **reward_log_options,
)
soft_trainer_C_STEP = RL4COTrainer(callbacks=[soft_reward_logger], **trainer_options)
soft_trainer_C_STEP.fit(soft_model_c20)

# POMO: continue with the hard model, which uses the same policy object.
hard_reward_logger = RewardLoggingCallback(
    policy=policy_c2.to(device),
    file_name="25_HARD_POMO_C20",
    **reward_log_options,
)
hard_trainer_C_STEP = RL4COTrainer(callbacks=[hard_reward_logger], **trainer_options)
hard_trainer_C_STEP.fit(hard_model_c20)


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 200/200 [00:39<00:00,  5.02it/s, v_num=571, train/reward=-5.54, train/loss=-0.461, val/reward=-5.46]

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [00:39<00:00,  5.00it/s, v_num=571, train/reward=-5.54, train/loss=-0.461, val/reward=-5.46]

Using 16bit Automatic Mixed Precision (AMP)





GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 200/200 [00:35<00:00,  5.61it/s, v_num=572, train/reward=-51.5, train/loss=-4.11, val/reward=-7.91]    

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [00:35<00:00,  5.59it/s, v_num=572, train/reward=-51.5, train/loss=-4.11, val/reward=-7.91]


In [10]:
# Put the policy on the evaluation device, then score it on the fixed test batches.
policy_c2 = policy_c2.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c2, td_tests, hard_envs)
print("\n25Soft_RCPOMO + 25Hard_POMO: Trained with Environment of C=20, S=3, EV=3")
# The mean reward is negated so the printed value reads as a cost. `.3f` prints
# three decimals; the former `3f` spec only set a minimum field width of 3.
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_c_valid[i]} | Mean Trained Test Cost: {-rewards_c_trained[i].mean():.3f}")


25Soft_RCPOMO + 25Hard_POMO: Trained with Environment of C=20, S=3, EV=3
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 4.752318
Scale: 20 | FeasibleCounts: 99 | Mean Trained Test Cost: 7.654899
Scale: 50 | FeasibleCounts: 100 | Mean Trained Test Cost: 15.985867
Scale: 100 | FeasibleCounts: 98 | Mean Trained Test Cost: 31.536602


In [11]:
# Drop the trainers and the evaluation results, then release cached CUDA memory.
del soft_trainer_C_STEP, hard_trainer_C_STEP, rewards_c_trained, num_c_valid
torch.cuda.empty_cache()

In [12]:

# Keyword arguments common to both training stages of this cell.
trainer_options = dict(max_epochs=MAX_EPOCH, accelerator="gpu", devices=1, logger=None)
reward_log_options = dict(
    test_data=td_tests,
    env_scale=hard_envs,
    scale=scale,
    log_dir="logs",  # the logs folder must be specified
)

# RCPOMO: train the C=50 policy with the soft model.
soft_reward_logger = RewardLoggingCallback(
    policy=policy_c5.to(device),
    file_name="25_SOFT_RCPOMO_C50",
    **reward_log_options,
)
soft_trainer_C_STEP = RL4COTrainer(callbacks=[soft_reward_logger], **trainer_options)
soft_trainer_C_STEP.fit(soft_model_c50)

# POMO: continue with the hard model, which uses the same policy object.
hard_reward_logger = RewardLoggingCallback(
    policy=policy_c5.to(device),
    file_name="25_HARD_POMO_C50",
    **reward_log_options,
)
hard_trainer_C_STEP = RL4COTrainer(callbacks=[hard_reward_logger], **trainer_options)
hard_trainer_C_STEP.fit(hard_model_c50)


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 200/200 [01:55<00:00,  1.74it/s, v_num=573, train/reward=-10.8, train/loss=-1.25, val/reward=-10.6]     

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [01:55<00:00,  1.73it/s, v_num=573, train/reward=-10.8, train/loss=-1.25, val/reward=-10.6]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 200/200 [01:34<00:00,  2.11it/s, v_num=574, train/reward=-22.5, train/loss=-4.41, val/reward=-15.2]

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [01:34<00:00,  2.11it/s, v_num=574, train/reward=-22.5, train/loss=-4.41, val/reward=-15.2]


In [13]:
# Put the policy on the evaluation device, then score it on the fixed test batches.
policy_c5 = policy_c5.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c5, td_tests, hard_envs)
print("\n25Soft_RCPOMO + 25Hard_POMO: Trained with Environment of C=50, S=6, EV=6")
# The mean reward is negated so the printed value reads as a cost. `.3f` prints
# three decimals; the former `3f` spec only set a minimum field width of 3.
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_c_valid[i]} | Mean Trained Test Cost: {-rewards_c_trained[i].mean():.3f}")


25Soft_RCPOMO + 25Hard_POMO: Trained with Environment of C=50, S=6, EV=6
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 5.177136
Scale: 20 | FeasibleCounts: 99 | Mean Trained Test Cost: 7.793137
Scale: 50 | FeasibleCounts: 100 | Mean Trained Test Cost: 15.010887
Scale: 100 | FeasibleCounts: 100 | Mean Trained Test Cost: 26.946321


In [14]:
# Drop the trainers and the evaluation results, then release cached CUDA memory.
del soft_trainer_C_STEP, hard_trainer_C_STEP, rewards_c_trained, num_c_valid
torch.cuda.empty_cache()

In [15]:
MAX_EPOCH = 25
BATCH_SIZE_100 = 128
TRAIN_DATA_SIZE_100 = 200 * BATCH_SIZE_100
VAL_DATA_SIZE_100 = 50 * BATCH_SIZE_100

# Batch and dataset sizes shared by the two C=100 models.
c100_data_options = dict(
    batch_size=BATCH_SIZE_100,
    train_data_size=TRAIN_DATA_SIZE_100,
    val_data_size=VAL_DATA_SIZE_100,
)

# A single policy is used by both the soft and the hard C=100 model.
policy_c100 = AttentionModelPolicy(
    env_name=soft_envs[3].name,
    embed_dim=256,
    num_encoder_layers=6,
    num_heads=8,
)

# RCPOMO
soft_model_c100 = RewardConstrainedPOMO(
    soft_envs[3],
    policy_c100,
    # baseline="rollout",
    optimizer_kwargs={"lr": 1e-4, "weight_decay": 1e-6},
    **c100_data_options,
)

# POMO
hard_model_c100 = POMO(
    hard_envs[3],
    policy_c100,
    # baseline="rollout",
    optimizer_kwargs={"lr": 1e-4, "weight_decay": 1e-6},
    **c100_data_options,
)

In [16]:

# Keyword arguments common to both training stages of this cell.
trainer_options = dict(max_epochs=MAX_EPOCH, accelerator="gpu", devices=1, logger=None)
reward_log_options = dict(
    test_data=td_tests,
    env_scale=hard_envs,
    scale=scale,
    log_dir="logs",  # the logs folder must be specified
)

# RCPOMO: train the C=100 policy with the soft model.
soft_reward_logger = RewardLoggingCallback(
    policy=policy_c100.to(device),
    file_name="25_SOFT_RCPOMO_C100",
    **reward_log_options,
)
soft_trainer_C_STEP = RL4COTrainer(callbacks=[soft_reward_logger], **trainer_options)
soft_trainer_C_STEP.fit(soft_model_c100)

# POMO: continue with the hard model, which uses the same policy object.
hard_reward_logger = RewardLoggingCallback(
    policy=policy_c100.to(device),
    file_name="25_HARD_POMO_C100",
    **reward_log_options,
)
hard_trainer_C_STEP = RL4COTrainer(callbacks=[hard_reward_logger], **trainer_options)
hard_trainer_C_STEP.fit(hard_model_c100)


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 200/200 [03:13<00:00,  1.03it/s, v_num=575, train/reward=-19.5, train/loss=-2.86, val/reward=-18.9] 

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [03:14<00:00,  1.03it/s, v_num=575, train/reward=-19.5, train/loss=-2.86, val/reward=-18.9]


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 3.6 M  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.241    Total estimated model params size (MB)
126       Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 200/200 [02:34<00:00,  1.30it/s, v_num=576, train/reward=-22.3, train/loss=-2.44, val/reward=-21.5]   

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 200/200 [02:34<00:00,  1.29it/s, v_num=576, train/reward=-22.3, train/loss=-2.44, val/reward=-21.5]


In [17]:
# Put the policy on the evaluation device, then score it on the fixed test batches.
policy_c100 = policy_c100.to(device)
rewards_c_trained, num_c_valid = get_reward_and_check(policy_c100, td_tests, hard_envs)
print("\n25Soft_RCPOMO + 25Hard_POMO: Trained with Environment of C=100, S=12, EV=12")
# The mean reward is negated so the printed value reads as a cost. `.3f` prints
# three decimals; the former `3f` spec only set a minimum field width of 3.
for i, s in enumerate(scale):
    print(f"Scale: {s} | FeasibleCounts: {num_c_valid[i]} | Mean Trained Test Cost: {-rewards_c_trained[i].mean():.3f}")


25Soft_RCPOMO + 25Hard_POMO: Trained with Environment of C=100, S=12, EV=12
Scale: 10 | FeasibleCounts: 100 | Mean Trained Test Cost: 4.955643
Scale: 20 | FeasibleCounts: 79 | Mean Trained Test Cost: 7.069251
Scale: 50 | FeasibleCounts: 92 | Mean Trained Test Cost: 13.122625
Scale: 100 | FeasibleCounts: 100 | Mean Trained Test Cost: 21.177683
