In [1]:
%load_ext autoreload
%autoreload 2

import torch

from rl4co.envs import CVRPTWEnv, EVRPTWEnv 
from rl4co.models import AttentionModelPolicy, REINFORCE, SymNCO, PPO, POMO, RewardConstrainedPOMO
from rl4co.utils.trainer import RL4COTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Environment shared by every later cell. The keys below are forwarded as-is
# through `generator_params`; see EVRPTWEnv for their exact meaning and units.
env = EVRPTWEnv(
    generator_params=dict(
        num_loc=10,
        num_station=3,
        vehicle_limit=3,
        vehicle_speed=5,
        max_time=0.9,
        horizon=1.2,
        fuel_consumption_rate=0.2,
        inverse_recharge_rate=0.2,
    )
)



In [3]:
def make_policy():
    """Return a fresh AttentionModelPolicy for `env`.

    Policy: neural network, in this case with encoder-decoder architecture.
    Both experiments call this helper so their architectures cannot drift apart.
    """
    return AttentionModelPolicy(
        env_name=env.name,
        embed_dim=128,
        num_encoder_layers=3,
        num_heads=8,
    )


def make_model(algorithm, policy):
    """Wrap `policy` in the RL `algorithm` class using the shared training settings.

    Args:
        algorithm: RL model class to instantiate (e.g. POMO, RewardConstrainedPOMO).
        policy: policy network the algorithm will train.

    Returns:
        The constructed model, bound to the notebook-level `env`.
    """
    return algorithm(
        env,
        policy,
        batch_size=512,
        train_data_size=100_000,
        val_data_size=10_000,
        # A new dict per call, so the two models never share a mutable argument.
        optimizer_kwargs={"lr": 1e-4},
    )


# Two independently initialised policies, one per training algorithm.
policy = make_policy()
policy_constrained = make_policy()

# RL models under comparison. Alternatives previously configured in this cell:
# REINFORCE (baseline="rollout", train_data_size=500_000, val_data_size=100_000)
# and PPO (same settings as below).
model = make_model(POMO, policy)
model_constrained = make_model(RewardConstrainedPOMO, policy_constrained)

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'env' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['env'])`.
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'policy' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['policy'])`.


In [4]:
# Greedy rollouts over untrained policy
# Use the GPU when present, otherwise fall back to CPU so the cell still runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fixed batch of 100 instances; the trained-policy cell reuses `td_init` so
# before/after costs are computed on the same problems.
td_init = env.reset(batch_size=[100]).to(device)

policy = policy.to(device)
policy_constrained = policy_constrained.to(device)

# Evaluation only: disable autograd bookkeeping for the forward passes.
with torch.no_grad():
    out = policy(td_init.clone(), env=env, phase="test", decode_type="greedy", return_actions=True)
    out_constrained = policy_constrained(td_init.clone(), env=env, phase="test", decode_type="greedy", return_actions=True)

actions_untrained = out['actions'].cpu().numpy()
rewards_untrained = out['reward'].cpu().numpy()
actions_untrained_constrained = out_constrained['actions'].cpu().numpy()
rewards_untrained_constrained = out_constrained['reward'].cpu().numpy()

for i in range(td_init.batch_size[0]):
    # Cost is the negated reward; `.3f` = three decimals (the former `3f` only set a width).
    print(f"Problem {i+1} | Cost: {-rewards_untrained[i]:.3f} | Cost_constrained: {-rewards_untrained_constrained[i]:.3f}")
    # env.render(td_init[i], actions_untrained[i])
    print(actions_untrained[i])
    print(actions_untrained_constrained[i])
print(f"Untrained Mean Cost {-rewards_untrained.mean():.3f} | Untrained Mean Cost_constrained {-rewards_untrained_constrained.mean():.3f}")

Problem 1 | Cost: 33.895203 | Cost_constrained: 34.013275
[13  8  5  6  1  0 13  9  3 10  0 13  7  4  0 13  0 13  0 13  0 13  0 13
  0 13  0 13  0 13  0 13  0 13  0 13]
[13  4  8  5  1  6  0 13  7  9  3  0 13 10  0 13  0 13  0 13  0 13  0 13
  0 13  0 13  0 13  0 13  0 13  0 13]
Problem 2 | Cost: 26.729649 | Cost_constrained: 29.204018
[11  5  2 10  6  0 11  7  1  0 11  3  0 11  8  0 11  0 11  0 11  0 11  0
 11  0 11  0 11  0 11  0 11  0 11  0]
[11  3  2  7  1  0 13  9 10  5  6  0 13  8  0 13  0 13  0 13  0 13  0 13
  0 13  0 13  0 13  0 13  0 13  0 13]
Problem 3 | Cost: 14.812714 | Cost_constrained: 38.253456
[ 3  6  9  8  1 10  0  5  2  0  7  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0]
[12  1  5 10  0 13  2  9  8  6  0 13  3  0 13  0 13  0 13  0 13  0 13  0
 13  0 13  0 13  0 13  0 13  0 13  0]
Problem 4 | Cost: 7.159279 | Cost_constrained: 5.437902
[11  8  9  5  2  0 11  3  7  0 11  6  1  4  0 10  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0 

In [5]:
# One trainer per model, built back to back with identical settings
# (3 epochs, single GPU, no logger passed).
trainer, trainer_constrained = (
    RL4COTrainer(max_epochs=3, accelerator="gpu", devices=1, logger=None)
    for _ in range(2)
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [6]:
# Train the unconstrained model with its own trainer.
trainer.fit(model=model)

val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 695 K  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
695 K     Trainable params
0         Non-trainable params
695 K     Total params
2.781     Total estimated model params size (MB)
72        Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


                                                                           

c:\Users\hyosi\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 2: 100%|██████████| 196/196 [00:35<00:00,  5.60it/s, v_num=9, train/reward=-5.73, train/loss=-0.0854, val/reward=-5.67] 

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 196/196 [00:35<00:00,  5.59it/s, v_num=9, train/reward=-5.73, train/loss=-0.0854, val/reward=-5.67]


In [7]:
# Train the reward-constrained model with its own trainer.
trainer_constrained.fit(model=model_constrained)

val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                 | Params | Mode 
----------------------------------------------------------
0 | env      | EVRPTWEnv            | 0      | train
1 | policy   | AttentionModelPolicy | 695 K  | train
2 | baseline | SharedBaseline       | 0      | train
----------------------------------------------------------
695 K     Trainable params
0         Non-trainable params
695 K     Total params
2.781     Total estimated model params size (MB)
72        Modules in train mode
0         Modules in eval mode


Epoch 2: 100%|██████████| 196/196 [00:56<00:00,  3.45it/s, v_num=10, train/reward=-5.44, train/loss=-0.0473, val/reward=-5.65]

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 196/196 [00:56<00:00,  3.45it/s, v_num=10, train/reward=-5.44, train/loss=-0.0473, val/reward=-5.65]


In [8]:
# Greedy rollouts over trained model (same states as previous plot)
import matplotlib.pyplot as plt  # only used if the plotting lines below are re-enabled

# Re-fetch the policies from the trained models and make sure they sit on `device`.
policy = model.policy.to(device)
policy_constrained = model_constrained.policy.to(device)

# Pass the configured `env` explicitly, exactly as in the untrained rollout, so
# both evaluations run against the same environment settings.
with torch.no_grad():
    out = policy(td_init.clone(), env=env, phase="test", decode_type="greedy", return_actions=True)
    out_constrained = policy_constrained(td_init.clone(), env=env, phase="test", decode_type="greedy", return_actions=True)

actions_trained = out['actions'].detach().cpu()
rewards_trained = out['reward'].detach().cpu()
actions_trained_constrained = out_constrained['actions'].detach().cpu()
rewards_trained_constrained = out_constrained['reward'].detach().cpu()

# Per-problem comparison; cost is the negated reward.
for i, td in enumerate(td_init):
    # fig, axs = plt.subplots(1,2, figsize=(11,5))
    print(f"Problem {i+1} | Untrained Cost: {-rewards_untrained[i]:.3f} | Trained Cost: {-rewards_trained[i]:.3f}")
    # Adjacent f-strings instead of a backslash continuation, which leaked
    # the source indentation into the printed line.
    print(
        f"Problem {i+1} | Untrained_constrained Cost: {-rewards_untrained_constrained[i]:.3f} | "
        f"Trained_constrained Cost: {-rewards_trained_constrained[i]:.3f}"
    )

    # env.render(td, actions_untrained[i], ax=axs[0])
    # env.render(td, actions_trained[i], ax=axs[1])
    # axs[0].set_title(f"Untrained | Cost = {-rewards_untrained[i].item():.3f}")
    # axs[1].set_title(r"Trained $\pi_\theta$" + f"| Cost = {-out['reward'][i].item():.3f}")
print(f'Untrained Mean Cost {-rewards_untrained.mean():.3f} | Trained Mean Cost  {-rewards_trained.mean():.3f}')
print(f'Untrained_constrained Mean Cost {-rewards_untrained_constrained.mean():.3f} | Trained_constrained Mean Cost  {-rewards_trained_constrained.mean():.3f}')

Problem 1 | Untrained Cost: 33.895 | Trained Cost: 4.693
Problem 1 | Untrained_constrained Cost: 34.013 |           Trained Cost: 4.693
Problem 2 | Untrained Cost: 26.730 | Trained Cost: 3.705
Problem 2 | Untrained_constrained Cost: 29.204 |           Trained Cost: 3.705
Problem 3 | Untrained Cost: 14.813 | Trained Cost: 13.699
Problem 3 | Untrained_constrained Cost: 38.253 |           Trained Cost: 13.562
Problem 4 | Untrained Cost: 7.159 | Trained Cost: 4.371
Problem 4 | Untrained_constrained Cost: 5.438 |           Trained Cost: 4.962
Problem 5 | Untrained Cost: 7.437 | Trained Cost: 5.031
Problem 5 | Untrained_constrained Cost: 6.465 |           Trained Cost: 5.031
Problem 6 | Untrained Cost: 7.171 | Trained Cost: 3.954
Problem 6 | Untrained_constrained Cost: 35.305 |           Trained Cost: 3.954
Problem 7 | Untrained Cost: 16.627 | Trained Cost: 14.708
Problem 7 | Untrained_constrained Cost: 30.231 |           Trained Cost: 14.708
Problem 8 | Untrained Cost: 15.763 | Trained Cost

In [9]:
# Display the greedy action sequences chosen by the trained (unconstrained) policy.
actions_trained

tensor([[ 2, 10,  3,  ...,  0,  0,  0],
        [ 4,  3,  7,  ...,  0,  0,  0],
        [ 7,  9,  6,  ...,  0,  0,  0],
        ...,
        [ 5, 10,  8,  ...,  0,  0,  0],
        [ 9,  5,  7,  ...,  0,  0,  0],
        [ 6,  5,  9,  ...,  0,  0,  0]])