In [1]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import torch

from rl4co.envs import CVRPTWEnv, EVRPTWEnv
from rl4co.models import AttentionModelPolicy, REINFORCE
from rl4co.utils.trainer import RL4COTrainer

In [2]:
# EVRPTW environment; all instance settings are handed over as `generator_params`.
env = EVRPTWEnv(
    generator_params=dict(
        num_loc=5,
        num_station=1,
        vehicle_limit=3,
        vehicle_capacity=2000,
        max_time=480,
        horizon=20000,
        capacity=2000,
        max_fuel=1000,
        scale=True,
    )
)

In [7]:
# Policy network: attention-model policy (encoder-decoder) built for this env's name
policy = AttentionModelPolicy(
    env_name=env.name,
    embed_dim=64,
    num_encoder_layers=3,
    num_heads=4,
)

# RL algorithm: REINFORCE wrapped around the env and policy, with a "rollout" baseline
model = REINFORCE(
    env,
    policy,
    baseline="rollout",
    batch_size=64,
    train_data_size=10_000,
    val_data_size=1_000,
    optimizer_kwargs={"lr": 1e-4},
)

In [15]:
# Greedy rollouts over the untrained policy.
# The rollout is pinned to CPU; to use a GPU when one is available, switch to
# torch.device("cuda" if torch.cuda.is_available() else "cpu").
device = torch.device("cpu")
td_init = env.reset(batch_size=[1]).to(device)
policy = policy.to(device)

# Roll out on a clone so `td_init` itself cannot be altered by the rollout; the
# same initial states are reused later for the trained-policy comparison, which
# also passes a clone.
out = policy(td_init.clone(), env=env, phase="test", decode_type="greedy", return_actions=True)

actions_untrained = out['actions'].cpu().detach()
rewards_untrained = out['reward'].cpu().detach()

# Report one line per returned reward instead of hard-coding the batch size.
# Cost is the negated reward.
for i, reward in enumerate(rewards_untrained):
    print(f"Problem {i+1} | Cost: {-reward:.3f}")
    # Rendering is left disabled here: env.render(td_init[i], actions_untrained[i])

Actions are:  tensor([[1, 4, 2, 5, 0, 3, 0]])
To be implemented.
Problem 1 | Cost: 0.029


In [9]:
# Trainer for a short 3-epoch run on a single device, without a logger.
# accelerator="auto" selects the GPU when one is present and falls back to CPU
# otherwise, so the cell no longer fails on machines without CUDA.
# NOTE(review): assumes RL4COTrainer forwards `accelerator` to the Lightning
# Trainer unchanged — confirm against the RL4CO trainer implementation.
trainer = RL4COTrainer(
    max_epochs=3,
    accelerator="auto",
    devices=1,
    logger=None,
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
# Train the REINFORCE model with the trainer configured above.
# NOTE(review): the recorded run of this cell aborted with
# "AssertionError: Logits contain NaNs"; the cause is not visible in this
# notebook — TODO investigate before relying on the cells that follow.
trainer.fit(model)

val_file not set. Generating dataset instead
test_file not set. Generating dataset instead


Attention Mask:  torch.Size([64, 7])
Attention Mask:  torch.Size([64, 7])
Attention Mask:  torch.Size([64, 7])


AssertionError: Logits contain NaNs

In [None]:
# Greedy rollouts over the trained model, on the same initial states as the
# untrained rollout.
policy = model.policy.to(device)

# Pass the configured env explicitly, exactly as the untrained rollout does, so
# both rollouts are evaluated against the same environment instance.
out = policy(td_init.clone(), env=env, phase="test", decode_type="greedy", return_actions=True)
actions_trained = out['actions'].cpu().detach()
rewards_trained = out['reward'].cpu().detach()

# Plotting: one figure per instance, untrained (left) vs. trained (right).
# Cost is the negated reward; both titles read from the detached reward tensors.
for i, td in enumerate(td_init):
    fig, axs = plt.subplots(1, 2, figsize=(11, 5))
    env.render(td, actions_untrained[i], ax=axs[0])
    env.render(td, actions_trained[i], ax=axs[1])
    axs[0].set_title(f"Untrained | Cost = {-rewards_untrained[i].item():.3f}")
    axs[1].set_title(r"Trained $\pi_\theta$" + f"| Cost = {-rewards_trained[i].item():.3f}")

In [None]:
# Display the action sequences returned by the trained policy's greedy rollout.
actions_trained