In [1]:
# Enable the autoreload extension; mode 2 re-imports modules before each cell
# executes, so edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Install the requirements listed in requirements.txt (quietly). %pip targets
# the environment of the running kernel.
%pip -q install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import random

import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import optim

from agent import MyAgent
from models import DRQNetwork, MixingNetwork
from transferlearning import train

In [9]:
# --- Reproducibility ---------------------------------------------------------
# The run below is stochastic (the agent starts with epsilon=1.0, presumably
# epsilon-greedy exploration), so seed every RNG reachable from this notebook
# before any network or agent is built. Without this, the rewards/losses shown
# in the output cannot be reproduced on Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)  # seeds the CPU generator and all CUDA devices

# --- Device and checkpoint locations -----------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cloned_policy_path = "./cloned_policy"
pretrained_mixer_path = "./pretrained_mixer"

# NOTE(review): these two paths are defined but not used anywhere in this
# cell; presumably a later cell saves the fine-tuned weights here -- confirm.
fine_tuned_policy_path = "./fine_tuned_policy"
fine_tuned_mixer_path = "./fine_tuned_mixer"

# --- Load the starting networks ----------------------------------------------
# Load the policy checkpoint, move it to the selected device and switch it to
# training mode so it can be fine-tuned.
cloned_policy_net = DRQNetwork.load(cloned_policy_path).to(device)
cloned_policy_net.train()
# Re-compact the GRU weights after the device move (a no-op unless the module
# is on the GPU with cuDNN enabled).
# NOTE(review): the recorded output still shows a warning raised from the GRU
# forward call; possibly a copy of this network made inside MyAgent -- confirm.
cloned_policy_net.gru.flatten_parameters()

# Same treatment for the mixing network checkpoint.
pretrained_mixer = MixingNetwork.load(pretrained_mixer_path).to(device)
pretrained_mixer.train()

# --- Build the agent around the loaded networks ------------------------------
# NOTE(review): `lr=1e-4` is handed to MyAgent, but the optimizer is reassigned
# right after construction with lr=1e-5, so this value presumably does not
# affect the AdamW optimizer used for training -- confirm MyAgent does not use
# it elsewhere (e.g. a scheduler bound to its own optimizer).
my_agent = MyAgent(
    num_agents=4,
    device=device,
    policy_net=cloned_policy_net,
    mixing_net=pretrained_mixer,
    buffer_size=2000,
    batch_sequence_length=20,
    batch_size=64,
    lr=1e-4,
    gamma=0.99,
    epsilon=1.0,
    epsilon_end=0.01,
    epsilon_decay=0.9995,
    target_update_freq=1,
    tau=1e-3,
    gradient_clipping_value=1,
)

# Use a single AdamW optimizer over the agent's policy and mixer parameters,
# with a smaller learning rate (1e-5) than the one passed to the constructor.
my_agent.optimizer = optim.AdamW(
    list(my_agent.policy_net.parameters()) + list(my_agent.mixer.parameters()),
    lr=1e-5,
    weight_decay=1e-5,
    eps=1e-5,
)

# --- Fine-tune ---------------------------------------------------------------
# train() returns the agent together with the collected rewards and losses.
trained_agent, all_rewards, all_losses = train(my_agent, num_episodes=100)

Using QMIX agent with mixer network
DRQNetwork(
  (feature_layer): Sequential(
    (0): Linear(in_features=42, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
  )
  (gru): GRU(512, 512, num_layers=2, batch_first=True)
  (output_layer): Linear(in_features=512, out_features=7, bias=True)
)
MixingNetwork(
  (hyper_w1): Sequential(
    (0): Linear(in_features=949, out_features=512, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=512, out_features=2048, bias=True)
  )
  (hyper_w2): Sequential(
    (0): Linear(in_features=949, out_features=512, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=512, out_features=512, bias=True)
  )
  (hyper_b1): Linear(in_features=949, out_features=512, bias=True)
  (hyper_b2): Sequential(
    (0): Linear(in_features=949, out_features=512, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=512, out_features=1, bias=True)
  )
)
Episode 1, Step 57, Reward: -54005.

  result = _VF.gru(input, hx, self._flat_weights, self.bias, self.num_layers,


Episode 65, Step 14, Reward: -34000.88, Evacuated: 0, Deactivated: 4, MEAN TD LOSS: 2.89e+01
Episode 66, Step 122, Reward: -112977.44, Evacuated: 3, Deactivated: 1, MEAN TD LOSS: 1.99e+01
Episode 67, Step 58, Reward: -84003.42, Evacuated: 0, Deactivated: 4, MEAN TD LOSS: 1.70e+01
Episode 68, Step 24, Reward: -56001.26, Evacuated: 0, Deactivated: 4, MEAN TD LOSS: 1.59e+01
Episode 69, Step 500, Reward: -719056.45, Evacuated: 0, Deactivated: 2, MEAN TD LOSS: 1.31e+01
Episode 70, Step 15, Reward: -20001.41, Evacuated: 0, Deactivated: 4, MEAN TD LOSS: 1.22e+01
Episode 71, Step 12, Reward: -29000.21, Evacuated: 0, Deactivated: 4, MEAN TD LOSS: 1.05e+01
Episode 72, Step 18, Reward: -23001.94, Evacuated: 0, Deactivated: 4, MEAN TD LOSS: 1.07e+01
Episode 73, Step 500, Reward: -1051046.08, Evacuated: 0, Deactivated: 3, MEAN TD LOSS: 1.04e+01
Episode 74, Step 500, Reward: -478068.32, Evacuated: 0, Deactivated: 1, MEAN TD LOSS: 9.24e+00
Episode 75, Step 11, Reward: -26000.40, Evacuated: 0, Deactiv