# DQL agent running on the Active Directory sample environment

In [2]:
from cyberbattle.simulation.model import *
import logging, sys, gym
sys.path.append("..")
import cyberbattle.agents.baseline.learner as learner
import cyberbattle.agents.baseline.agent_wrapper as w
import cyberbattle.agents.baseline.agent_dql as dqla
from cyberbattle.agents.baseline.agent_wrapper import ActionTrackingStateAugmentation, AgentWrapper, Verbosity
logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format="%(levelname)s: %(message)s")
import numpy
from cyberbattle.simulation import model
from copy import deepcopy
from temporary_seed import temporary_seed
import random

In [3]:
ngyms = 9
gymids = [f"ActiveDirectory-v{i}" for i in range(0, ngyms)]
iteration_count = 1000

In [4]:
envs = [gym.make(gymid) for gymid in gymids]
map(lambda g : g.seed(1), envs)
ep = w.EnvironmentBounds.of_identifiers(
    maximum_node_count=30,
    maximum_total_credentials=50,
    identifiers=envs[0].identifiers
)


In [5]:
# Evaluate the Deep Q-learning agent for each env using transfer learning
l = dqla.DeepQLearnerPolicy(
    ep=ep,
    gamma=0.015,
    replay_memory_size=10000,
    target_update=5,
    batch_size=512,
    learning_rate=0.01  # torch default learning rate is 1e-2
)
for (i, env) in enumerate(envs):
    epsilon = (10 - i) / 10
    # at least 1 runs and max 10 for the 10 envs
    training_episode_count = 1 + (9 - i)
    dqn_learning_run = learner.epsilon_greedy_search(
        cyberbattle_gym_env=env,
        environment_properties=ep,
        learner=l,
        episode_count=training_episode_count,
        iteration_count=iteration_count,
        epsilon=epsilon,
        epsilon_exponential_decay=50000,
        epsilon_minimum=0.1,
        verbosity=Verbosity.Quiet,
        render=False,
        plot_episodes_length=False,
        title=f"DQL {i}"
    )
    l = dqn_learning_run["learner"]


###### DQL 0
Learning with: episode_count=10,iteration_count=1000,ϵ=1.0,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/10 'DQL 0' ϵ=1.0000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   11.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 27|reward:   17.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 27|reward:   17.0|last_reward_at:   27|Elapsed Time: 0:00:00||
Episode 1|Iteration 35|reward:   28.0|last_reward_at:   27|Elapsed Time: 0:00:00||
Episode 1|Iteration 35|reward:   28.0|last_reward_at:   35|Elapsed Time: 0:00:00||
Episode 1|Iteration 36|reward:   61.0|last_reward_at:   35|Elapsed Time: 0:00:00||
Episode 1|Iteration 36|reward:   61.0|last_reward_at:   36|Elapsed Time: 0:00:00||
Episode 1|Iteration 49|reward:   66.0|last_reward_at:   36|Elapsed Time: 0:00:00||
Episode 1|Iteration 49|reward:   66.0|last_reward_at:   49|Elapsed Time: 0:00:00||
Episode 

  Episode 1 stopped at t=1000 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 27/220 (0.11)
    explore-remote: 0/384 (0.00)
    explore-connect: 8/349 (0.02)
    exploit-local: 1/10 (0.09)
    exploit-remote: 0/1 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 1
  ## Episode: 2/10 'DQL 0' ϵ=0.9822, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   38.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   38.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   44.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   44.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 2|Iteration 13|reward:   55.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 2|Iteration 13|reward:   55.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 2|Iteration 24|reward:   61.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 2|Iteration 24|reward:   61.0|last_reward_at:   24|Elapsed Time: 0:00:00||
Episode 2|Iteration 34|reward:   66.0|last_reward_at:   24|Elapsed Time: 0:00:00||
Episode 2|Iteration 34|reward:   66.0|last_reward_at:   34|Elapsed Time: 0:00:00||
Episode 2|

  Episode 2 stopped at t=1000 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 21/247 (0.08)
    explore-remote: 0/383 (0.00)
    explore-connect: 6/320 (0.02)
    exploit-local: 4/19 (0.17)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 3/10 'DQL 0' ϵ=0.9647, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   17.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   17.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   55.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   55.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 10|reward:   61.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 10|reward:   61.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 3|Iteration 26|reward:   61.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 3|Iteration 38|reward:   61.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 3|It

  Episode 3 stopped at t=1000 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 18/220 (0.08)
    explore-remote: 0/371 (0.00)
    explore-connect: 4/339 (0.01)
    exploit-local: 6/33 (0.15)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/7 (0.22)
  exploit deflected to exploration: 0
  ## Episode: 4/10 'DQL 0' ϵ=0.9476, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   38.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   38.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   44.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   44.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 25|reward:   44.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 31|reward:   50.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 31|reward:   50.0|last_reward_at:   31|Elapsed Time: 0:00:00||
Episode 4|Iteration 36|reward:   61.0|last_reward_at:   31|Elapsed Time: 0:00:00||
Episode 4|Iteration 36|reward:   61.0|last_reward_at:   36|Elapsed Time: 0:00:00||
Episode 4|Iteration 48|reward:   62.0|last_reward_at:   36|Elapsed Time: 0:00:00||
Episode 4|

  Episode 4 ended at t=751 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 17/165 (0.09)
    explore-remote: 0/250 (0.00)
    explore-connect: 7/265 (0.03)
    exploit-local: 15/25 (0.38)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/5 (0.29)
  exploit deflected to exploration: 4
  ## Episode: 5/10 'DQL 0' ϵ=0.9350, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:   38.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:   38.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 9|reward:   44.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 9|reward:   44.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   45.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   45.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|Iteration 15|reward:   51.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|Iteration 15|reward:   51.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 5|Iteration 16|reward:   56.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 5|Iteration 16|reward:   56.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 5|

  Episode 5 stopped at t=1000 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 17/238 (0.07)
    explore-remote: 0/334 (0.00)
    explore-connect: 6/342 (0.02)
    exploit-local: 13/38 (0.25)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/11 (0.08)
  exploit deflected to exploration: 5
  ## Episode: 6/10 'DQL 0' ϵ=0.9184, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 2|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 2|reward:   11.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 5|reward:   17.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 5|reward:   17.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 6|Iteration 21|reward:   55.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 6|Iteration 21|reward:   55.0|last_reward_at:   21|Elapsed Time: 0:00:00||
Episode 6|Iteration 36|reward:   61.0|last_reward_at:   21|Elapsed Time: 0:00:00||
Episode 6|Iteration 36|reward:   61.0|last_reward_at:   36|Elapsed Time: 0:00:00||
Episode 6|Iteration 51|reward:   61.0|last_reward_at:   36|Elapsed Time: 0:00:00||
Episode 6|Iteration 64|reward:   61.0|last_reward_at:   36|Elapsed Time: 0:00:00||
Episode 6|

  Episode 6 stopped at t=1000 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 11/239 (0.04)
    explore-remote: 0/353 (0.00)
    explore-connect: 5/311 (0.02)
    exploit-local: 17/42 (0.29)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/20 (0.09)
  exploit deflected to exploration: 2
  ## Episode: 7/10 'DQL 0' ϵ=0.9022, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 3|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 3|reward:   11.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 7|Iteration 13|reward:   11.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 7|Iteration 15|reward:   17.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 7|Iteration 15|reward:   17.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 7|Iteration 18|reward:   28.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 7|Iteration 18|reward:   28.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 7|Iteration 31|reward:   61.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 7|Iteration 31|reward:   61.0|last_reward_at:   31|Elapsed Time: 0:00:00||
Episode 7|Iteration 38|reward:   62.0|last_reward_at:   31|Elapsed Time: 0:00:00||
Episode 

  Episode 7 stopped at t=1000 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/234 (0.04)
    explore-remote: 0/315 (0.00)
    explore-connect: 3/342 (0.01)
    exploit-local: 19/40 (0.32)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/33 (0.11)
  exploit deflected to exploration: 6
  ## Episode: 8/10 'DQL 0' ϵ=0.8863, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   38.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   38.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 6|reward:   49.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 6|reward:   49.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 8|Iteration 10|reward:   50.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 8|Iteration 10|reward:   50.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 8|Iteration 12|reward:   56.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 8|Iteration 12|reward:   56.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 8|Iteration 13|reward:   62.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 8|Iteration 13|reward:   62.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 8|

  Episode 8 stopped at t=1000 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 11/209 (0.05)
    explore-remote: 0/333 (0.00)
    explore-connect: 3/320 (0.01)
    exploit-local: 16/49 (0.25)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/55 (0.07)
  exploit deflected to exploration: 10
  ## Episode: 9/10 'DQL 0' ϵ=0.8708, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:   11.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 9|Iteration 3|reward:   22.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 9|Iteration 3|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 9|Iteration 11|reward:   28.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 9|Iteration 11|reward:   28.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 9|Iteration 13|reward:   61.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 9|Iteration 13|reward:   61.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 9|Iteration 18|reward:   62.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 9|Iteration 18|reward:   62.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 9|Iteration 21|reward:   68.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 9

  Episode 9 ended at t=419 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 11/88 (0.11)
    explore-remote: 0/144 (0.00)
    explore-connect: 2/106 (0.02)
    exploit-local: 21/26 (0.45)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 7/14 (0.33)
  exploit deflected to exploration: 4
  ## Episode: 10/10 'DQL 0' ϵ=0.8643, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 10|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 7|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 7|reward:   11.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 10|Iteration 9|reward:   22.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 10|Iteration 9|reward:   22.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 10|Iteration 12|reward:   28.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 10|Iteration 12|reward:   28.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 10|Iteration 15|reward:   61.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 10|Iteration 15|reward:   61.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 10|Iteration 26|reward:   61.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 10|Iteration 32|reward:   62.0|last_reward_at:   15|Elapsed Time: 0:00:00|

  Episode 10 ended at t=416 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/78 (0.13)
    explore-remote: 0/135 (0.00)
    explore-connect: 3/128 (0.02)
    exploit-local: 20/24 (0.45)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/10 (0.38)
  exploit deflected to exploration: 1
simulation ended
###### DQL 1
Learning with: episode_count=9,iteration_count=1000,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/9 'DQL 1' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   30.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   30.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 6|reward:   35.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 6|reward:   35.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:   41.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:   41.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 1|Iteration 14|reward:   42.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 1|Iteration 14|reward:   42.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 1|Iteration 23|reward:   53.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 1|Iteration 23|reward:   53.0|last_reward_at:   23|Elapsed Time: 0:00:00||
Episode 1|It

  Episode 1 ended at t=749 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 14/168 (0.08)
    explore-remote: 0/242 (0.00)
    explore-connect: 6/252 (0.02)
    exploit-local: 9/20 (0.31)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/37 (0.03)
  exploit deflected to exploration: 4
  ## Episode: 2/9 'DQL 1' ϵ=0.8881, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   11.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   17.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   17.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   28.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   28.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iteration 10|reward:   53.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iteration 10|reward:   53.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 2|Iteration 26|reward:   53.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 2|Iteration 33|reward:   54.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 2|Iteration 33|reward:   54.0|last_reward_at:   33|Elapsed Time: 0:00:00||
Episode 2|I

  Episode 2 ended at t=619 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/139 (0.08)
    explore-remote: 0/194 (0.00)
    explore-connect: 3/199 (0.01)
    exploit-local: 15/17 (0.47)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/36 (0.10)
  exploit deflected to exploration: 1
  ## Episode: 3/9 'DQL 1' ϵ=0.8784, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   11.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   41.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   41.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 18|reward:   47.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 18|reward:   47.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 3|Iteration 35|reward:   47.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 3|Iteration 38|reward:   52.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 3|Iteration 38|reward:   52.0|last_reward_at:   38|Elapsed Time: 0:00:00||
Episode 3|Iteration 40|reward:   58.0|last_reward_at:   38|Elapsed Time: 0:00:00||
Episode 3|

  Episode 3 ended at t=841 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/168 (0.07)
    explore-remote: 0/270 (0.00)
    explore-connect: 2/271 (0.01)
    exploit-local: 13/17 (0.43)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/83 (0.06)
  exploit deflected to exploration: 2
  ## Episode: 4/9 'DQL 1' ϵ=0.8654, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   30.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   30.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   31.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   31.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 4|Iteration 10|reward:   37.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 4|Iteration 10|reward:   37.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 4|Iteration 15|reward:   43.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 4|Iteration 15|reward:   43.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 4|Iteration 19|reward:   49.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 4|Iteration 19|reward:   49.0|last_reward_at:   19|Elapsed Time: 0:00:00||
Episode 4|

  Episode 4 ended at t=613 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/133 (0.06)
    explore-remote: 0/208 (0.00)
    explore-connect: 4/183 (0.02)
    exploit-local: 17/13 (0.57)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/43 (0.07)
  exploit deflected to exploration: 3
  ## Episode: 5/9 'DQL 1' ϵ=0.8561, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 11|reward:   36.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 11|reward:   36.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 5|Iteration 15|reward:   47.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 5|Iteration 15|reward:   47.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 5|Iteration 16|reward:   53.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 5|Iteration 16|reward:   53.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 5|Iteration 25|reward:   54.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 5|Iteration 25|reward:   54.0|last_reward_at:   25|Elapsed Time: 0:00:00||
Episode 5|Iteration 28|reward:   60.0|last_reward_at:   25|Elapsed Time: 0:00:00||
Episode

  Episode 5 ended at t=476 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 13/110 (0.11)
    explore-remote: 0/147 (0.00)
    explore-connect: 2/144 (0.01)
    exploit-local: 14/9 (0.61)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/32 (0.14)
  exploit deflected to exploration: 0
  ## Episode: 6/9 'DQL 1' ϵ=0.8489, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:   17.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:   17.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 7|reward:   47.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 7|reward:   47.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 6|Iteration 18|reward:   48.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 6|Iteration 18|reward:   48.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 6|Iteration 19|reward:   54.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 6|Iteration 19|reward:   54.0|last_reward_at:   19|Elapsed Time: 0:00:00||
Episode 6|Iteration 20|reward:   60.0|last_reward_at:   19|Elapsed Time: 0:00:00||
Episode 6|I

  Episode 6 ended at t=496 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/105 (0.08)
    explore-remote: 0/173 (0.00)
    explore-connect: 3/135 (0.02)
    exploit-local: 18/9 (0.67)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/40 (0.09)
  exploit deflected to exploration: 0
  ## Episode: 7/9 'DQL 1' ϵ=0.8416, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 3|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 3|reward:    6.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   36.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   36.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 5|reward:   47.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 5|reward:   47.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   48.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   48.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 9|reward:   53.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 9|reward:   53.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 7|Iterat

  Episode 7 ended at t=595 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 11/127 (0.08)
    explore-remote: 0/161 (0.00)
    explore-connect: 5/203 (0.02)
    exploit-local: 15/21 (0.42)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/50 (0.04)
  exploit deflected to exploration: 1
  ## Episode: 8/9 'DQL 1' ϵ=0.8328, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:   30.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:   30.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 4|reward:   36.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 4|reward:   36.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 8|Iteration 6|reward:   37.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 8|Iteration 6|reward:   37.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 8|Iteration 7|reward:   51.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 8|Iteration 7|reward:   51.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 8|Iteration 10|reward:   57.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 8|Iteration 10|reward:   57.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 8|Iteration 11|reward:   63.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 8|Ite

  Episode 8 ended at t=175 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/25 (0.22)
    explore-remote: 0/51 (0.00)
    explore-connect: 1/56 (0.02)
    exploit-local: 17/6 (0.74)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/6 (0.50)
  exploit deflected to exploration: 0
  ## Episode: 9/9 'DQL 1' ϵ=0.8302, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   30.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 3|reward:   36.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 3|reward:   36.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 9|Iteration 10|reward:   37.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 9|Iteration 10|reward:   37.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 9|Iteration 11|reward:   43.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 9|Iteration 11|reward:   43.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 9|Iteration 13|reward:   54.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 9|Iteration 13|reward:   54.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 9|

  Episode 9 ended at t=398 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/83 (0.08)
    explore-remote: 0/119 (0.00)
    explore-connect: 3/120 (0.02)
    exploit-local: 18/11 (0.62)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/33 (0.11)
  exploit deflected to exploration: 0
simulation ended
###### DQL 2
Learning with: episode_count=8,iteration_count=1000,ϵ=0.8,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/8 'DQL 2' ϵ=0.8000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   33.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   33.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   39.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   39.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 13|reward:   45.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 13|reward:   45.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 1|Iteration 22|reward:   56.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 1|Iteration 22|reward:   56.0|last_reward_at:   22|Elapsed Time: 0:00:00||
Episode 1|Iteration 31|reward:   57.0|last_reward_at:   22|Elapsed Time: 0:00:00||
Episode 1|Iteration 31|reward:   57.0|last_reward_at:   31|Elapsed Time: 0:00:00||
Episode 1|

  Episode 1 ended at t=198 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/44 (0.17)
    explore-remote: 0/64 (0.00)
    explore-connect: 1/51 (0.02)
    exploit-local: 14/3 (0.82)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/7 (0.42)
  exploit deflected to exploration: 1
  ## Episode: 2/8 'DQL 2' ϵ=0.7972, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   33.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   33.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   44.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   44.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   45.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   45.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   51.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   51.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   56.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   56.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   62.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iterat

  Episode 2 ended at t=163 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/28 (0.15)
    explore-remote: 0/52 (0.00)
    explore-connect: 1/42 (0.02)
    exploit-local: 15/1 (0.94)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/14 (0.26)
  exploit deflected to exploration: 0
  ## Episode: 3/8 'DQL 2' ϵ=0.7950, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   33.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   33.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   44.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   44.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 21|reward:   49.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 21|reward:   49.0|last_reward_at:   21|Elapsed Time: 0:00:00||
Episode 3|Iteration 26|reward:   55.0|last_reward_at:   21|Elapsed Time: 0:00:00||
Episode 3|Iteration 26|reward:   55.0|last_reward_at:   26|Elapsed Time: 0:00:00||
Episode 3|Iteration 30|reward:   61.0|last_reward_at:   26|Elapsed Time: 0:00:00||
Episode 3|Iteration 30|reward:   61.0|last_reward_at:   30|Elapsed Time: 0:00:00||
Episode 3|Iteration 41|reward:   67.0|last_reward_at:   30|Elapsed Time: 0:00:00||
Episode 3

  Episode 3 ended at t=229 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/36 (0.10)
    explore-remote: 0/66 (0.00)
    explore-connect: 3/73 (0.04)
    exploit-local: 15/8 (0.65)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/21 (0.12)
  exploit deflected to exploration: 1
  ## Episode: 4/8 'DQL 2' ϵ=0.7918, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   11.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   39.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   39.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   45.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   45.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 4|Iteration 13|reward:   50.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 4|Iteration 13|reward:   50.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 4|Iteration 16|reward:   61.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 4|Iteration 16|reward:   61.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 4|Iteration 17|reward:   67.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 4

  Episode 4 ended at t=347 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/61 (0.12)
    explore-remote: 0/105 (0.00)
    explore-connect: 2/112 (0.02)
    exploit-local: 13/8 (0.62)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/34 (0.11)
  exploit deflected to exploration: 1
  ## Episode: 5/8 'DQL 2' ϵ=0.7870, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:   33.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:   33.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   44.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   44.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 7|reward:   49.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 7|reward:   49.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 5|Iteration 9|reward:   55.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 5|Iteration 9|reward:   55.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 5|Iteration 13|reward:   61.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 5|Iteration 13|reward:   61.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 5|Iter

  Episode 5 ended at t=207 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/47 (0.13)
    explore-remote: 0/49 (0.00)
    explore-connect: 4/58 (0.06)
    exploit-local: 15/12 (0.56)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/13 (0.13)
  exploit deflected to exploration: 0
  ## Episode: 6/8 'DQL 2' ϵ=0.7842, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:   33.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:   33.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 5|reward:   39.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 5|reward:   39.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   45.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   45.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 15|reward:   56.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 15|reward:   56.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 6|Iteration 16|reward:   57.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 6|Iteration 16|reward:   57.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 6|Iteration 26|reward:   57.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 6|I

  Episode 6 ended at t=207 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/35 (0.12)
    explore-remote: 0/61 (0.00)
    explore-connect: 1/55 (0.02)
    exploit-local: 17/1 (0.94)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/27 (0.16)
  exploit deflected to exploration: 1
  ## Episode: 7/8 'DQL 2' ϵ=0.7814, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:   11.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   17.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   17.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 9|reward:   50.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 9|reward:   50.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 7|Iteration 12|reward:   56.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 7|Iteration 12|reward:   56.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 7|Iteration 17|reward:   57.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 7|Iteration 17|reward:   57.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 7|It

  Episode 7 ended at t=329 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/71 (0.08)
    explore-remote: 0/99 (0.00)
    explore-connect: 1/84 (0.01)
    exploit-local: 17/6 (0.74)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/40 (0.11)
  exploit deflected to exploration: 1
  ## Episode: 8/8 'DQL 2' ϵ=0.7769, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 8|Iteration 11|reward:   39.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 8|Iteration 11|reward:   39.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 8|Iteration 21|reward:   44.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 8|Iteration 21|reward:   44.0|last_reward_at:   21|Elapsed Time: 0:00:00||
Episode 8|Iteration 24|reward:   55.0|last_reward_at:   21|Elapsed Time: 0:00:00||
Episode 8|Iteration 24|reward:   55.0|last_reward_at:   24|Elapsed Time: 0:00:00||
Episode 8|Iteration 25|reward:   61.0|last_reward_at:   24|Elapsed Time: 0:00:00||
Episode 8|Iteration 25|reward:   61.0|last_reward_at:   25|Elapsed Time: 0:00:00||
Episode 

  Episode 8 ended at t=128 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/19 (0.24)
    explore-remote: 0/33 (0.00)
    explore-connect: 3/44 (0.06)
    exploit-local: 14/2 (0.88)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/4 (0.43)
  exploit deflected to exploration: 0
simulation ended
###### DQL 3
Learning with: episode_count=7,iteration_count=1000,ϵ=0.7,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/7 'DQL 3' ϵ=0.7000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   11.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   49.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   49.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   50.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   50.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 6|reward:   56.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 6|reward:   56.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   62.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   62.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 10|reward:   68.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Itera

  Episode 1 ended at t=419 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/54 (0.04)
    explore-remote: 0/112 (0.00)
    explore-connect: 2/126 (0.02)
    exploit-local: 25/9 (0.74)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/84 (0.06)
  exploit deflected to exploration: 0
  ## Episode: 2/7 'DQL 3' ϵ=0.6950, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   17.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   17.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   55.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   55.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   61.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   61.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   62.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   62.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iterat

  Episode 2 ended at t=179 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/27 (0.18)
    explore-remote: 0/46 (0.00)
    explore-connect: 2/48 (0.04)
    exploit-local: 18/2 (0.90)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/25 (0.17)
  exploit deflected to exploration: 0
  ## Episode: 3/7 'DQL 3' ϵ=0.6929, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   38.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   38.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 3|reward:   49.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 3|reward:   49.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   55.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   55.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 15|reward:   61.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 15|reward:   61.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 3|Iteration 18|reward:   62.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 3|Iteration 18|reward:   62.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 3|Iteration 21|reward:   68.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 3|I

  Episode 3 ended at t=145 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/35 (0.05)
    explore-remote: 0/31 (0.00)
    explore-connect: 1/32 (0.03)
    exploit-local: 22/3 (0.88)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/13 (0.32)
  exploit deflected to exploration: 0
  ## Episode: 4/7 'DQL 3' ϵ=0.6912, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   38.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   38.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 7|reward:   44.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 7|reward:   44.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   55.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   55.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 9|reward:   56.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 9|reward:   56.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   62.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   62.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 4|Iteration 14|reward:   68.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 4|Ite

  Episode 4 ended at t=343 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/73 (0.10)
    explore-remote: 0/86 (0.00)
    explore-connect: 2/89 (0.02)
    exploit-local: 18/12 (0.60)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/50 (0.09)
  exploit deflected to exploration: 0
  ## Episode: 5/7 'DQL 3' ϵ=0.6871, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 6|reward:   38.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 6|reward:   38.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   49.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   49.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   55.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   55.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|Iteration 11|reward:   61.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|Iteration 11|reward:   61.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 5|Iteration 26|reward:   61.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 5|Iteration 34|reward:   66.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 5|

  Episode 5 ended at t=226 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/46 (0.13)
    explore-remote: 0/68 (0.00)
    explore-connect: 1/55 (0.02)
    exploit-local: 18/7 (0.72)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/18 (0.25)
  exploit deflected to exploration: 3
  ## Episode: 6/7 'DQL 3' ϵ=0.6845, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:   38.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:   38.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 12|reward:   49.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 12|reward:   49.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 6|Iteration 14|reward:   55.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 6|Iteration 14|reward:   55.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 6|Iteration 21|reward:   60.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 6|Iteration 21|reward:   60.0|last_reward_at:   21|Elapsed Time: 0:00:00||
Episode 6|Iteration 23|reward:   66.0|last_reward_at:   21|Elapsed Time: 0:00:00||
Episode 6|Iteration 23|reward:   66.0|last_reward_at:   23|Elapsed Time: 0:00:00||
Episode 6|Iteration 27|reward:   72.0|last_reward_at:   23|Elapsed Time: 0:00:00||
Episode

  Episode 6 ended at t=196 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/30 (0.06)
    explore-remote: 0/54 (0.00)
    explore-connect: 1/39 (0.03)
    exploit-local: 24/5 (0.83)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/35 (0.15)
  exploit deflected to exploration: 1
  ## Episode: 7/7 'DQL 3' ϵ=0.6822, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:   11.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   44.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   44.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 7|reward:   55.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 7|reward:   55.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 7|Iteration 14|reward:   61.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 7|Iteration 14|reward:   61.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 7|Iteration 15|reward:   62.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 7|Iteration 15|reward:   62.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 7|It

  Episode 7 ended at t=145 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/28 (0.07)
    explore-remote: 0/36 (0.00)
    explore-connect: 1/32 (0.03)
    exploit-local: 23/2 (0.92)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/15 (0.29)
  exploit deflected to exploration: 1
simulation ended
###### DQL 4
Learning with: episode_count=6,iteration_count=1000,ϵ=0.6,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/6 'DQL 4' ϵ=0.6000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:   46.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:   46.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   57.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   57.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   63.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   63.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:   64.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:   64.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   70.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   70.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iterat

  Episode 1 ended at t=210 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/19 (0.05)
    explore-remote: 0/41 (0.00)
    explore-connect: 1/58 (0.02)
    exploit-local: 23/16 (0.59)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/45 (0.12)
  exploit deflected to exploration: 0
  ## Episode: 2/6 'DQL 4' ϵ=0.5979, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   11.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   57.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   57.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   63.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   63.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   69.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   69.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   70.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   70.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 2|Iterat

  Episode 2 ended at t=245 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/24 (0.14)
    explore-remote: 0/44 (0.00)
    explore-connect: 1/57 (0.02)
    exploit-local: 20/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/89 (0.06)
  exploit deflected to exploration: 0
  ## Episode: 3/6 'DQL 4' ϵ=0.5955, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 3|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 3|reward:   11.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:   57.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:   57.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   63.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   63.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 8|reward:   69.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 8|reward:   69.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 3|Iteration 9|reward:   70.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 3|Iteration 9|reward:   70.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 3|Iterat

  Episode 3 ended at t=113 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/15 (0.25)
    explore-remote: 0/23 (0.00)
    explore-connect: 1/25 (0.04)
    exploit-local: 20/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/18 (0.25)
  exploit deflected to exploration: 0
  ## Episode: 4/6 'DQL 4' ϵ=0.5944, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   46.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   46.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:   57.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:   57.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   63.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   63.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   64.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   64.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   70.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   70.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   76.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iterat

  Episode 4 ended at t=126 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/17 (0.11)
    explore-remote: 0/32 (0.00)
    explore-connect: 1/23 (0.04)
    exploit-local: 23/2 (0.92)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/20 (0.23)
  exploit deflected to exploration: 0
  ## Episode: 5/6 'DQL 4' ϵ=0.5931, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   46.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   46.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:   57.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:   57.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   63.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   63.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   64.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   64.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   70.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   70.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 6|reward:   76.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iterat

  Episode 5 ended at t=164 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 3/25 (0.11)
    explore-remote: 0/33 (0.00)
    explore-connect: 2/37 (0.05)
    exploit-local: 23/5 (0.82)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/31 (0.14)
  exploit deflected to exploration: 0
  ## Episode: 6/6 'DQL 4' ϵ=0.5915, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:   46.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:   46.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 5|reward:   52.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 5|reward:   52.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 6|Iteration 6|reward:   58.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 6|Iteration 6|reward:   58.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 6|Iteration 10|reward:   69.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 6|Iteration 10|reward:   69.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 6|Iteration 13|reward:   70.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 6|Iteration 13|reward:   70.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 6|It

  Episode 6 ended at t=127 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/22 (0.04)
    explore-remote: 0/33 (0.00)
    explore-connect: 2/24 (0.08)
    exploit-local: 23/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/17 (0.23)
  exploit deflected to exploration: 0
simulation ended
###### DQL 5
Learning with: episode_count=5,iteration_count=1000,ϵ=0.5,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/5 'DQL 5' ϵ=0.5000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   17.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   17.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:   77.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:   77.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 6|reward:   83.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 6|reward:   83.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   84.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   84.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iterat

  Episode 1 ended at t=259 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/31 (0.06)
    explore-remote: 0/52 (0.00)
    explore-connect: 1/43 (0.02)
    exploit-local: 35/22 (0.61)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 9/64 (0.12)
  exploit deflected to exploration: 0
  ## Episode: 2/5 'DQL 5' ϵ=0.4979, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   60.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   60.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   66.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   66.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   77.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   77.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   83.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   83.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   84.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   84.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iterat

  Episode 2 ended at t=148 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/20 (0.09)
    explore-remote: 0/29 (0.00)
    explore-connect: 0/23 (0.00)
    exploit-local: 35/2 (0.95)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 10/27 (0.27)
  exploit deflected to exploration: 0
  ## Episode: 3/5 'DQL 5' ϵ=0.4968, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   60.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   60.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:   71.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:   71.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:   77.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:   77.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   83.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   83.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   84.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   84.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 8|reward:   90.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iterat

  Episode 3 ended at t=247 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 0/32 (0.00)
    explore-remote: 0/53 (0.00)
    explore-connect: 1/42 (0.02)
    exploit-local: 37/3 (0.93)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 9/70 (0.11)
  exploit deflected to exploration: 0
  ## Episode: 4/5 'DQL 5' ϵ=0.4948, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   60.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   60.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   71.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   71.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   77.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   77.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 7|reward:   83.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 7|reward:   83.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 10|reward:   84.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 10|reward:   84.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   85.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 4|Ite

  Episode 4 ended at t=491 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/67 (0.06)
    explore-remote: 0/96 (0.00)
    explore-connect: 1/79 (0.01)
    exploit-local: 34/4 (0.89)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 9/197 (0.04)
  exploit deflected to exploration: 0
  ## Episode: 5/5 'DQL 5' ϵ=0.4910, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   60.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   60.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   71.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   71.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   77.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   77.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   83.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   83.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 11|reward:   84.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 11|reward:   84.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 5|Iteration 14|reward:   90.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 5|Ite

  Episode 5 ended at t=251 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/34 (0.06)
    explore-remote: 0/31 (0.00)
    explore-connect: 1/51 (0.02)
    exploit-local: 35/15 (0.70)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 9/73 (0.11)
  exploit deflected to exploration: 0
simulation ended
###### DQL 6
Learning with: episode_count=4,iteration_count=1000,ϵ=0.4,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/4 'DQL 6' ϵ=0.4000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   41.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   41.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:   52.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:   52.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:   58.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:   58.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   64.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   64.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 12|reward:   65.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 12|reward:   65.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 1|Iteration 15|reward:   71.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 1|Ite

  Episode 1 ended at t=237 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/30 (0.03)
    explore-remote: 0/32 (0.00)
    explore-connect: 1/35 (0.03)
    exploit-local: 37/3 (0.93)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 9/89 (0.09)
  exploit deflected to exploration: 0
  ## Episode: 2/4 'DQL 6' ϵ=0.3986, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   41.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   41.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   52.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   52.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   58.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   58.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   64.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   64.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   65.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   65.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   74.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iterat

  Episode 2 ended at t=152 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/10 (0.17)
    explore-remote: 0/33 (0.00)
    explore-connect: 1/9 (0.10)
    exploit-local: 34/6 (0.85)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 9/48 (0.16)
  exploit deflected to exploration: 0
  ## Episode: 3/4 'DQL 6' ϵ=0.3977, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   41.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   41.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:   52.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:   52.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   58.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   58.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   64.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   64.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 8|reward:   69.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 8|reward:   69.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 3|Iteration 9|reward:   75.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 3|Iterat

  Episode 3 ended at t=166 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 0/16 (0.00)
    explore-remote: 0/26 (0.00)
    explore-connect: 0/25 (0.00)
    exploit-local: 37/2 (0.95)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 10/50 (0.17)
  exploit deflected to exploration: 0
  ## Episode: 4/4 'DQL 6' ϵ=0.3967, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   41.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   41.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:   52.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:   52.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   58.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   58.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   59.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   59.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   65.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   65.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   74.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iterat

  Episode 4 ended at t=277 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/31 (0.03)
    explore-remote: 0/41 (0.00)
    explore-connect: 1/38 (0.03)
    exploit-local: 32/6 (0.84)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 9/118 (0.07)
  exploit deflected to exploration: 0
simulation ended
###### DQL 7
Learning with: episode_count=3,iteration_count=1000,ϵ=0.3,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/3 'DQL 7' ϵ=0.3000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   30.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   30.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:   41.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:   41.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   47.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   47.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   53.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   53.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 6|reward:   58.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 6|reward:   58.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:   59.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 1|Iterat

  Episode 1 ended at t=95 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/6 (0.14)
    explore-remote: 0/11 (0.00)
    explore-connect: 1/9 (0.10)
    exploit-local: 26/3 (0.90)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 7/31 (0.18)
  exploit deflected to exploration: 0
  ## Episode: 2/3 'DQL 7' ϵ=0.2996, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   30.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   30.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   31.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   31.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   45.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   45.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   51.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   51.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   57.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   57.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 6|reward:   63.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iterat

  Episode 2 ended at t=145 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/10 (0.09)
    explore-remote: 0/13 (0.00)
    explore-connect: 1/13 (0.07)
    exploit-local: 28/2 (0.93)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 7/70 (0.09)
  exploit deflected to exploration: 0
  ## Episode: 3/3 'DQL 7' ϵ=0.2990, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   30.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   30.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:   41.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:   41.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:   47.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:   47.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   53.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   53.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   58.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   58.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   64.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iterat

  Episode 3 ended at t=149 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/14 (0.12)
    explore-remote: 0/9 (0.00)
    explore-connect: 1/20 (0.05)
    exploit-local: 26/1 (0.96)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 7/69 (0.09)
  exploit deflected to exploration: 1
simulation ended
###### DQL 8
Learning with: episode_count=2,iteration_count=1000,ϵ=0.2,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/2 'DQL 8' ϵ=0.2000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   30.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   30.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:   41.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:   41.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:   47.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:   47.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   53.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   53.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:   54.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:   54.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   60.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 1|Iterat

  Episode 1 ended at t=93 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 0/10 (0.00)
    explore-remote: 0/9 (0.00)
    explore-connect: 0/2 (0.00)
    exploit-local: 24/1 (0.96)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 7/40 (0.15)
  exploit deflected to exploration: 0
  ## Episode: 2/2 'DQL 8' ϵ=0.1998, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   30.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   41.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   41.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   47.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   47.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   53.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   53.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   54.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   54.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iterat

  Episode 2 ended at t=89 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 0/5 (0.00)
    explore-remote: 0/5 (0.00)
    explore-connect: 0/3 (0.00)
    exploit-local: 24/1 (0.96)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 7/44 (0.14)
  exploit deflected to exploration: 0
simulation ended


In [6]:

class GenerateConfigs:
    def __init__(self, environment, timesteps, probability):
        self.environment = environment
        self.timesteps = timesteps
        self.probability = probability

        for t in range(self.timesteps):
            self.step()

    def patch_vulnerabilities_at_random(self) -> None:
        # Iterate through every node.
        for node_id, node_data in self.environment.nodes():
            # Have a boolean remove_vulnerability decide if we will remove one.
            remove_vulnerability = numpy.random.random() <= self.probability
            if remove_vulnerability and len(node_data.vulnerabilities) > 0:
                choice = random.choice(list(node_data.vulnerabilities))
                node_data.vulnerabilities.pop(choice)

    def plant_vulnerabilities_at_random(self) -> None:
        for node_id, node_data in self.environment.nodes():
            add_vulnerability = numpy.random.random() <= self.probability
            # See all differences between current node vulnerabilities and global ones.
            new_vulnerabilities = numpy.setdiff1d(
                list(self.environment.vulnerability_library.keys()), list(node_data.vulnerabilities.keys()))
            # If we have decided that we will add a vulnerability and there are new vulnerabilities not already
            # on the node, then add them.
            if add_vulnerability and len(new_vulnerabilities) > 0:
                new_vulnerability = random.choice(new_vulnerabilities)
                node_data.vulnerabilities[new_vulnerability] = \
                    self.environment.vulnerability_library[new_vulnerability]
                
    def firewall_change_remove(self) -> None:
        # Iterate through every node.
        for node_id, node_data in self.environment.nodes():
            # Have a boolean remove_rule decide if we will remove one.
            remove_rule = numpy.random.random() <= self.probability
            # The following logic sees if there are both incoming and outgoing rules.
            # If there are, we remove one randomly.
            if remove_rule and len(node_data.firewall.outgoing) > 0 and len(node_data.firewall.incoming) > 0:
                incoming = numpy.random.random() <= 0.5
                if incoming:
                    rule_to_remove = random.choice(node_data.firewall.incoming)
                    node_data.firewall.incoming.remove(rule_to_remove)
                else:
                    rule_to_remove = random.choice(node_data.firewall.outgoing)
                    node_data.firewall.outgoing.remove(rule_to_remove)
            # If there are only outgoing rules, we remove one random outgoing rule.
            elif remove_rule and len(node_data.firewall.outgoing) > 0:
                rule_to_remove = random.choice(node_data.firewall.outgoing)
                node_data.firewall.outgoing.remove(rule_to_remove)
            # If there are only incoming rules, we remove one random incoming rule.
            elif remove_rule and len(node_data.firewall.incoming) > 0:
                rule_to_remove = random.choice(node_data.firewall.incoming)
                node_data.firewall.incoming.remove(rule_to_remove)

    def firewall_change_add(self) -> None:
        # Iterate through every node.
        for node_id, node_data in self.environment.nodes():
            # Have a boolean rule_to_add decide if we will add one.
            add_rule = numpy.random.random() <= self.probability
            if add_rule:
                # 0 For allow, 1 for block.
                rule_to_add = model.FirewallRule(port=random.choice(model.SAMPLE_IDENTIFIERS.ports),
                                                 permission=model.RulePermission.ALLOW)
                # Randomly decide if we will add an incoming or outgoing rule.
                incoming = numpy.random.random() <= 0.5
                if incoming and rule_to_add not in node_data.firewall.incoming:
                    node_data.firewall.incoming.append(rule_to_add)
                elif not incoming and rule_to_add not in node_data.firewall.incoming:
                    node_data.firewall.outgoing.append(rule_to_add)

    def number_of_open_connections(self):
        n_connections = 0
        for _, node_data in self.environment.nodes():
            n_connections += len(node_data.firewall.incoming)
            n_connections += len(node_data.firewall.outgoing)
        return n_connections

    def step(self):
        self.patch_vulnerabilities_at_random()
        self.plant_vulnerabilities_at_random()
        self.firewall_change_remove()
        self.firewall_change_add()

    def get_environment(self):
        return self.environment

In [11]:
env_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
seeds = numpy.arange(0, 25).tolist()
network_config_data = []
for env_id in env_ids:
    network_config_data.append([])
    for seed in seeds:
        network_config_data[env_id].append([])


for env_id in env_ids:
    # seeds = numpy.arange(0, 50).tolist()
    seeds = numpy.arange(25).tolist()
    config_agent_rewards = []
    tiny = gym.make(f'ActiveDirectory-v{env_id}')

    for seed in seeds:
        
        no_action_flag = False
        with temporary_seed(seed):
            if(numpy.random.random() < 0.5):
                tiny= gym.make(f'ActiveDirectory-v{env_id}')
        current_o = tiny.reset()
        #Add a layer that changes configs, firewall and vulnerabilites, and then evaluate the reward
        if(seed!=0):
            random.seed(seed)
            with temporary_seed(seed):
                probability = numpy.random.random()
                timesteps = int(numpy.random.random()*10)
                config_gen = GenerateConfigs(tiny.environment, probability=probability, timesteps=timesteps)
                tiny.environment = deepcopy(config_gen.get_environment())

        else:
            config_gen = GenerateConfigs(tiny.environment, probability=0, timesteps=0)

        wrapped_env = AgentWrapper(tiny, ActionTrackingStateAugmentation(ep, current_o))
        # Use the trained agent to run the steps one by one
        max_steps = 1000
        cum_r = 0
        # next action suggested by DQL agent
        # h = []
        for i in range(max_steps):
            # run the suggested action
            _, next_action, _ = l.exploit(wrapped_env, current_o)
            # h.append((tiny.get_explored_network_node_properties_bitmap_as_numpy(current_o), next_action))
            if next_action is None:
                # print("No more learned moves")
                no_action_flag = True
                break
            current_o, r, is_done, _ = wrapped_env.step(next_action)
            cum_r += r
            if is_done:
                print("Finished simulation")
                break
        # tiny.render()
        print(f"Total reward: {cum_r}, no_action_flag: {no_action_flag}, no_connections: {config_gen.number_of_open_connections()}")
        network_config_data[env_id][seed].append([cum_r, no_action_flag, config_gen.number_of_open_connections()])
    # id_spreads.append(numpy.std(config_agent_rewards))
    # id_reward.append(numpy.mean(config_agent_rewards))

Total reward: 121.0, no_action_flag: False, no_connections: 60
Total reward: 0.0, no_action_flag: True, no_connections: 50
Total reward: 150.0, no_action_flag: False, no_connections: 60
Total reward: 150.0, no_action_flag: False, no_connections: 60
Total reward: 150.0, no_action_flag: False, no_connections: 90
Total reward: 0.0, no_action_flag: True, no_connections: 30
Total reward: 156.0, no_action_flag: False, no_connections: 50
Total reward: 11.0, no_action_flag: True, no_connections: 20
Total reward: 150.0, no_action_flag: False, no_connections: 60
Total reward: 150.0, no_action_flag: False, no_connections: 60
Total reward: 150.0, no_action_flag: False, no_connections: 60
Total reward: 150.0, no_action_flag: False, no_connections: 60
Total reward: 61.0, no_action_flag: False, no_connections: 90
Total reward: 156.0, no_action_flag: False, no_connections: 60
Total reward: 150.0, no_action_flag: False, no_connections: 30
Total reward: 156.0, no_action_flag: False, no_connections: 60
T

In [None]:
numpy.save('config_data', numpy.array(network_config_data))