In [1]:
#   ____                            _____     _____                   __          __        _        _                 
#  / __ \                     /\   |_   _|   / ____|                  \ \        / /       | |      | |                
# | |  | |_ __   ___ _ __    /  \    | |    | |  __ _   _ _ __ ___     \ \  /\  / /__  _ __| | _____| |__   ___  _ __  
# | |  | | '_ \ / _ \ '_ \  / /\ \   | |    | | |_ | | | | '_ ` _ \     \ \/  \/ / _ \| '__| |/ / __| '_ \ / _ \| '_ \ 
# | |__| | |_) |  __/ | | |/ ____ \ _| |_   | |__| | |_| | | | | | |     \  /\  / (_) | |  |   <\__ \ | | | (_) | |_) |
#  \____/| .__/ \___|_| |_/_/    \_\_____|   \_____|\__, |_| |_| |_|      \/  \/ \___/|_|  |_|\_\___/_| |_|\___/| .__/ 
#        | |                                         __/ |                                                      | |    
#        |_|                                        |___/                                                       |_|    

# 10 minutes break ?

In [None]:
!pip install gym
#!pip install torch==1.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html



In [None]:
import json
import pathlib
import random
from itertools import count
from random import randint

import gym
from gym import spaces

from typing import Optional, Dict, Tuple, Callable, Any, List

import collections

import torch
from torch import Tensor

In [None]:
class MultiAgentTDMAGym(gym.Env):
  NO_ACTION = 0  # no action on sub-frame
  UPLINK_ACTION = 1  # use sub-frame as up-link role (user [Tx] --> base-station [Rx])
  DOWNLINK_ACTION = 2  # use sub-frame as down-link role (base-station [Tx] --> user [Rx])

  BASE_STATION_AGENT = 0  # base-station agent type value
  USER_AGENT = 1  # user agent type value
  JAMMER_AGENT = 2  # jammer agent type value

  def __init__(
      self, n_freqs: int, n_subframes: int, horizon: int,
      use_jammer: bool = True
  ):
    super(MultiAgentTDMAGym, self).__init__()  # initialize gym.Env base class
    if n_subframes % 2 != 0:  # since a link is separated in up and down link we need even number of subframe
      raise ValueError('n_subframes must be an even number')

    self.freq_act_space = spaces.Discrete(n_freqs)  # frequency action space
    self.sf_act_space = spaces.MultiDiscrete([3] * n_subframes)  # sub-frames action space
    self.time_step_obs_space = spaces.Discrete(horizon + 1)  # time-step action space

    # multi-agent action space dict includes frequency and sub-frames action for each type
    self.action_space = spaces.Dict(
      {
        'user': spaces.Dict(
          {'freq': self.freq_act_space, 'subframes': self.sf_act_space}  # frequency and subframes user agent action space
        ),
        'base_station': spaces.Dict(
          {'freq': self.freq_act_space, 'subframes': self.sf_act_space}  # frequency and subframes base-station agent action space
        ),
        'jammer': spaces.Dict(
          {'freq': self.freq_act_space, 'subframes': self.sf_act_space}  # frequency and subframes jammer agent action space
        )
      }
    )
    # multi-agent observation space, different space for all agent type 
    self.observation_space = spaces.Dict(
      {
        'user': spaces.Dict(
          {
            'time_step': self.time_step_obs_space,  # time-step observation, including terminal time-step
            'last_a': self.action_space['user'],  # last action done by the agent
            'last_success': spaces.MultiDiscrete([2] * n_subframes)  # what sub-frames received an acknowledgement
          }
        ),
        'base_station': spaces.Dict(
          {'time_step': self.time_step_obs_space}  # time-step observation, including terminal time-step
        ),
        'jammer': spaces.Dict(
          {
            'time_step': self.time_step_obs_space,  # time-step observation, including terminal time-step
            'last_a': self.action_space['jammer'],  # last action done by the agent
          }
        )
      }
    )
    self.reward_range = 0.0, float(n_subframes)  # action-reward range between 0 and the number of sub-frames

    self.use_jammer = use_jammer  # tell the gym to consider jammer agent
    self.hor = horizon  # gym horizon to know when we are DONE
    self.t = 0  # initial time-step / observation

  def step(self, action: Dict) -> Tuple[
    Dict, Dict[str, float], bool, Dict
  ]:
    if self.t >= self.hor:  # terminal state reached
      return collections.OrderedDict(), {}, True, {}  # return terminal empty transition

    assert self.action_space['user'].contains(action['user'])  # assert user action is within user action space
    assert self.action_space['base_station'].contains(action['base_station'])  # assert base-station action is within action space
    assert self.action_space['jammer'].contains(action['jammer'])  # assert jammer action is within action space

    bs_a = action['base_station']  # base-station action
    j_a = action['jammer']  # jammer action
    u_a = action['user']  # user action

    if bs_a['freq'] == u_a['freq']:  # base-station and user agents have the same frequency
      if u_a['freq'] == j_a['freq'] and self.use_jammer:  # base-station, user and jammer have the same frequency, jammer is considered
        sf_success = [
          int(
            bs_sf == u_sf and u_sf != self.NO_ACTION and  # base-station and user have same sub-frames role
            (u_sf != j_sf or j_sf == self.NO_ACTION)    # jammer must not have same role for user and base-station successes
          ) for bs_sf, u_sf, j_sf in zip(
            bs_a['subframes'], u_a['subframes'], j_a['subframes']  # for all sub-frames
          )
        ]
      else:  # jammer is no considered, base-station and user have the same frequency
        sf_success = [
          int(bs_sf == u_sf and u_sf != self.NO_ACTION)  # action sub-frames role is correct w.r.t. the base-station's sub-frames
          for bs_sf, u_sf in zip(bs_a['subframes'], u_a['subframes'])
        ]
    else:
      sf_success = [0] * len(self.sf_act_space.nvec)  # wrong frequency, no sub-frames successes

    self.t += 1  # increment our time-step / observation
    o = collections.OrderedDict(
      user=collections.OrderedDict(
        time_step=self.t, last_a=bs_a, last_success=sf_success   # user obs: time-step, last action, last successes
      ),
      base_station=collections.OrderedDict(time_step=self.t),  # base-station obs: only time-step
      jammer=collections.OrderedDict(time_step=self.t, last_a=j_a)  # jammer obs: time-step and last action
    )
    assert self.observation_space.contains(o)  # assert obs is within observation space

    r = {
      'base_station': float(sum(sf_success)),  # sum successes to give the reward
      'user': float(sum(sf_success)),  # sum successes to give the reward
      'jammer': 0.0  # jammer always a reward of 0
    }
    return o, r, False, {}  # gyms always returns <obs, reward, if terminal obs reached, debug/info dictionary>

  def reset(self) -> Dict:  # reset our gym for a new episode
    self.t = 0  # initial time-step
    o = collections.OrderedDict( 
      base_station=collections.OrderedDict(time_step=self.t),
      user=collections.OrderedDict(time_step=self.t),
      jammer=collections.OrderedDict(time_step=self.t)
    )
    return o  # initial observation, only the time-step

  def render(self, mode='human'):  # gym visual rendering (e.g. text, image, plot, 3D frame, etc.)
    pass

In [None]:
class TDMABaseStationAgent:

  def __init__(self, gym_env: MultiAgentTDMAGym):
    self.act_space = gym_env.action_space['base_station']  # base-station action space from gym
    self.obs_space = gym_env.observation_space['base_station']  # base-station observation space from gym

    horizon = self.obs_space['time_step'].n  # horizon to know when we are DONE
    n_freqs = self.act_space['freq'].n  # number of frequencies to use as base-station
    n_subframes = len(self.act_space['subframes'].nvec)  # number of sub-frames to use as base-station

    self.freq_sf: Dict[int, Tuple[int, Tensor]] = dict()  # frequency and sub-frames role assignation w.r.t. to time-step
    for t in range(horizon):  # do not include terminal state
      freq = randint(0, n_freqs - 1)  # choose random frequency to do Tx
      ul_subframes = random.sample(range(n_subframes), k=n_subframes // 2)  # choose which sub-frames will be used as up-link
      dl_subframes = [
        sf for sf in range(n_subframes) if sf not in ul_subframes  # choose the rest of sub-frames as down-link
      ]
      sf = torch.zeros(n_subframes, dtype=torch.int)  # initialize sub-frames roles
      sf.fill_(gym_env.NO_ACTION)  # set all sub-frames to no-action
      sf[ul_subframes] = gym_env.UPLINK_ACTION  # set all up-link sub-frames to uplink-action value
      sf[dl_subframes] = gym_env.DOWNLINK_ACTION  # set all down-link sub-frames to downlink-action value
      self.freq_sf[t] = freq, sf  # set sub-frames role w.r.t. time-step

    terminal_sf = torch.zeros(n_subframes, dtype=torch.int)  # terminal observation sub-frames
    terminal_sf.fill_(gym_env.NO_ACTION)  # set terminal observation sub-frames value to no-action
    self.freq_sf[horizon] = 0, terminal_sf  # frequency 0 with no-action sub-frames

  def act(self, o: Dict) -> Dict:  # choose action based on observation
    a = collections.OrderedDict(
      freq=self.freq_sf[o['time_step']][0],   # frequency for current time-step observation
      subframes=self.freq_sf[o['time_step']][1].tolist()  # sub-frames role for current time-step observation
    )
    return a  # return base-station action

  def episode_reset(self, o: Dict, episode_i: int) -> Dict:  # reset agent for a new episode
    return self.act(o)  # choose action based on initial observation

In [None]:
class TDMAJammerAgent:

  def __init__(self, gym_env: MultiAgentTDMAGym):
    self.act_space = gym_env.action_space['jammer']  # jammer action space
    self.obs_space = gym_env.observation_space['jammer']  # jammer observation space
    self.ul_jamming_act = gym_env.UPLINK_ACTION  # jamming up-link action value
    self.dl_jamming_act = gym_env.DOWNLINK_ACTION  # jamming down-link action value
    self.no_jamming_act = gym_env.NO_ACTION  # no jamming action value
    self.n_subframes = len(self.act_space['subframes'].nvec)  # number of subframes

  def act(self, o: Dict, eps: Optional[float] = None) -> Dict:  # choose jamming action
    rand_sf_jamming = random.sample(
      range(self.n_subframes), k=self.n_subframes // 2  # select which sub-frames half will be jam 
    )
    sf_act = []
    for i in range(self.n_subframes):  # for all sub-frames
      if i in rand_sf_jamming:  # sub-frames selected for jamming
        sf_act.append(
          random.choice([self.ul_jamming_act, self.dl_jamming_act])  # randomly choose between up-link and down-link jamming
        )
      else:  # sub-frame is not selected for jamming
        sf_act.append(self.no_jamming_act)  # no jamming action
    a = collections.OrderedDict(
      freq=randint(0, self.act_space['freq'].n - 1),  # choose random frequency to do jamming
      subframes=sf_act
    )
    return a  # random jamming action

  def episode_reset(self, o: Dict, episode_i: int) -> Dict:  # reset agent for a new episode
    return self.act(o)  # choose jamming action

In [None]:
def build_epsilon_fn(steps: int) -> Callable[[int], float]:  # epsilon function builder
  log_epsilon_space = torch.logspace(1.0, -1.0, steps=steps, base=10.0) / 10.0  # epsilon log-space {1.0, ..., 0.0}

  def epsilon_fn(episode_i: int):  # epsilon function
    if episode_i >= steps:  # max step reached
      return 0.0
    return log_epsilon_space[episode_i].item()  # return epsilon value

  return epsilon_fn  # return our function as an callable object

In [None]:
class TDMAUserAgent:

  def __init__(
      self, lr: float, gamma: float, gym_env: MultiAgentTDMAGym,
      eps_fn_steps: int
  ):
    self.act_space = gym_env.action_space['user']  # user action space from gym environment
    self.obs_space = gym_env.observation_space['user']  # user observation space

    self.lr = lr  # learning rate
    self.gamma = gamma  # gamma parameter
    self.qtable: Dict[str, Tensor] = dict()  # Q-table with observation encoded as text string as keys for a 3D Q-table
    self.qtable_dims: Tuple[int, int, int] = (   # 3D Q-table dimensions (n_freqs X n_subframes x n_roles)
      self.act_space['freq'].n,
      self.act_space['subframes'].nvec.size,
      self.act_space['subframes'].nvec.max()
    )

    self.eps_fn_steps = eps_fn_steps  # number of steps (episodes) in epsilon log-space
    self.eps_fn: Callable[[int], float] = build_epsilon_fn(eps_fn_steps)  # build our epsilon function
    self.episode_i: Optional[int] = None  # keep track of the episode number (e.g. for epsilon)
    self.eps: Optional[float] = None  # epsilon value as a property of our agent

  def act(self, o: Dict, eps: Optional[float] = None) -> Dict:  # choose action
    if eps is None:
      eps = self.eps  # not epsilon as argument, use agent's current epsilon value

    o_enc = self.encode_obs(o)  # encode obs to a string "{time_step}:{freq}:{subframes}:{successes}"
    if o_enc not in self.qtable:
      self.qtable[o_enc] = torch.zeros(self.qtable_dims)  # add new Q-table for new obs, Q-values are zero

    if random.uniform(1.0, 0.0) < eps:  # epsilon-greedy condition for exploration
      return self.explore(o)  # exploring random action

    max_sf_values, max_sf_indices = self.qtable[o_enc].max(dim=2)  # get values and index of best actions for all sub-frame Q-values
    max_freq = max_sf_values.sum(dim=1).argmax()  # get frequency with the best Q-values for all sub-frames

    if max_sf_indices.ndimension() > 1:  # multiple equal maximums
      max_sf_indices = max_sf_indices[max_freq]  # if Q-value equality occurs, take random one

    return collections.OrderedDict(
      freq=int(max_freq), subframes=max_sf_indices.tolist(),  # create and return action with best Q-values
    )

  def explore(self, o: Dict) -> Dict:  # action-space exploration
    rand_a = collections.OrderedDict(
      freq=randint(0, self.act_space['freq'].n - 1),  # random frequency
      subframes=[
        randint(0, n - 1) for n in self.act_space['subframes'].nvec  # random role action for each sub-frame
      ]
    )
    return rand_a  # random action

  def update(
      self, o: Dict, a: Dict, r: float, o_prime: Dict  # agent update function (e.g. Q-learning update)
  ) -> float:
    o_enc = self.encode_obs(o)  # encode obs to a string "{time_step}:{freq}:{subframes}:{successes}"
    o_prime_enc = self.encode_obs(o_prime)  # encode obs to a string "{time_step}:{freq}:{subframes}:{successes}"

    if o_enc not in self.qtable:
      self.qtable[o_enc] = torch.zeros(self.qtable_dims)  # add new Q-table for new obs, Q-values are zero

    if o_prime_enc not in self.qtable:
      self.qtable[o_prime_enc] = torch.zeros(self.qtable_dims)  # add new Q-table for new obs, Q-values are zero

    a_inds = a['freq'], range(len(a['subframes'])), a['subframes']  # get Q-table indices of action
    a_vals = self.qtable[o_enc][a_inds]  # TD-source (e.i. TD-error = [TD-target - TD-source]), also old Q-values to compute delta update

    max_prime_sf_vals, _ = self.qtable[o_prime_enc].max(dim=2)  # get values and index of best actions for all sub-frame Q-values
    a_prime_val = max_prime_sf_vals[max_prime_sf_vals.sum(dim=1).argmax()]  # get frequency with the best Q-values for all sub-frames

    last_success = torch.tensor(o_prime['last_success'])  # last success for credit assignment of the update

    td_target = (r + self.gamma * a_prime_val) * last_success  # compute Temporal-Difference target (e.i [r + gamma * Q(o', a_max)])
    self.qtable[o_enc][a_inds] += (
        self.lr * (td_target - a_vals) * last_success  # update Q-table (e.i. Q(o, s) <- Q(o, s) + lr * [TD-target - Q(o, a)])
    )

    delta_update = float(sum(self.qtable[o_enc][a_inds] - a_vals))  # compute delta update
    return delta_update  # return delta update to training loop

  def episode_reset(self, o: Dict, episode_i: int) -> Dict:  # reset agent for a new episode
    self.episode_i = episode_i  # current episode number
    self.eps = self.eps_fn(episode_i)  # epsilon w.r.t. episode number
    return self.act(o, self.eps)  # act upon initial observation

  def save_to_file(self, path: str, overwrite: bool):  # save agent state to file
    file = pathlib.Path(path)  # file path
    if not overwrite and file.exists():  # exception if file exist and overwrite is false
      raise FileExistsError()

    # agent state as a python dictionary
    agent_state: Dict[str, Any] = {
      'gamma': self.gamma,
      'act_space': {
        'n_freq': self.act_space['freq'].n,
        'subframes_nvec': [
          int(n) for n in self.act_space['subframes'].nvec
        ]
      },
      'obs_space': {
        'n_time_step': self.obs_space['time_step'].n
      },
      'qtable': {k: v.tolist() for k, v in self.qtable.items()}
    }

    with file.open('w') as f:  # open file to write in it
      json.dump(agent_state, f)  # write agent state in file

  def load_from_file(self, path: str):  # load agent state from compatible saved state
    file = pathlib.Path(path)  # file path
    if not file.exists():
      return  # file does not exists

    with file.open('r') as f:  # open file to read it
      agent_state = json.load(f)  # load agent state from file

    self.gamma = agent_state['gamma']  # set gamma to loaded value
    self.qtable = {
      k: torch.tensor(v) for k, v in agent_state['qtable'].items()  # set number of actions from loaded value
    }

    assert self.act_space['freq'].n == agent_state['act_space']['n_freq']  # assert compatiblity
    assert [int(n) for n in self.act_space['subframes'].nvec] == (
      agent_state['act_space']['subframes_nvec']  # assert compatiblity
    )
    assert self.obs_space['time_step'].n == (
      agent_state['obs_space']['n_time_step']  # assert compatiblity
    )

  @staticmethod
  def encode_obs(o: Dict) -> str:  # encoding observation to a string to use it as Q-table key
    encoded = '{time_step}:{last_a}:{last_success}'.format(
      time_step=o['time_step'] if 'time_step' in o else '',
      last_a=(
        TDMAUserAgent.encode_act(o['last_a']) if 'last_a' in o else ''
      ),
      last_success=o['last_success'] if 'last_success' in o else ''
    ).replace(' ', '')
    return encoded  # return encoded observation as string

  @staticmethod
  def encode_act(a: Dict) -> str:  # encoding action to a string to use it as part of Q-table key
    encoded = '{freq}:{sf}'.format(
      freq=a['freq'], sf=a['subframes']
    ).replace(' ', '')
    return encoded  # return encoded action as string

In [None]:
random.seed(1234)  # python random number generator seed
torch.manual_seed(1234)  # pytorch random number generator seed

n_freqs = 10  # number of frequencies
n_subframes = 8  # number of sub-frames within a time-step frame
horizon = 15  # horizon of our gym (episodes)

gym_env = MultiAgentTDMAGym(
  n_freqs, n_subframes, horizon, use_jammer=False  # our MARL gym environment
)

lr = 0.15  # learning rate
gamma = 0.0  # gamma parameter
eps_fn_steps = 3000  # number of steps (episodes) in epsilon log-space

# contain all agents inside a key-value dictionary
agents = {
  'user': TDMAUserAgent(lr, gamma, gym_env, eps_fn_steps),  # initialize user agent
  'bs': TDMABaseStationAgent(gym_env),  # initialize base-station agent
  'jam': TDMAJammerAgent(gym_env)  # initialize jammer agent
}

# agents['user'].load_from_file('tdma_user_agent_1.json')

show_verbose = False  # print information for debugging
render_gym = False  # render gym

# print simulation parameters
print(
  'model: {{n_freqs={n_freqs}, n_subframes:{n_subframes},'
  ' horizon={horizon}}}, '
  'agent: {{lr={lr}, gamma={gamma}, eps_fn_steps={eps_fn_steps}}}'.format(
    n_freqs=n_freqs, n_subframes=n_subframes, horizon=horizon, lr=lr,
    gamma=gamma, eps_fn_steps=eps_fn_steps
  )
)

running_len = 5  # length of our running training data
running_delta = []  # running delta (e.g. the last running_len delta update)
running_acc = []  # running accuracy (e.g. the last running_len accuracy)

for episode_i in count():  # user agent training loop
  if show_verbose:
    print('starting episode {episode_i}...'.format(episode_i=episode_i))

  delta_update: List[float] = []  # delta update of our Q-table
  n_successes: int = 0  # number of optimal actions (actions with maximum reward)
  cumul_r: float = 0.0  # cumulative reward

  o = gym_env.reset()  # reset gym for new episode
  a = collections.OrderedDict(
    base_station=agents['bs'].episode_reset(o['base_station'], episode_i),  # reset base-station agent
    user=agents['user'].episode_reset(o['user'], episode_i),  # reset user agent with initial observation
    jammer=agents['jam'].episode_reset(o['jammer'], episode_i)  # reset jammer agent
  )

  for t in count():  # episode loop
    if render_gym:
      gym_env.render()  # show gym rendering
    o_prime, r, done, _ = gym_env.step(a)  # interact with the gym, get environment transition
    if done:
      break  # terminal gym observation reached, out of horizon

    delta_update.append(
      agents['user'].update(
        o['user'], a['user'], r['user'], o_prime['user']  # update user agent with transition, get delta update
      )
    )
    cumul_r += r['user']  # add user reward to cumulative user reward
    n_successes += int(r['user'] == 1.0 * n_subframes)  # success if optimal action-reward of 1.0 * number of sub-frames

    if show_verbose:  # show transition of our model (e.i. <o, a, r, o'>)
      print(
        'user_transition={{o={o}, a={a}, r={r}, o_prime={o_prime}}},'
        ' delta_update={delta}'.format(
          o=agents['user'].encode_obs(o['user']),
          a=agents['user'].encode_act(a['user']),
          r=r['user'],
          o_prime=agents['user'].encode_obs(o_prime['user']),
          delta=delta_update[-1]
        )
      )

    o = o_prime  # increment to next observation (e.g. next observation)
    a = collections.OrderedDict(
      base_station=agents['bs'].act(o['base_station']),  # base-station acts upon next observation
      user=agents['user'].act(o['user']),  # user acts upun next observation
      jammer=agents['jam'].act(o['jammer'])  # jammer acts upun next observation
    )

  # strip running data because we reached running length
  if len(running_acc) >= running_len or len(running_delta) >= running_len:
    running_acc.pop(0)  # delete oldest running accuracy
    running_delta.pop(0)  # delete oldest running update delta

  running_acc.append(n_successes / horizon)  # add latest accuracy to running data
  running_delta.append(sum(delta_update))  # add latest update delta to running data

  # show episode results
  print(
    'episode {episode_i}: cumul_reward={cumul_r}, accuracy:{acc:0.5}, '
    'cumul_delta={cumul_delta:0.5}, eps={eps:0.5}'.format(
      episode_i=episode_i, cumul_r=cumul_r, acc=running_acc[-1],
      cumul_delta=running_delta[-1], eps=agents['user'].eps
    )
  )

  # training stop conditions
  if (
      all([acc == 1.0 for acc in running_acc])  # all running accuracy are maximized
      and all([delta < 0.0001 for delta in running_delta])  # all running delta update a lower than 0.0001
      and episode_i >= running_len  # running data have reached running length
  ) or episode_i >= eps_fn_steps + running_len:  # epsilon was 0 for all running data (nothing will change)
    break  # exit training loop

agents['user'].save_to_file('tdma_user_agent_1.json', overwrite=True)  # save agent state to file
gym_env.close()  # close gym environment

model: {n_freqs=10, n_subframes:8, horizon=15}, agent: {lr=0.15, gamma=0.0, eps_fn_steps=3000}
episode 0: cumul_reward=0.0, accuracy:0.0, cumul_delta=0.0, eps=1.0
episode 1: cumul_reward=10.0, accuracy:0.0, cumul_delta=5.1, eps=0.99847
episode 2: cumul_reward=3.0, accuracy:0.0, cumul_delta=1.35, eps=0.99693
episode 3: cumul_reward=6.0, accuracy:0.0, cumul_delta=3.0, eps=0.9954
episode 4: cumul_reward=0.0, accuracy:0.0, cumul_delta=0.0, eps=0.99388
episode 5: cumul_reward=0.0, accuracy:0.0, cumul_delta=0.0, eps=0.99235
episode 6: cumul_reward=0.0, accuracy:0.0, cumul_delta=0.0, eps=0.99083
episode 7: cumul_reward=2.0, accuracy:0.0, cumul_delta=0.3, eps=0.98931
episode 8: cumul_reward=5.0, accuracy:0.0, cumul_delta=2.235, eps=0.98779
episode 9: cumul_reward=9.0, accuracy:0.0, cumul_delta=4.35, eps=0.98627
episode 10: cumul_reward=0.0, accuracy:0.0, cumul_delta=0.0, eps=0.98476
episode 11: cumul_reward=3.0, accuracy:0.0, cumul_delta=1.35, eps=0.98325
episode 12: cumul_reward=0.0, accuracy