In [None]:
#   ____                            _____     _____                   __          __        _        _                 
#  / __ \                     /\   |_   _|   / ____|                  \ \        / /       | |      | |                
# | |  | |_ __   ___ _ __    /  \    | |    | |  __ _   _ _ __ ___     \ \  /\  / /__  _ __| | _____| |__   ___  _ __  
# | |  | | '_ \ / _ \ '_ \  / /\ \   | |    | | |_ | | | | '_ ` _ \     \ \/  \/ / _ \| '__| |/ / __| '_ \ / _ \| '_ \ 
# | |__| | |_) |  __/ | | |/ ____ \ _| |_   | |__| | |_| | | | | | |     \  /\  / (_) | |  |   <\__ \ | | | (_) | |_) |
#  \____/| .__/ \___|_| |_/_/    \_\_____|   \_____|\__, |_| |_| |_|      \/  \/ \___/|_|  |_|\_\___/_| |_|\___/| .__/ 
#        | |                                         __/ |                                                      | |    
#        |_|                                        |___/                                                       |_|    

In [None]:
!pip install gym
#!pip install torch==1.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html



In [None]:
import json
from itertools import count

import gym
from gym import spaces

import torch
from torch import Tensor

import random
from random import randint

from typing import Dict, List, Optional, Callable, Any, Tuple, Union, Type

import pathlib

In [None]:
class RxQLearningAgent:
  """Tabular epsilon-greedy Q-learning agent (the Rx agent of this notebook).

  The Q-table is a dictionary that maps each integer observation to a 1-D
  tensor holding one Q-value per action. Rows are created lazily, filled with
  zeros, the first time an observation is seen.
  """

  def __init__(
    self, lr: float, gamma: float, act_space: spaces.Discrete,
    obs_space: spaces.Discrete, eps_fn_steps: int
  ):
    """Create an agent with an empty Q-table.

    Args:
      lr: learning rate of the Q-learning update.
      gamma: factor applied to the best Q-value of the next observation.
      act_space: discrete action space; only its size `n` is read.
      obs_space: discrete observation space; only its dtype is checked.
      eps_fn_steps: number of episodes covered by the epsilon schedule.
    """
    assert obs_space.dtype == int  # compatibility assertion

    self.lr = lr  # learning rate
    self.gamma = gamma  # gamma parameter
    self.qtable: Dict[int, Tensor] = dict()  # observation -> 1-D tensor of Q-values (one per action)
    self.n_actions = act_space.n  # number of actions

    self.eps_fn_steps = eps_fn_steps  # number of steps (episodes) in epsilon log-space
    self.eps_fn: Callable[[int], float] = build_epsilon_fn(eps_fn_steps)  # episode number -> epsilon
    self.episode_i: Optional[int] = None  # current episode number, set by episode_reset()
    self.eps: Optional[float] = None  # current epsilon, set by episode_reset()

  def _q_row(self, o: int) -> Tensor:
    """Return the Q-value row of observation `o`, creating a zero row on first visit."""
    if o not in self.qtable:
      self.qtable[o] = torch.zeros(self.n_actions)
    return self.qtable[o]

  def act(self, o: int, eps: Optional[float] = None) -> int:
    """Choose an action for observation `o` with an epsilon-greedy policy.

    Args:
      o: current observation.
      eps: exploration probability; when omitted, the agent's current epsilon
        (set by `episode_reset`) is used.

    Returns:
      A random action with probability `eps`, otherwise the action with the
      highest Q-value for `o`.

    Raises:
      TypeError: if `eps` is omitted and `episode_reset` was never called, so
        no epsilon is available.
    """
    if eps is None:
      eps = self.eps  # no epsilon given as argument, use the agent's current value
    if eps is None:
      # fail with an explicit message instead of an opaque `float < NoneType` error
      raise TypeError(
        'no epsilon available: pass `eps` or call episode_reset() first'
      )

    q_row = self._q_row(o)

    # The reversed bounds are kept on purpose so the seeded random stream (and
    # therefore every training run) stays identical; the draw lies in (0.0, 1.0].
    if random.uniform(1.0, 0.0) < eps:  # epsilon-greedy condition for exploration
      return self.explore(o)

    return q_row.argmax().item()  # greedy action: index of the highest Q-value

  def explore(self, o: int) -> int:
    """Return a uniformly random action; the observation `o` is not used."""
    return randint(0, self.n_actions - 1)

  def update(self, o: int, a: int, r: float, o_prime: int) -> float:
    """Apply one Q-learning update for the transition <o, a, r, o_prime>.

    Performs Q(o, a) <- Q(o, a) + lr * [r + gamma * max_a' Q(o_prime, a') - Q(o, a)].

    Returns:
      The change applied to Q(o, a) (new value minus old value).
    """
    q_row = self._q_row(o)
    next_row = self._q_row(o_prime)

    old_o_a_value = q_row[a].item()  # keep the old Q-value to compute the delta
    td_target = r + self.gamma * next_row.max().item()  # Temporal-Difference target
    q_row[a] += self.lr * (td_target - old_o_a_value)  # in-place update of the Q-table row

    return q_row[a].item() - old_o_a_value

  def episode_reset(self, o: int, episode_i: int) -> int:
    """Start a new episode: refresh epsilon for `episode_i` and act on the initial observation `o`."""
    self.episode_i = episode_i  # current episode number
    self.eps = self.eps_fn(episode_i)  # epsilon w.r.t. episode number
    return self.act(o, self.eps)

  def save_to_file(self, path: str, overwrite: bool = False) -> None:
    """Write gamma, the number of actions and the Q-table to `path` as JSON.

    The learning rate and the epsilon state are not saved.

    Raises:
      FileExistsError: if `path` exists and `overwrite` is False.
    """
    file = pathlib.Path(path)
    if not overwrite and file.exists():
      raise FileExistsError(
        '{path} already exists and overwrite is False'.format(path=file)
      )

    # agent state as a python dictionary (JSON turns the integer keys into strings)
    agent_state: Dict[str, Any] = {
      'gamma': self.gamma, 'n_actions': self.n_actions,
      'qtable': {k: v.tolist() for k, v in self.qtable.items()}
    }

    with file.open('w') as f:
      json.dump(agent_state, f)

  def load_from_file(self, path: str) -> None:
    """Restore gamma, the number of actions and the Q-table from a saved state.

    Does nothing when `path` does not exist, so a first run can call it
    unconditionally.
    """
    file = pathlib.Path(path)
    if not file.exists():
      return  # nothing saved yet

    with file.open('r') as f:
      agent_state = json.load(f)

    self.gamma = agent_state['gamma']
    self.n_actions = agent_state['n_actions']
    # JSON object keys are strings: convert them back to integer observations
    self.qtable = {
      int(k): torch.tensor(v) for k, v in agent_state['qtable'].items()
    }

In [None]:
class TxWirelessGym(gym.Env):
  """Gym environment where the reward is 1.0 for picking the Tx frequency of the current time-step.

  One Tx frequency per time-step is drawn at random when the environment is
  constructed; `reset` does not redraw it, so every episode replays the same
  sequence. The observation is the current time-step.
  """

  def __init__(self, n_freqs: int, horizon: int):
    """Create the environment.

    Args:
      n_freqs: number of frequencies, i.e. size of the action space.
      horizon: number of time-steps in an episode.
    """
    super(TxWirelessGym, self).__init__()  # initialize gym.Env base class
    self.action_space: spaces.Discrete = spaces.Discrete(n_freqs)  # action space {0, 1, ..., n_freqs - 1}
    self.observation_space: spaces.Discrete = spaces.Discrete(horizon + 1)  # observation space {0, 1, ..., horizon}
    self.hor = horizon  # gym horizon to know when we are DONE
    self.tx_freq = torch.randint(n_freqs, size=(horizon,))  # random Tx frequencies w.r.t. time-step
    self.t = 0  # initial time-step / observation

  def _reward_for(self, action: int) -> float:
    """Reward of `action` at the current time-step: 1.0 on a frequency match, else 0.0."""
    if self.t < self.hor:  # non-terminal observation, horizon not reached
      return float(action == self.tx_freq[self.t])  # float(True) = 1.0 if Tx and Rx frequencies are the same
    return 0.0  # horizon reached: nothing left to receive

  def step(self, action: int):
    """Advance one time-step.

    Returns:
      The tuple <observation, reward, done, info> where the observation is the
      new time-step and `info` is an empty dictionary.
    """
    r = self._reward_for(action)
    self.t += 1  # increment our time-step / observation
    o = self.t
    # `>=` rather than `==`: stays True if step() is called again after the
    # terminal step, and is True immediately for a zero-length horizon
    done = self.t >= self.hor
    return o, r, done, {}

  def reset(self):
    """Rewind to time-step 0 and return it as the initial observation."""
    self.t = 0
    o = self.t
    return o

  def render(self, mode='human'):
    """Print, for the current time-step, the rewarded frequency and the reward of every action."""
    a_r = [self._reward_for(freq) for freq in range(self.action_space.n)]  # try every freqs
    # print all action-reward for this time-step
    print(
      't={o:2}, tx_freq:{tx_freq}, tx_rewards:{action_rewards}'.format(
        o=self.t, tx_freq=[i for i, rew in enumerate(a_r) if rew == 1.0],
        action_rewards=a_r
      )
    )

In [None]:
def build_epsilon_fn(steps: int) -> Callable[[int], float]:
  """Build the function that maps an episode number to its exploration epsilon.

  The schedule holds `steps` log-spaced values going from 1.0 down to 0.01
  (10**1 ... 10**-1, each divided by 10). Any episode number at or beyond
  `steps` gets an epsilon of exactly 0.0.
  """
  decades = torch.logspace(1.0, -1.0, steps=steps, base=10.0)  # 10.0 ... 0.1
  schedule = decades / 10.0  # rescaled to 1.0 ... 0.01

  def epsilon_fn(episode_i: int):
    # schedule exhausted: stop exploring
    return 0.0 if episode_i >= steps else schedule[episode_i].item()

  return epsilon_fn

In [None]:
random.seed(1234)  # python random number generator seed
torch.manual_seed(1234)  # pytorch random number generator seed

n_freqs = 10  # number of frequencies
horizon = 15  # horizon of our gym (time-steps per episode)

gym_env = TxWirelessGym(n_freqs, horizon)  # our gym environment

lr = 0.15  # learning rate
gamma = 0.0  # gamma parameter
eps_fn_steps = 300  # number of steps (episodes) in epsilon log-space
# initialize our Rx agent with learning parameters and gym parameters
agent = RxQLearningAgent(
  lr, gamma, gym_env.action_space, gym_env.observation_space, eps_fn_steps
)

# Load a previously saved agent. NOTE: when this file exists, training resumes
# from its Q-table (and its gamma), so the results depend on earlier runs.
agent.load_from_file('rx_agent_1.json')

show_verbose = False  # print information for debugging
render_gym = False  # render gym

# print simulation parameters
print(
  'model: {{n_freqs={n_freqs}, horizon={horizon}}}, '
  'agent: {{lr={lr}, gamma={gamma}, eps_fn_steps={eps_fn_steps}}}'.format(
    n_freqs=gym_env.action_space.n, horizon=horizon, lr=lr, gamma=gamma,
    eps_fn_steps=eps_fn_steps
  )
)

running_len = 5  # length of our running training data
delta_tol = 0.0001  # an episode counts as converged when |cumulative delta| is below this
running_delta = []  # running delta (i.e. the last running_len cumulative delta updates)
running_acc = []  # running accuracy (i.e. the last running_len accuracies)

for episode_i in count():  # training loop
  if show_verbose:
    print('starting episode {episode_i}...'.format(episode_i=episode_i))

  delta_update: List[float] = []  # delta update of our Q-table
  n_successes: int = 0  # number of optimal actions (actions with maximum reward)
  cumul_r: float = 0.0  # cumulative reward

  o = gym_env.reset()  # reset gym for new episode
  a = agent.episode_reset(o, episode_i)  # reset agent with initial gym observation and episode number

  for t in count():  # episode loop
    if render_gym:
      gym_env.render()  # show gym rendering
    o_prime, r, done, _ = gym_env.step(a)  # interact with the gym, get environment transition

    delta_update.append(agent.update(o, a, r, o_prime))  # update agent with transition, get delta update
    cumul_r += r  # add reward to cumulative reward
    n_successes += int(r > 0.0)  # success if optimal action-reward of 1.0

    if show_verbose:   # show transition of our model (i.e. <o, a, r, o'>)
      print(
        'transition=<{o}, {a}, {r}, {o_prime}>,'
        ' delta_update={delta}'.format(
          o=o, a=a, r=r, o_prime=o_prime, delta=delta_update[-1]
        )
      )

    o = o_prime  # move on to the next observation
    # act before testing `done`: this order is kept because it fixes how many
    # random numbers each episode consumes
    a = agent.act(o)
    if done:
      break  # terminal gym observation reached, out of horizon

  # drop the oldest entry once the running window is full
  if len(running_acc) >= running_len or len(running_delta) >= running_len:
    running_acc.pop(0)  # delete oldest running accuracy
    running_delta.pop(0)  # delete oldest running update delta

  running_acc.append(n_successes / horizon)  # add latest accuracy to running data
  running_delta.append(sum(delta_update))  # add latest update delta to running data

  # show episode results
  print(
    'episode {episode_i}: cumul_reward={cumul_r}, accuracy:{acc:0.5}, '
    'cumul_delta={cumul_delta:0.5}, eps={eps:0.5}'.format(
      episode_i=episode_i, cumul_r=cumul_r, acc=running_acc[-1],
      cumul_delta=running_delta[-1], eps=agent.eps
    )
  )

  # training stop conditions
  converged = (
    all(acc == 1.0 for acc in running_acc)  # all running accuracies are maximized
    # magnitude, not signed value: a large negative delta is not convergence
    and all(abs(delta) < delta_tol for delta in running_delta)
    and episode_i >= running_len  # running data have reached running length
  )
  if converged or episode_i >= eps_fn_steps + running_len:  # epsilon was 0 for all running data (nothing will change)
    break  # exit training loop

agent.save_to_file('rx_agent_1.json', overwrite=True)  # save agent state to file
gym_env.close()  # close gym environment

model: {n_freqs=10, horizon=15}, agent: {lr=0.15, gamma=0.0, eps_fn_steps=300}
episode 0: cumul_reward=1.0, accuracy:0.066667, cumul_delta=0.15, eps=1.0
episode 1: cumul_reward=1.0, accuracy:0.066667, cumul_delta=0.15, eps=0.98472
episode 2: cumul_reward=1.0, accuracy:0.066667, cumul_delta=0.15, eps=0.96967
episode 3: cumul_reward=1.0, accuracy:0.066667, cumul_delta=0.15, eps=0.95485
episode 4: cumul_reward=3.0, accuracy:0.2, cumul_delta=0.4275, eps=0.94025
episode 5: cumul_reward=1.0, accuracy:0.066667, cumul_delta=0.1275, eps=0.92588
episode 6: cumul_reward=2.0, accuracy:0.13333, cumul_delta=0.25837, eps=0.91173
episode 7: cumul_reward=1.0, accuracy:0.066667, cumul_delta=0.15, eps=0.8978
episode 8: cumul_reward=4.0, accuracy:0.26667, cumul_delta=0.54212, eps=0.88407
episode 9: cumul_reward=3.0, accuracy:0.2, cumul_delta=0.405, eps=0.87056
episode 10: cumul_reward=3.0, accuracy:0.2, cumul_delta=0.3558, eps=0.85726
episode 11: cumul_reward=2.0, accuracy:0.13333, cumul_delta=0.23587, ep