In [None]:
#   ____                            _____     _____                   __          __        _        _                 
#  / __ \                     /\   |_   _|   / ____|                  \ \        / /       | |      | |                
# | |  | |_ __   ___ _ __    /  \    | |    | |  __ _   _ _ __ ___     \ \  /\  / /__  _ __| | _____| |__   ___  _ __  
# | |  | | '_ \ / _ \ '_ \  / /\ \   | |    | | |_ | | | | '_ ` _ \     \ \/  \/ / _ \| '__| |/ / __| '_ \ / _ \| '_ \ 
# | |__| | |_) |  __/ | | |/ ____ \ _| |_   | |__| | |_| | | | | | |     \  /\  / (_) | |  |   <\__ \ | | | (_) | |_) |
#  \____/| .__/ \___|_| |_/_/    \_\_____|   \_____|\__, |_| |_| |_|      \/  \/ \___/|_|  |_|\_\___/_| |_|\___/| .__/ 
#        | |                                         __/ |                                                      | |    
#        |_|                                        |___/                                                       |_|    

In [None]:
!pip install gym
#!pip install torch==1.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html



In [None]:
import json
from itertools import count

import gym
from gym import spaces

import torch
from torch import Tensor

import random
from random import randint

from typing import Dict, List, Optional, Callable, Any, Tuple, Union, Type

import pathlib
import collections
import numpy as np
import pdb

![alt text](https://www.taitradioacademy.com/wp-content/uploads/2014/10/Image-25-800x450.png)

In [None]:
# <START> ------ CODE BLOCK 1

# insert TDMA gym here
class TDMAGym(gym.Env):
  """Toy TDMA environment with a fixed, randomly drawn schedule.

  At construction time a schedule is drawn for every time-step of the episode:
  one frequency, plus a role (up-link or down-link) for each sub-frame, with
  exactly half of the sub-frames being up-link. At each step the agent proposes
  a frequency and a role per sub-frame. A sub-frame succeeds when the proposed
  frequency matches the scheduled one and the proposed role equals the
  scheduled role. The reward is the number of successful sub-frames.

  The schedule is drawn once in ``__init__`` and is NOT redrawn by ``reset``.
  """
  NO_ACTION = 0  # no action on sub-frame
  UPLINK_ACTION = 1  # use sub-frame as up-link role (user [Tx] --> base-station [Rx])
  DOWNLINK_ACTION = 2  # use sub-frame as down-link role (base-station [Tx] --> user [Rx])

  def __init__(self, n_freqs: int, n_subframes: int, horizon: int):
    """Build the spaces and draw the per-time-step schedule.

    Args:
      n_freqs: number of selectable frequencies.
      n_subframes: number of sub-frames per frame; must be even.
      horizon: number of non-terminal time-steps in an episode.

    Raises:
      ValueError: if ``n_subframes`` is odd.
    """
    super(TDMAGym, self).__init__()  # initialize gym.Env base class
    if n_subframes % 2 != 0:  # since a link is separated in up and down link we need even number of subframe
      raise ValueError('n_subframes must be an even number')

    # keep the sizes on the instance so step()/reset() never rely on notebook globals
    self.n_freqs = n_freqs
    self.n_subframes = n_subframes

    # action-space dict includes frequency and sub-frames action
    self.action_space = spaces.Dict(
      {
        'freq': spaces.Discrete(n_freqs),  # frequency used
        'subframes': spaces.MultiDiscrete([3] * n_subframes)  # sub-frames role actions
      }
    )
    # observation-space dict includes the time-step, last action and its sub-frames successes
    self.observation_space = spaces.Dict(
      {
        'time_step': spaces.Discrete(horizon + 1),  # include terminal state
        'last_a': self.action_space,  # last action done by the agent
        'last_success': spaces.MultiDiscrete([2] * n_subframes)  # what sub-frames received an acknowledgement
      }
    )
    self.reward_range = 0.0, float(n_subframes)  # action-reward range between 0 and the number of sub-frames

    # time-step -> (scheduled frequency, per-sub-frame scheduled role)
    self.freq_sf: Dict[int, Tuple[int, Tensor]] = dict()
    for t in range(horizon):  # do not include terminal state
      freq = randint(0, n_freqs - 1)  # choose random frequency to do Tx
      ul_subframes = random.sample(range(n_subframes), k=n_subframes // 2)  # choose which sub-frames will be used as up-link
      dl_subframes = [
        sf for sf in range(n_subframes) if sf not in ul_subframes  # choose the rest of sub-frames as down-link
      ]
      sf = torch.full((n_subframes,), self.NO_ACTION, dtype=torch.int)  # start with no role on any sub-frame
      sf[ul_subframes] = self.UPLINK_ACTION  # set all up-link sub-frames to uplink-action value
      sf[dl_subframes] = self.DOWNLINK_ACTION  # set all down-link sub-frames to downlink-action value
      self.freq_sf[t] = freq, sf  # set sub-frames role w.r.t. time-step

    self.hor = horizon  # gym horizon to know when we are DONE
    self.t = 0  # initial time-step / observation

  def step(self, action: Dict) -> Tuple[  # step function to interact with the gym
    Dict, float, bool, Dict[str, Any]
  ]:
    """Apply ``action`` at the current time-step.

    Args:
      action: dict with keys ``'freq'`` and ``'subframes'`` belonging to
        ``self.action_space``.

    Returns:
      ``(observation, reward, done, info)``. Once the horizon has been reached
      the call returns an empty observation, a reward of 0.0 and ``done=True``;
      every call before that returns ``done=False``.
    """
    if self.t >= self.hor:  # terminal state reached
      return collections.OrderedDict(), 0.0, True, {}  # return empty interaction

    assert self.action_space.contains(action)  # action is within gym action-space assertion

    # <START> ------ CODE BLOCK 1

    # fetch current frequency and sub-frames role from self.freq_sf[self.t]
    # compute sub-frame successes, increment time-step
    # create observation with successes and last action using OrderedDict
    # compute reward based on successful sub-frames
    freq, sf = self.freq_sf[self.t]
    if freq == action['freq']:
      # a sub-frame succeeds when the chosen role equals the scheduled role
      sf_success = (np.asarray(action['subframes']) == sf.numpy()).astype(np.int64)
    else:
      # wrong frequency: nothing is acknowledged
      sf_success = np.zeros(self.n_subframes, dtype=np.int64)

    self.t += 1
    o = collections.OrderedDict()
    o['time_step'] = self.t
    o['last_a'] = action
    o['last_success'] = sf_success  # integer 0/1 flags, matching the MultiDiscrete([2, ...]) space
    r = float(sf_success.sum())  # built-in float, so the isinstance check holds on every NumPy version
    # <END> ------ CODE BLOCK 1

    assert (
        isinstance(o, collections.OrderedDict) and
        self.observation_space.contains(o) and
        isinstance(r, float) and
        self.reward_range[0] <= r <= self.reward_range[1]
    )
    return o, r, False, {}   # gyms always returns <obs, reward, if terminal obs reached, debug/info dictionary>

  def reset(self) -> Dict:  # reset our gym for a new episode
    """Rewind to time-step 0 and return the initial observation.

    The initial observation carries an all-zero last action and all-zero
    successes, sized from the ``n_subframes`` given to the constructor.
    """

    # <START> ------ CODE BLOCK 2

    # reset gym for new episode and create initial observation
    # with empty last action and last successes (all values are zero)
    self.t = 0  # resetting time to zero
    # Note: entries are assigned key by key because `o = {...}` would rebind the
    # name to a plain dict and lose the OrderedDict type.
    o = collections.OrderedDict()
    o['time_step'] = 0
    o['last_a'] = collections.OrderedDict(
      freq=0, subframes=np.zeros(self.n_subframes, dtype=np.int64)
    )
    o['last_success'] = np.zeros(self.n_subframes, dtype=np.int64)
    # <END> ------ CODE BLOCK 2

    assert (
        isinstance(o, collections.OrderedDict) and
        self.observation_space.contains(o))
    return o

  def render(self, mode='human'):  # gym visual rendering (e.g. text, image, plot, 3D frame, etc.)
    """Print the scheduled frequency and sub-frame roles of the current time-step."""
    if self.t < self.hor:  # terminal gym state not reached
      # show current time-step, frequency and sub-frames role
      print(
        'rendering={{time_step={time_step}, freq={freq}, sf={sf}}}'
          .format(
          time_step=self.t, freq=self.freq_sf[self.t][0],
          sf=str(self.freq_sf[self.t][1].tolist()).replace(' ', '')
        )
      )
    else:  # terminal gym state reached and out of horizon
      print('rendering={terminal state reached, out of horizon}')

# <END> -------- CODE BLOCK 1

In [None]:
def build_epsilon_fn(steps: int) -> Callable[[int], float]:
  """Build an epsilon schedule indexed by episode number.

  The schedule holds `steps` log-spaced values going from 1.0 down to 0.01.
  Any episode index at or beyond `steps` yields 0.0 (pure exploitation).
  """
  # 10**1 ... 10**-1 divided by 10 -> 1.0 ... 0.01
  schedule = torch.logspace(1.0, -1.0, steps=steps, base=10.0) / 10.0

  def epsilon_fn(episode_i: int):
    # past the end of the schedule exploration is switched off entirely
    return 0.0 if episode_i >= steps else schedule[episode_i].item()

  return epsilon_fn

In [None]:
class TDMAQLearningAgent:
  """Tabular Q-learning agent for the TDMA gym.

  The Q-table is a dictionary keyed by the string encoding of an observation.
  Each value is a 3D tensor of shape
  ``[n_frequencies, n_subframes, n_subframe_actions]``. The value of a composite
  action (one frequency plus one role per sub-frame) is the sum over sub-frames
  of the corresponding entries.
  """

  def __init__(
      self, lr: float, gamma: float, act_space: spaces.Dict,
      obs_space: spaces.Dict, eps_fn_steps: int
  ):
    """Store hyper-parameters and create an empty Q-table.

    Args:
      lr: learning rate of the Q-learning update.
      gamma: discount factor.
      act_space: action-space of the gym environment.
      obs_space: observation-space of the gym environment.
      eps_fn_steps: number of episodes covered by the epsilon schedule.
    """
    self.act_space = act_space  # action-space from the gym environment
    self.obs_space = obs_space  # observation-space from the gym environment

    self.lr = lr  # learning rate
    self.gamma = gamma  # gamma parameter

    # <START> ------ CODE BLOCK 2.A

    # Q-table entries are created lazily, the first time an observation is seen.
    self.qtable: Dict[str, Tensor] = dict()
    subframes_nvec = self.act_space['subframes'].nvec
    # plain Python ints so the tuple can be used directly as a tensor shape
    self.qtable_dims: Tuple[int, int, int] = (
      int(self.act_space['freq'].n), int(subframes_nvec.size), int(subframes_nvec.max())
    )
    # <END> -------- CODE BLOCK 2.A

    self.eps_fn_steps = eps_fn_steps  # number of steps (episodes) in epsilon log-space
    self.eps_fn: Callable[[int], float] = build_epsilon_fn(eps_fn_steps)  # build our epsilon function
    self.episode_i: Optional[int] = None  # keep track of the episode number (e.g. for epsilon)
    self.eps: Optional[float] = None  # epsilon value as a property of our agent

  def _q_values(self, o_enc: str) -> Tensor:
    """Return the Q-tensor stored for ``o_enc``, creating a zero tensor on first use."""
    if o_enc not in self.qtable:
      self.qtable[o_enc] = torch.zeros(self.qtable_dims)
    return self.qtable[o_enc]

  def act(self, o: Dict, eps: Optional[float] = None) -> Dict:  # choose action
    """Choose an action for observation ``o`` with an epsilon-greedy policy.

    Args:
      o: current observation.
      eps: exploration probability. When ``None`` the agent's current epsilon
        is used; if that is also unset (no ``episode_reset`` yet) the agent
        acts greedily.

    Returns:
      An ``OrderedDict`` with keys ``'freq'`` and ``'subframes'``.
    """
    if eps is None:
      eps = self.eps  # not epsilon as argument, use agent's current epsilon value
    if eps is None:
      eps = 0.0  # no schedule started yet: exploit only instead of comparing against None

    o_enc = self.encode_obs(o)  # encode obs to a string "{time_step}:{freq}:{subframes}:{successes}"

    # <START> ------ CODE BLOCK 2.B

    q_values = self._q_values(o_enc)

    if random.random() < eps:
      a = self.explore(o)
    else:
      # best role and its value for every (frequency, sub-frame) pair
      best_q, best_role = q_values.max(dim=2)
      # A frequency's value is the SUM of its per-sub-frame best values. This is
      # the same greedy rule update() uses for the bootstrap term.
      freq = int(best_q.sum(dim=1).argmax())
      a = collections.OrderedDict()
      a['freq'] = freq
      a['subframes'] = best_role[freq].numpy()

    # <END> -------- CODE BLOCK 2.B
    assert (
      isinstance(a, collections.OrderedDict) and
      self.act_space.contains(a)
    )
    return a

  def explore(self, o: Dict) -> Dict:  # action-space exploration
    """Return a uniformly sampled action; ``o`` is not used."""
    # <START> ------ CODE BLOCK 2.C

    # choose random action as exploration mechanism
    a = collections.OrderedDict(
        freq=self.act_space['freq'].sample(),
        subframes=self.act_space['subframes'].sample()
    )

    # <END> ------ CODE BLOCK 2.C
    assert (
      isinstance(a, collections.OrderedDict) and
      self.act_space.contains(a)
    )
    return a  # random action

  def update(
      self, o: Dict, a: Dict, r: float, o_prime: Dict  # agent update function (e.g. Q-learning update)
  ) -> float:
    """Apply one Q-learning update for the transition ``(o, a, r, o_prime)``.

    Only the entries of the chosen frequency and chosen sub-frame roles are
    touched. Both the TD target and the applied step are masked by
    ``o_prime['last_success']``, so entries of failed sub-frames stay unchanged.

    Returns:
      The summed change applied to the Q-table entries (signed).
    """
    o_enc = self.encode_obs(o)  # encode obs to a string "{time_step}:{freq}:{subframes}:{successes}"
    o_prime_enc = self.encode_obs(o_prime)  # encode obs to a string "{time_step}:{freq}:{subframes}:{successes}"

    # <START> ------ CODE BLOCK 2.D

    q_values = self._q_values(o_enc)
    q_prime_values = self._q_values(o_prime_enc)

    # explicit index tensors: (chosen frequency, every sub-frame, chosen role per sub-frame)
    roles = torch.as_tensor(np.asarray(a['subframes']), dtype=torch.long)
    q_ind = int(a['freq']), torch.arange(roles.numel()), roles
    old_q_val = q_values[q_ind]  # advanced indexing returns a copy, so this keeps the pre-update values

    # greedy bootstrap: per-sub-frame best values of the best next frequency
    q_prime_val, _ = q_prime_values.max(dim=2)
    action_prime = q_prime_val[q_prime_val.sum(dim=1).argmax()]

    last_success = torch.as_tensor(
      np.asarray(o_prime['last_success']), dtype=torch.float32
    )

    td_target = (r + self.gamma * action_prime) * last_success

    q_values[q_ind] += self.lr * (td_target - old_q_val) * last_success

    delta_update = float((q_values[q_ind] - old_q_val).sum())

    # <END> -------- CODE BLOCK 2.D
    assert isinstance(delta_update, float)
    return delta_update  # return delta update to training loop

  def episode_reset(self, o: Dict, episode_i: int) -> Dict:  # reset agent for a new episode
    """Record the episode number, refresh epsilon and return the first action."""
    self.episode_i = episode_i  # current episode number
    self.eps = self.eps_fn(episode_i)  # epsilon w.r.t. episode number
    return self.act(o, self.eps)  # act upon initial observation

  def save_to_file(self, path: str, overwrite: bool):  # save agent state to file
    """Write gamma, space sizes and the Q-table to ``path`` as JSON.

    Raises:
      FileExistsError: if ``path`` exists and ``overwrite`` is false.
    """
    file = pathlib.Path(path)  # file path
    if not overwrite and file.exists():  # exception if file exist and overwrite is false
      raise FileExistsError(str(file))

    # agent state as a python dictionary
    agent_state: Dict[str, Any] = {
      'gamma': self.gamma,
      'act_space': {
        'n_freq': self.act_space['freq'].n,
        'subframes_nvec': [
          int(n) for n in self.act_space['subframes'].nvec
        ]
      },
      'obs_space': {
        'n_time_step': self.obs_space['time_step'].n
      },
      'qtable': {k: v.tolist() for k, v in self.qtable.items()}
    }

    with file.open('w') as f:  # open file to write in it
      json.dump(agent_state, f)  # write agent state in file

  def load_from_file(self, path: str):  # load agent state from compatible saved state
    """Load gamma and the Q-table from a file written by ``save_to_file``.

    Does nothing when ``path`` does not exist. The saved space sizes are
    checked against this agent's spaces BEFORE any attribute is overwritten,
    so an incompatible file leaves the agent untouched.
    """
    file = pathlib.Path(path)  # file path
    if not file.exists():
      return  # file does not exists

    with file.open('r') as f:  # open file to read it
      agent_state = json.load(f)  # load agent state from file

    # compatibility checks first, so a mismatch cannot leave a half-loaded agent
    assert self.act_space['freq'].n == agent_state['act_space']['n_freq']
    assert [int(n) for n in self.act_space['subframes'].nvec] == (
      agent_state['act_space']['subframes_nvec']
    )
    assert self.obs_space['time_step'].n == (
      agent_state['obs_space']['n_time_step']
    )

    self.gamma = agent_state['gamma']  # set gamma to loaded value
    self.qtable = {
      k: torch.tensor(v) for k, v in agent_state['qtable'].items()  # rebuild Q-tensors from nested lists
    }

  @staticmethod
  def encode_obs(o: Dict) -> str:  # encoding observation to a string to use it as Q-table key
    """Encode an observation as ``"{time_step}:{last_a}:{last_success}"`` without spaces.

    Missing keys are encoded as empty strings.
    """
    encoded = '{time_step}:{last_a}:{last_success}'.format(
      time_step=o['time_step'] if 'time_step' in o else '',
      last_a=(
        TDMAQLearningAgent.encode_act(o['last_a'])
        if 'last_a' in o else ''
      ),
      last_success=o['last_success'] if 'last_success' in o else ''
    ).replace(' ', '')
    return encoded  # return encoded observation as string

  @staticmethod
  def encode_act(a: Dict) -> str:  # encoding action to a string to use it as part of Q-table key
    """Encode an action as ``"{freq}:{subframes}"`` without spaces."""
    encoded = '{freq}:{sf}'.format(
      freq=a['freq'], sf=a['subframes']
    ).replace(' ', '')
    return encoded  # return encoded action as string

In [None]:
SEED = 1234  # single seed shared by every random number generator used below
random.seed(SEED)  # python random number generator seed
torch.manual_seed(SEED)  # pytorch random number generator seed

n_freqs = 4  # number of frequencies
n_subframes = 4  # number of sub-frames within a time-step frame
horizon = 15  # horizon of our gym (episodes)

gym_env = TDMAGym(n_freqs, n_subframes, horizon)  # our gym environment
# gym spaces sample from their own generator, which the seeds above do not reach;
# seed it too so that exploration is reproducible
gym_env.action_space.seed(SEED)

lr = 0.55  # learning rate
gamma = 0.0  # gamma parameter
eps_fn_steps = 1500  # number of steps (episodes) in epsilon log-space

# initialize our Rx agent with learning parameters and gym parameters
agent = TDMAQLearningAgent(
  lr, gamma, gym_env.action_space, gym_env.observation_space, eps_fn_steps
)
#agent.load_from_file('tdma_agent_1.json')    # load previously saved agent

show_verbose = False  # print information for debugging
render_gym = False  # render gym

# print simulation parameters
print(
  'model: {{n_freqs={n_freqs}, n_subframes:{n_subframes},'
  ' horizon={horizon}}}, '
  'agent: {{lr={lr}, gamma={gamma}, eps_fn_steps={eps_fn_steps}}}'.format(
    n_freqs=n_freqs, n_subframes=n_subframes, horizon=horizon, lr=lr,
    gamma=gamma, eps_fn_steps=eps_fn_steps
  )
)

running_len = 5  # length of our running training data
running_delta = []  # running delta (e.g. the last running_len delta update)
running_acc = []  # running accuracy (e.g. the last running_len accuracy)

for episode_i in count():  # training loop
  if show_verbose:
    print('starting episode {episode_i}...'.format(episode_i=episode_i))

  delta_update: List[float] = []  # delta update of our Q-table
  n_successes: int = 0  # number of optimal actions (actions with maximum reward)
  cumul_r: float = 0.0  # cumulative reward

  # <START> ------ CODE BLOCK 3.A

  # reset gym and agent; the agent returns its first action for the initial observation
  o = gym_env.reset()
  a = agent.episode_reset(o, episode_i)

  # <END> -------- CODE BLOCK 3.A

  for t in count():  # episode loop
    if render_gym:
      gym_env.render()  # show gym rendering
    o_prime, r, done, _ = gym_env.step(a)  # interact with the gym, get environment transition
    if done:
      break  # terminal gym observation reached, out of horizon

    # <START> ------ CODE BLOCK 3.B

    # update agent and record how much the Q-table moved
    delta_update.append(agent.update(o, a, r, o_prime))
    # accumulate the episode reward
    cumul_r += r
    # an action is optimal when every sub-frame succeeded
    n_successes += int(r == 1.0 * n_subframes)
    # <END> -------- CODE BLOCK 3.B

    if show_verbose:  # show transition of our model (e.i. <o, a, r, o'>)
      print(
        'transition={{o={o}, a={a}, r={r}, o_prime={o_prime}}},'
        ' delta_update={delta}'.format(
          o=agent.encode_obs(o), a=agent.encode_act(a),
          r=r, o_prime=agent.encode_obs(o_prime),
          delta=delta_update[-1]
        )
      )
    # <START> ------ CODE BLOCK 3.C

    # next observation becomes current observation
    o = o_prime
    # choose action based on new observation
    a = agent.act(o)

    # <END> -------- CODE BLOCK 3.C

  # strip running data because we reached running length
  if len(running_acc) >= running_len or len(running_delta) >= running_len:
    running_acc.pop(0)  # delete oldest running accuracy
    running_delta.pop(0)  # delete oldest running update delta

  # <START> ------ CODE BLOCK 3.D

  # add accuracy and delta-update to running data
  running_acc.append(n_successes / horizon)
  running_delta.append(sum(delta_update))
  # <END> ------- CODE BLOCK 3.D

  # show episode results
  print(
    'episode {episode_i}: cumul_reward={cumul_r}, accuracy:{acc:0.5}, '
    'cumul_delta={cumul_delta:0.5}, eps={eps:0.5}'.format(
      episode_i=episode_i, cumul_r=cumul_r, acc=running_acc[-1],
      cumul_delta=running_delta[-1], eps=agent.eps
    )
  )

  # training stop conditions
  if (
      all([acc == 1.0 for acc in running_acc])  # all running accuracy are maximized
      # the Q-table has stopped moving: compare the MAGNITUDE of the change, since
      # a large negative delta also means the table is still being updated
      and all([abs(delta) < 0.0001 for delta in running_delta])
      and episode_i >= running_len  # running data have reached running length
  ) or episode_i >= eps_fn_steps + running_len:  # epsilon was 0 for all running data (nothing will change)
    break  # exit training loop

agent.save_to_file('tdma_agent_1.json', overwrite=True)  # save agent state to file
gym_env.close()  # close gym environment

model: {n_freqs=4, n_subframes:4, horizon=15}, agent: {lr=0.55, gamma=0.0, eps_fn_steps=1500}
episode 0: cumul_reward=8.0, accuracy:0.066667, cumul_delta=12.1, eps=1.0
episode 1: cumul_reward=1.0, accuracy:0.0, cumul_delta=0.55, eps=0.99693
episode 2: cumul_reward=4.0, accuracy:0.0, cumul_delta=3.3, eps=0.99387
episode 3: cumul_reward=2.0, accuracy:0.0, cumul_delta=1.1, eps=0.99083
episode 4: cumul_reward=5.0, accuracy:0.0, cumul_delta=4.95, eps=0.98779
episode 5: cumul_reward=5.0, accuracy:0.0, cumul_delta=6.05, eps=0.98476
episode 6: cumul_reward=5.0, accuracy:0.0, cumul_delta=3.85, eps=0.98174
episode 7: cumul_reward=10.0, accuracy:0.0, cumul_delta=13.2, eps=0.97872
episode 8: cumul_reward=3.0, accuracy:0.0, cumul_delta=2.75, eps=0.97572
episode 9: cumul_reward=8.0, accuracy:0.0, cumul_delta=7.7, eps=0.97273
episode 10: cumul_reward=5.0, accuracy:0.0, cumul_delta=6.05, eps=0.96975
episode 11: cumul_reward=2.0, accuracy:0.0, cumul_delta=1.1, eps=0.96677
episode 12: cumul_reward=9.0, 

In [None]:
# Display the index of the last training episode, i.e. the episode at which the
# training loop above hit one of its stop conditions.
episode_i

1505