# MULTI ARMED BANDITS IN TF-AGENTS

CREATE AN ENVIRONMENT:-

a. Create an environment for which the observation is a random integer between -5 and 5 (inclusive), there are 3
possible actions (0, 1, 2), and the reward is the product of the action and the observation.





In [1]:
pip install tf-agents


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tf-agents
  Downloading tf_agents-0.15.0-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gym<=0.23.0,>=0.17.0
  Downloading gym-0.23.0.tar.gz (624 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m624.4/624.4 KB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pygame==2.1.0
  Downloading pygame-2.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.23.0-py3-none-any.whl size=697658 sha256=f66b04c53841d8ac588e4829b97a5b6f15a38936cfc7d95df92bcfce586b9a24
  Sto

In [2]:
import abc
import numpy as np
import tensorflow as tf

from tf_agents.agents import tf_agent
from tf_agents.drivers import driver
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.policies import tf_policy
from tf_agents.specs import array_spec
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory
from tf_agents.trajectories import policy_step

# Short module-level alias for TensorFlow's `tf.nest` module.
nest = tf.nest

In [16]:
class MultiArmedBanditEnv(py_environment.PyEnvironment):
  """One-step bandit environment with an integer observation.

  Every episode consists of a single step: `reset()` draws a random integer
  observation in [-5, 5], and the following `step(action)` ends the episode
  with reward `observation * action`, where `action` is one of 0, 1 or 2.
  """

  def __init__(self):
    # Initialise the base class so that its bookkeeping (e.g. the cached
    # current time step consulted by `step()`) exists before first use.
    super().__init__()
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=-5, maximum=5, name='observation')
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=2, name='action')
    self._episode_ended = False
    self._observation = None
    self._reward = None

  def action_spec(self):
    """Returns the spec of the scalar int32 action, bounded to [0, 2]."""
    return self._action_spec

  def observation_spec(self):
    """Returns the spec of the scalar int32 observation, bounded to [-5, 5]."""
    return self._observation_spec

  def _reset(self):
    """Starts a new episode by drawing a fresh observation.

    Returns:
      The first `TimeStep` of the episode, carrying the new observation.
    """
    self._episode_ended = False
    # `high` is exclusive, so this draws from -5..5 inclusive.
    self._observation = np.random.randint(low=-5, high=6)
    self._reward = 0
    return ts.restart(np.array(self._observation, dtype=np.int32))

  def _step(self, action):
    """Applies `action` to the current observation and ends the episode.

    Args:
      action: The chosen arm (0, 1 or 2).

    Returns:
      A terminal `TimeStep` whose reward is `observation * action`, or the
      first `TimeStep` of a new episode if the previous one already ended.
    """
    if self._episode_ended:
      # The previous episode is over: the action is ignored and a new
      # episode is started instead.
      return self.reset()

    self._reward = self._observation * action
    self._episode_ended = True
    return ts.termination(np.array(self._observation, dtype=np.int32), reward=self._reward)

Define an optimal policy manually. The action depends only on the sign of the observation: 0 when it is negative and 2 when it is positive.

In [17]:
def optimal_policy(observation):
  """Chooses an action from the sign of the observation.

  Args:
    observation: The observed value; it is compared against 0.

  Returns:
    0 when the observation is negative, otherwise 2.
  """
  return 0 if observation < 0 else 2

Request 50 observations from the environment, then compute and print the total reward.

In [18]:
# Play 50 independent single-step episodes with the hand-written policy and
# accumulate the reward collected on each terminal step.
NUM_EPISODES = 50

env = MultiArmedBanditEnv()
total_reward = 0
for _ in range(NUM_EPISODES):
  # The first time step of an episode carries the observation to act on.
  action = optimal_policy(env.reset().observation)
  time_step = env.step(action)
  total_reward += time_step.reward

print('Total reward:', total_reward)

Total reward: 136.0


# EXERCISE 2

In [19]:
class RewardEnv(py_environment.PyEnvironment):
  """One-step bandit environment whose reward sign can be flipped.

  Observations are random integers in [-5, 5] and actions are 0, 1 or 2.
  When `reward_sign` is 'original' the reward is `observation * action`;
  for any other value it is `-observation * action`.
  """

  def __init__(self, reward_sign):
    """Creates the environment.

    Args:
      reward_sign: 'original' for the unmodified reward; any other value
        selects the flipped (negated) reward.
    """
    # Initialise the base class so that its bookkeeping (e.g. the cached
    # current time step consulted by `step()`) exists before first use.
    super().__init__()
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=-5, maximum=5, name='observation')
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=2, name='action')
    self._episode_ended = False
    self._observation = None
    self._reward = None
    self._reward_sign = reward_sign

  def action_spec(self):
    """Returns the spec of the scalar int32 action, bounded to [0, 2]."""
    return self._action_spec

  def observation_spec(self):
    """Returns the spec of the scalar int32 observation, bounded to [-5, 5]."""
    return self._observation_spec

  def _reset(self):
    """Starts a new episode by drawing a fresh observation.

    Returns:
      The first `TimeStep` of the episode, carrying the new observation.
    """
    self._episode_ended = False
    # `high` is exclusive, so this draws from -5..5 inclusive.
    self._observation = np.random.randint(low=-5, high=6)
    self._reward = 0
    return ts.restart(np.array(self._observation, dtype=np.int32))

  def _step(self, action):
    """Applies `action` to the current observation and ends the episode.

    Args:
      action: The chosen arm (0, 1 or 2).

    Returns:
      A terminal `TimeStep` carrying the (possibly negated) reward, or the
      first `TimeStep` of a new episode if the previous one already ended.
    """
    if self._episode_ended:
      # The previous episode is over: the action is ignored and a new
      # episode is started instead.
      return self.reset()

    if self._reward_sign == 'original':
      self._reward = self._observation * action
    else:
      self._reward = -self._observation * action

    self._episode_ended = True
    return ts.termination(np.array(self._observation, dtype=np.int32), reward=self._reward)

Define a policy that detects the behavior of the underlying environment. There are three situations that the policy needs to handle:<br>
i. The agent has not yet detected which version of the environment is running.<br>ii. The agent detected that the original version of the environment is running.<br>iii. The agent detected that the flipped version of the environment is running.

In [20]:
class Policy:
  """Action rule for a bandit whose reward sign may be flipped.

  The policy is in one of three states, stored in `_state`:

  * 'unknown'  - the environment version has not been identified yet.
  * 'original' - the reward is `observation * action`.
  * 'flipped'  - the reward is `-observation * action`.

  The policy never changes `_state` on its own: the observation alone
  carries no information about which version is running, so the state has
  to be set from outside once the reward sign has been seen.
  """

  def __init__(self):
    self._state = 'unknown'

  def get_action(self, observation):
    """Returns the action (0 or 2) to play for `observation`.

    Args:
      observation: The observed value; only its sign is used.

    Returns:
      * 'unknown': always 2, so that a non-zero observation produces a
        non-zero reward whose sign reveals the environment version.
      * 'original': 0 for a negative observation, otherwise 2.
      * 'flipped': 0 for a positive observation, otherwise 2.
    """
    if self._state == 'original':
      # reward = observation * action: avoid multiplying negatives.
      return 0 if observation < 0 else 2
    if self._state == 'flipped':
      # reward = -observation * action: avoid multiplying positives.
      return 0 if observation > 0 else 2
    # Unknown version: action 0 would always give reward 0 and reveal
    # nothing, so probe with the largest action instead.
    return 2

Define the agent that detects the sign of the environment and sets the policy appropriately.

In [21]:
class Agent:
  """Owns a `Policy` and switches its state according to the reward sign."""

  def __init__(self):
    self._policy = Policy()

  def update_policy(self, reward_sign):
    """Puts the policy into the state matching `reward_sign`.

    Args:
      reward_sign: 'original' selects the 'original' state; any other value
        selects the 'flipped' state.
    """
    self._policy._state = 'original' if reward_sign == 'original' else 'flipped'

  def get_action(self, observation):
    """Returns the action the wrapped policy picks for `observation`."""
    policy = self._policy
    return policy.get_action(observation)