**SHREYA AKURATHI**
**200968188**
**WEEK 5**

In [1]:
pip install tf-agents

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import tensorflow as tf
import tensorflow_probability as tfp
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.policies import random_tf_policy
from tf_agents.drivers import dynamic_step_driver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.agents.dqn import dqn_agent
from tf_agents.networks.q_network import QNetwork
from tf_agents.utils import common
import numpy as np


Exercise 1 -Create a environment  

1.  for  which  the  observation  is  a  random  integer  between -5and 5,  there  are  3 possible actions (0, 1, 2), and the reward is the product of the action and the observation.
2. Define anoptimal policy manually. The action only depends on the sign of the observation, 0 when is negative and 2 when is positive.
3. Request  for 50observations  from  the  environment, compute  and  print the total reward.

In [3]:
class BanditPyEnvironment(py_environment.PyEnvironment):
    def __init__(self):
        self._action_spec = array_spec.BoundedArraySpec(shape=(), dtype=int, minimum=0, maximum=2, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(shape=(1,), dtype=int, minimum=-5, maximum=5, name='observation')
        self._state = None
        self._episode_ended = False
        self._observation = None
        self._reward = None
    
    def action_spec(self):
        return self._action_spec
    
    def observation_spec(self):
        return self._observation_spec
    
    def _reset(self):
        self._episode_ended = False
        self._observation = np.random.randint(low=-5, high=6)
        self._reward = 0
        return ts.restart(np.array(self._observation, dtype=np.int32))
    
    def _step(self, action):
      if self._episode_ended:
        return self.reset()

      self._reward = self._observation * action
      self._episode_ended = True
      return ts.termination(np.array(self._observation, dtype=np.int32), reward=self._reward)



In [4]:
env = BanditPyEnvironment()

In [5]:
time_step = env.reset()
print(time_step)

TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array(-5, dtype=int32),
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)})


In [6]:
action = tf.constant(1, dtype=tf.int32)
next_time_step = env.step(action)
print(next_time_step)

TimeStep(
{'discount': array(0., dtype=float32),
 'observation': array(-5, dtype=int32),
 'reward': array(-5., dtype=float32),
 'step_type': array(2, dtype=int32)})


In [7]:
def optimal_policy(observation):
    if observation < 0:
        return tf.constant(0, dtype=tf.int32)
    else:
        return tf.constant(2, dtype=tf.int32)

In [10]:
total_reward = 0.0 
time_step = env.reset()
for _ in range(50):
    action = optimal_policy(time_step.observation)
    time_step = env.step(action)
    total_reward += time_step.reward

print('Total reward:', total_reward)  


Total reward: 76.0


3.Exercise 2 –Create an environment 


1. Define an environment will either always give reward = observation * action or reward = -observation * action. This will be decided when the environment is initialized.

2. Define a policy that detects the behavior of the underlying environment. There are three situations that the policy needs to handle:

> i.The agent has not detected know yet which version of the environment is running.ii.The  agent  detected  that  the  original  version  of  the  environment  is running.iii.The  agent  detected  that  the  flipped  version  of  the  environment  is running

3. Define the agent that detects the sign of the environment and sets the policy appropriately





In [11]:
class CustomEnvironment(py_environment.PyEnvironment):
    def __init__(self, reward_sign):
        self._action_spec = array_spec.BoundedArraySpec(shape=(), dtype=int, minimum=0, maximum=2, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(shape=(1,), dtype=int, minimum=-5, maximum=5, name='observation')
        self._state = None
        self._episode_ended = False
        self._observation = None
        self._reward_sign = reward_sign

    
    def action_spec(self):
        return self._action_spec
    
    def observation_spec(self):
        return self._observation_spec
    
    def _reset(self):
      self._episode_ended = False
      self._observation = np.random.randint(low=-5, high=6)
      self._reward = 0
      return ts.restart(np.array(self._observation, dtype=np.int32))
    
    def _step(self, action):
        if self._episode_ended:
            return self.reset()
        if self._reward_sign == 'original':
          self._reward = self._observation * action
        else:
          self._reward = -self._observation * action

        self._episode_ended = True
        return ts.termination(np.array(self._observation, dtype=np.int32), reward=self._reward)

        


In [12]:
class Policy:
  def __init__(self):
    self._state = 'unknown'

  def get_action(self, observation):
    if self._state == 'unknown':
      if observation >= 0:
        self._state = 'original'
        return 2
      else:
        self._state = 'flipped'
        return 0
    elif self._state == 'original':
      return 2
    else:
      return 0

In [13]:
class Agent:
  def __init__(self):
    self._policy = Policy()

  def update_policy(self, reward_sign):
    if reward_sign == 'original':
      self._policy._state = 'original'
    else:
      self._policy._state = 'flipped'

  def get_action(self, observation):
    return self._policy.get_action(observation)

In [15]:
reward_env = CustomEnvironment(reward_sign='original')
agent = Agent()

total_reward = 0
for i in range(50):
    observation = reward_env.reset().observation
    agent.update_policy('original')
    action = agent.get_action(observation)
    time_step = reward_env.step(action)
    total_reward += time_step.reward
print("Total reward: ", total_reward)


Total reward:  4.0


The total reward should be positive since the optimal policy will result in positive rewards for positive observations and negative rewards for negative observations.

In [17]:
reward_env = CustomEnvironment(reward_sign='flipped')
agent = Agent()

total_reward = 0
for i in range(50):
    observation = reward_env.reset().observation
    agent.update_policy('flipped')
    action = agent.get_action(observation)
    time_step = reward_env.step(action)
    total_reward += time_step.reward
print("Total reward: ", total_reward)

Total reward:  0.0


This defines an environment where the reward is always 0, regardless of the action and observation. Therefore, the total reward in this environment should always be 0.
