In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import tensorflow as tf
import numpy as np

from itertools import product

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment, random_py_environment, random_tf_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.trajectories import time_step as ts
from tf_agents.utils import common

from tf_agents.networks import encoding_network


from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network, network
from tf_agents.networks.utils import BatchSquash
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

from tf_agents.utils import common as common_utils
from tf_agents.utils import nest_utils
from tf_agents.specs import tensor_spec

import collections

import gin
import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import
from tf_agents.agents import tf_agent
from tf_agents.policies import boltzmann_policy
from tf_agents.policies import epsilon_greedy_policy
from tf_agents.policies import greedy_policy
from tf_agents.policies import q_policy
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.utils import composite
from tf_agents.utils import eager_utils
from tf_agents.utils import nest_utils
from tf_agents.utils import training as training_lib
from tf_agents.utils import value_ops
from tf_agents.networks import actor_distribution_network
from tf_agents.agents.reinforce import reinforce_agent




import matplotlib.pyplot as plt

import tensorflow as tf

tf.compat.v1.enable_v2_behavior()

In [2]:
class MM1NQueue(object):
    '''
    define a M/M/1/N queue, where N = max queue size
    '''
    def __init__(self, ini_num_pkt, max_num_pkt, arrival_rate, service_rate):
        self.ini_num_pkt = ini_num_pkt
        self.max_num_pkt = max_num_pkt
        self.arrival_rate = float(arrival_rate)
        self.service_rate = float(service_rate)

    def reset_ini_num_pkt(self, ini_num_pkt):
        '''
        reset the initial number of packets
        '''
        self.ini_num_pkt = ini_num_pkt

    def run_one_unit_time(self):
        '''
        run the queue for one unit time and return the (queue_length, num_sevice) tuple
        '''
        queue_length = self.ini_num_pkt
        time = 0
        num_service = 0
        while True:
            t_arrival = np.random.exponential(scale=1/self.arrival_rate)
            t_service = np.random.exponential(scale=1/self.service_rate)
            if t_arrival > t_service:
                time = time + t_service
                if time > 1:
                    break
                if queue_length:
                    num_service += 1
                    queue_length -= 1
            else:
                time = time + t_arrival
                if time > 1:
                    break
                queue_length = min(self.max_num_pkt, queue_length+1)
        return queue_length, num_service

    def run_multiple_unit_slots(self):
        '''
        invoke self.run_one_unit_time multiple times and obtain the following
        statistics:
        end_queue_length_frequency: a list with size self.max_num_pkt+1,
                                    end_queue_length_frequency[n] is the frequency
                                    of the queue length = n at the end of one unit time
        average_num_service: the average number of services at the end of one unit time
        '''
        end_queue_length_frequency = np.array([0]*(self.max_num_pkt+1), dtype=float)
        average_num_service = 0.0
        num_runs = 10000
        for _ in range(num_runs):
            (queue_length, num_service) = self.run_one_unit_time()
            end_queue_length_frequency[queue_length] += 1
            average_num_service += num_service
        average_num_service = average_num_service/num_runs
        end_queue_length_frequency = end_queue_length_frequency/float(num_runs)
        return (end_queue_length_frequency, average_num_service)

In [15]:
class CarRentalEnv(py_environment.PyEnvironment):
    def __init__(self, locations=2, max_cars=10, max_days=100):
        self.locations = locations
        self.max_cars = max_cars
        self.max_days = max_days

        self._action_spec = array_spec.BoundedArraySpec(shape=(locations, locations),
                                                        dtype=np.float32,
                                                        minimum = -np.ones((locations, locations), dtype=np.float32)*max_cars,
                                                        maximum = np.ones((locations, locations), dtype=np.float32)*max_cars,
                                                        name='action')

        self._observation_spec = array_spec.BoundedArraySpec(shape=(locations, 2),
                                                             dtype=np.float32,
                                                             minimum=np.zeros((locations,2), dtype=np.float32),
                                                             maximum=np.ones((locations,2), dtype=np.float32)*max_cars,
                                                             name='observation')

        self._state = np.zeros((locations, 2), dtype=np.float32)
        self._episode_ended = False
        self._num_days = 0
        self.max_days = 10

        self.location_queues = []
        self.expected_rewards = []

        for i in range(locations):
            self.location_queues.append(MM1NQueue(0, max_cars+1, np.random.randint(1, 5),
                                             np.random.randint(1, 5)))
            self.expected_rewards.append(np.zeros(shape=(self.max_cars + 1)))

        self.calc_all_rewards()

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        self._state = np.zeros((self.locations, 2), dtype=np.float32)
        self._episode_ended = False
        self._num_days = 0
        return ts.restart(np.array(self._state, dtype=np.float32))        

    def _step(self, action):

        if self._episode_ended:
            return self.reset()

        self.move(action)
        reward = self.get_daily_reward()

        self._num_days += 1

        if self.game_over():
            self._episode_ended = True
            return ts.termination(np.array(self._state, dtype=np.float32), reward)
        else:
            return ts.transition(
                np.array(self._state, dtype=np.float32), reward=reward, discount=0.9)

    def calc_all_rewards(self):
        '''
        Reward of a morning state.
        '''
        for i, queue in enumerate(self.location_queues):
            for ini_num_car in range(self.max_cars + 1):
                queue.reset_ini_num_pkt(ini_num_car)
                probs, reward = queue.run_multiple_unit_slots()
                self.expected_rewards[i][ini_num_car] = 10*reward

    def get_daily_reward(self):
        """Gets the reward after a move has been made"""
        total_reward = 0.
        for loc in range(self.locations):
            num_cars = int(self._state[loc, 1])
            total_reward += self.expected_rewards[loc][num_cars]
        return total_reward

    def move(self, action):
        """
        """
        self._state = np.random.uniform(self.max_cars, size=(self.locations, 2)).astype(np.float32)

    def game_over(self):
        return self._num_days > self.max_days

In [16]:
np.random.uniform(10, size=(5, 5))

array([[6.96205797, 9.32703795, 6.47160836, 1.43589805, 4.62084007],
       [2.72276102, 8.65924479, 1.32450843, 4.4273154 , 6.58971537],
       [9.27760973, 4.20312408, 1.97592106, 8.64002272, 7.32643723],
       [6.97344278, 5.32473809, 3.29134788, 7.3730156 , 3.85720519],
       [1.09244375, 8.74075784, 2.84598796, 2.31855688, 1.172303  ]])

In [17]:
locations = 3

env = CarRentalEnv(locations)
utils.validate_py_environment(env, episodes=2)

tf_env = tf_py_environment.TFPyEnvironment(env)

In [18]:
action = np.random.uniform(0, 10, size=(1, locations, locations))
print(action)
time_step = tf_env.step(action.astype(np.float32))
print(time_step.observation, time_step.reward)
time_step = tf_env.step(action.astype(np.float32))
print(time_step.observation, time_step.reward)

[[[6.21958089 0.02405831 6.74314902]
  [4.88577975 6.1918605  7.15998257]
  [1.20955561 6.2902481  8.92958055]]]
tf.Tensor(
[[[0. 0.]
  [0. 0.]
  [0. 0.]]], shape=(1, 3, 2), dtype=float32) tf.Tensor([0.], shape=(1,), dtype=float32)
tf.Tensor(
[[[2.825517  6.4465   ]
  [9.594267  8.008684 ]
  [8.235454  2.9357452]]], shape=(1, 3, 2), dtype=float32) tf.Tensor([82.063], shape=(1,), dtype=float32)


In [19]:
print(tf_env.time_step_spec(), tf_env.action_spec())

TimeStep(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)), observation=BoundedTensorSpec(shape=(3, 2), dtype=tf.float32, name='observation', minimum=array([[0., 0.],
       [0., 0.],
       [0., 0.]], dtype=float32), maximum=array([[10., 10.],
       [10., 10.],
       [10., 10.]], dtype=float32))) BoundedTensorSpec(shape=(3, 3), dtype=tf.float32, name='action', minimum=array([[-10., -10., -10.],
       [-10., -10., -10.],
       [-10., -10., -10.]], dtype=float32), maximum=array([[10., 10., 10.],
       [10., 10., 10.],
       [10., 10., 10.]], dtype=float32))


In [20]:
locations = 2

env = CarRentalEnv(locations)
utils.validate_py_environment(env, episodes=2)

train_env = tf_py_environment.TFPyEnvironment(env)

env = CarRentalEnv(locations)
utils.validate_py_environment(env, episodes=2)

eval_env = tf_py_environment.TFPyEnvironment(env)

In [21]:
actor_net = actor_distribution_network.ActorDistributionNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=(20, 20))

In [22]:
learning_rate = 0.0001

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.compat.v2.Variable(0)

tf_agent = reinforce_agent.ReinforceAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    actor_network=actor_net,
    optimizer=optimizer,
    normalize_returns=True,
    train_step_counter=train_step_counter)
tf_agent.initialize()

In [23]:
eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy

In [24]:
replay_buffer_capacity = 1000

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_capacity)

In [25]:

#@test {"skip": true}

def collect_episode(environment, policy, num_episodes):

  episode_counter = 0
  environment.reset()

  while episode_counter < num_episodes:
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    replay_buffer.add_batch(traj)

    if traj.is_boundary():
      episode_counter += 1
    
    

#@test {"skip": true}
def compute_avg_return(environment, policy, num_episodes=10):

  total_return = 0.0
  for _ in range(num_episodes):

    time_step = environment.reset()
    episode_return = 0.0

    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]

In [26]:
num_eval_episodes = 2
# (Optional) Optimize by wrapping some of the code in a graph using TF function.
tf_agent.train = common.function(tf_agent.train)

# Reset the train step
tf_agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
returns = [avg_return]

In [27]:
print(returns)

[586.7744]


In [28]:
num_iterations = 250 # @param {type:"integer"}
collect_episodes_per_iteration = 2 # @param {type:"integer"}
replay_buffer_capacity = 2000 # @param {type:"integer"}

fc_layer_params = (10,10)


log_interval = 25 # @param {type:"integer"}
num_eval_episodes = 10 # @param {type:"integer"}
eval_interval = 50 # @param {type:"integer"}

for _ in range(num_iterations):

  # Collect a few episodes using collect_policy and save to the replay buffer.
  collect_episode(
      train_env, tf_agent.collect_policy, collect_episodes_per_iteration)

  # Use data from the buffer and update the agent's network.
  experience = replay_buffer.gather_all()
  train_loss = tf_agent.train(experience)
  replay_buffer.clear()

  step = tf_agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss.loss))

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)

step = 25: loss = 5.090640068054199
step = 50: loss = 3.620675563812256
step = 50: Average Return = 590.1820068359375
step = 75: loss = -0.9956367015838623
step = 100: loss = -0.643332839012146
step = 100: Average Return = 584.0836181640625
step = 125: loss = -3.1457419395446777


KeyboardInterrupt: 