## Group 7 - Yang Songyu, Qi, Yuehan, Kung Jeffrey
## The Taxi game 

In [23]:
#import the libraries for the environment
import gym
import numpy as np
from collections import namedtuple, deque
import random
import tensorflow as tf
%pip install pyvirtualdisplay
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import MSE, MAE
from tensorflow.keras.optimizers import Adam

# Set up the environment.
# `env` is created once here and used as a module-level global by the
# training code further down.
ENV_NAME = "Taxi-v3"
env = gym.make(ENV_NAME)
# Reset once so the environment has an initial state; as the last expression
# of the cell, the returned observation is echoed in the notebook output.
env.reset()

Note: you may need to restart the kernel to use updated packages.


361

# Q-learning Implementation- 

# Monte Carlo Implementation-

# DQN Implementation - Yang Songyu

In [24]:
# Set global random seeds.
# TensorFlow's seed alone is not enough: the epsilon-greedy policy and the
# replay sampling below draw from Python's `random` module, so that generator
# is seeded as well. NumPy's global generator is seeded in case any cell draws
# from it. The gym environment is not seeded here.
SEED = 0
tf.random.set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
# Set up the hyperparameters.
MEMORY_SIZE = 800_000     # maximum number of transitions kept in the replay buffer
GAMMA = 0.95              # discount factor handed to agent_learn()
ALPHA = 0.1               # NOTE(review): not referenced by the DQN code visible here - confirm it is still needed
NUM_STEPS_FOR_UPDATE = 4  # a learning step is attempted every this many environment steps

In [25]:
# One replay-buffer record: the (state, action, reward, next_state, done)
# transition that train() appends after every environment step.
experience = namedtuple("Experience", 
            field_names=["state", "action", "reward", "next_state", "done"])

# The gym library already defines the action and state spaces for the Taxi game,
# so their sizes are read from the environment rather than assigned by hand.
state_size = env.observation_space.n
action_size = env.action_space.n
print("Number of actions:", action_size)
print("Number of states:",state_size)

Number of actions: 6
Number of states: 500


In [26]:
# Both the online network (`model`) and the target network (`target_model`)
# share one architecture, so it is defined once and instantiated twice.
def _build_q_network(num_states, num_actions):
  """Return a fresh, independently initialised Q-network.

  The network takes a vector of length ``num_states`` (the training loop feeds
  it one-hot encoded states), passes it through three hidden Dense layers of
  128 ReLU units each, and ends in a linear Dense layer with one output per
  action.
  """
  return Sequential([
      # The shape is given as a tuple; a bare int is not accepted by every Keras version.
      Input(shape=(num_states,)),
      Dense(128, activation="relu"),
      Dense(128, activation="relu"),
      Dense(128, activation="relu"),
      Dense(num_actions, activation="linear"),
  ])


model = _build_q_network(state_size, action_size)
target_model = _build_q_network(state_size, action_size)
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()
# use Adam to optimize the error
optimizer = Adam(learning_rate=0.001)


Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_24 (Dense)             (None, 128)               64128     
_________________________________________________________________
dense_25 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_26 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_27 (Dense)             (None, 6)                 774       
Total params: 97,926
Trainable params: 97,926
Non-trainable params: 0
_________________________________________________________________
None


In [27]:
def compute_loss(experiences, gamma, model, target_model):
  """Return the mean squared error between Bellman targets and predicted Q-values.

  Args:
    experiences: 5-tuple ``(states, actions, rewards, next_states, done_vals)``,
      as produced by ``get_experiences``.
    gamma: discount factor applied to the bootstrapped next-state value.
    model: online Q-network, evaluated on ``states``.
    target_model: target Q-network, evaluated on ``next_states``.

  Returns:
    The value of ``MSE(y_targets, q_values)``.
  """
  states, actions, rewards, next_states, done_vals = experiences

  # Bellman target: r + gamma * max_a' Q_target(s', a'), with the bootstrap
  # term zeroed out where done_vals is 1.
  best_next_q = tf.reduce_max(target_model(next_states), axis=-1)
  y_targets = rewards + (gamma * best_next_q * (1 - done_vals))

  # Pick, for every row of the batch, the Q-value of the action actually taken.
  all_q_values = model(states)
  row_ids = tf.range(all_q_values.shape[0])
  action_ids = tf.cast(actions, tf.int32)
  taken_q_values = tf.gather_nd(all_q_values,
                                tf.stack([row_ids, action_ids], axis=1))

  return MSE(y_targets, taken_q_values)

def update_target_network(model, target_model, tau=0.01):
  """Soft-update the target network towards the online network, in place.

  Every target weight is overwritten with
  ``tau * online_weight + (1 - tau) * target_weight``.

  Args:
    model: online network; its ``weights`` are read.
    target_model: target network; each entry of its ``weights`` is updated
      through ``assign``.
    tau: interpolation factor. The default 0.01 moves the target 1% of the way
      towards the online weights per call; 1.0 copies them outright and 0.0
      leaves the target unchanged.
  """
  for target_weights, model_weights in zip(target_model.weights, model.weights):
    target_weights.assign(tau * model_weights + (1.0 - tau) * target_weights)

def agent_learn(experiences, gamma, model, target_model, optimizer):
  """Perform one gradient step on the online network, then refresh the target.

  The loss from ``compute_loss`` is differentiated with respect to the online
  network's trainable variables, the optimizer applies those gradients, and
  finally ``update_target_network`` is called.

  Args:
    experiences: batch tuple forwarded unchanged to ``compute_loss``.
    gamma: discount factor forwarded to ``compute_loss``.
    model: online Q-network whose trainable variables are updated.
    target_model: target Q-network, updated after the gradient step.
    optimizer: optimizer whose ``apply_gradients`` performs the update.
  """
  trainable = model.trainable_variables
  with tf.GradientTape() as tape:
    loss = compute_loss(experiences, gamma, model, target_model)
  grads = tape.gradient(loss, trainable)
  optimizer.apply_gradients(zip(grads, trainable))
  update_target_network(model, target_model)

def get_action(q_values, epsilon=0):
  """Choose an action with an epsilon-greedy policy.

  Args:
    q_values: Q-values for a single state with a leading batch axis of size 1,
      i.e. ``q_values[0]`` holds one value per action. May be a tensor that
      exposes ``.numpy()`` or any array-like.
    epsilon: probability of exploring. With the default 0 the greedy action is
      taken (almost surely); with 1 the action is always random.

  Returns:
    Index of the selected action.
  """
  if hasattr(q_values, "numpy"):
    action_values = q_values.numpy()[0]
  else:
    action_values = np.asarray(q_values)[0]

  if random.random() > epsilon:
    # exploit: best known action for this state
    return np.argmax(action_values)
  # explore: uniform over however many actions the network outputs, rather
  # than a hard-coded count
  return random.randrange(len(action_values))

def check_update_conditions(iter, NUM_STEPS_FOR_UPDATE, memory_buffer):
  """Tell whether a learning step should run at this time step.

  Returns True only when ``iter + 1`` is a multiple of
  ``NUM_STEPS_FOR_UPDATE`` and ``memory_buffer`` holds more than 128 entries;
  otherwise returns False.
  """
  # Not on an update step: no need to look at the buffer at all.
  if (iter + 1) % NUM_STEPS_FOR_UPDATE != 0:
    return False
  return len(memory_buffer) > 128

def get_experiences(memory_buffer):
    """Draw a random mini-batch of 64 experiences and convert it to tensors.

    Entries that are ``None`` are skipped. Each field of the remaining
    experiences is stacked into a NumPy array and converted to a float32
    tensor; the ``done`` flags go through uint8 first.

    Returns:
        Tuple ``(states, actions, rewards, next_states, done_vals)``.
    """
    batch = [e for e in random.sample(memory_buffer, k=64) if e is not None]

    def as_float_tensor(values):
        return tf.convert_to_tensor(np.array(values), dtype=tf.float32)

    states = as_float_tensor([e.state for e in batch])
    actions = as_float_tensor([e.action for e in batch])
    rewards = as_float_tensor([e.reward for e in batch])
    next_states = as_float_tensor([e.next_state for e in batch])
    done_flags = np.array([e.done for e in batch]).astype(np.uint8)
    done_vals = tf.convert_to_tensor(done_flags, dtype=tf.float32)
    return (states, actions, rewards, next_states, done_vals)

def get_new_epsilon(epsilon, e_min=0.01, e_decay=0.995):
  """Return the exploration rate to use after one decay step.

  Epsilon is multiplied by ``e_decay`` and clipped from below at ``e_min``.
  The factor has to be close to 1 for exploration to fade gradually: with a
  factor such as 0.005, any starting epsilon drops to the floor after a
  single call.

  Args:
    epsilon: current exploration rate.
    e_min: lower bound that the result never goes below.
    e_decay: multiplicative decay factor applied per call.

  Returns:
    ``max(e_min, e_decay * epsilon)``.
  """
  return max(e_min, e_decay * epsilon)

def get_one_hot_encoding(state, next_state, num_states=500):
  """One-hot encode a pair of discrete state indices.

  Args:
    state: integer index of the current state.
    next_state: integer index of the following state.
    num_states: length of each encoding vector. Defaults to 500, the number of
      states this notebook prints for Taxi-v3.

  Returns:
    Tuple ``(state_arr, next_state_arr)`` of 1-D arrays of length
    ``num_states`` that are all zeros except for a 1 at the given index.
  """
  state_arr = np.zeros(num_states)
  next_state_arr = np.zeros(num_states)
  state_arr[state] = 1
  next_state_arr[next_state] = 1
  return state_arr, next_state_arr

In [28]:
def train(NUM_EPISODES, MAX_TIMESTEPS):
  """Train the global DQN ``model`` on the global ``env`` and print progress.

  Runs ``NUM_EPISODES`` episodes of at most ``MAX_TIMESTEPS`` steps each.
  Every step the agent acts epsilon-greedily, stores the transition in a
  replay buffer and, when ``check_update_conditions`` allows it, learns from
  a sampled mini-batch. After each episode epsilon is decayed and the mean
  total reward of the last 100 episodes is printed on a single,
  continuously overwritten line. Nothing is returned.

  Args:
    NUM_EPISODES: number of episodes to play.
    MAX_TIMESTEPS: cap on the number of environment steps per episode.
  """
  replay_buffer = deque(maxlen=MEMORY_SIZE)
  # Start the target network from exactly the same weights as the online one.
  target_model.set_weights(model.get_weights())
  # NOTE(review): exploration starts at only 2% - confirm this is intended.
  epsilon = 0.02
  points_history = []

  for i in range(NUM_EPISODES):
    # Only the first element of the returned pair is needed here.
    state, _ = get_one_hot_encoding(env.reset(), 0)
    total_points = 0

    for step in range(MAX_TIMESTEPS):
      # The network expects a batch axis, so add one in front of the state.
      q_values = model(np.expand_dims(state, axis=0))
      action = get_action(q_values, epsilon)
      raw_next_state, reward, done, _ = env.step(action)
      # Only the second element of the returned pair is needed here.
      _, next_state = get_one_hot_encoding(0, raw_next_state)
      replay_buffer.append(experience(state, action, reward, next_state, done))

      if check_update_conditions(step, NUM_STEPS_FOR_UPDATE, replay_buffer):
        agent_learn(get_experiences(replay_buffer), GAMMA, model, target_model, optimizer)

      state = next_state.copy()
      total_points += reward
      if done:
        break

    points_history.append(total_points)
    avg_points = np.mean(points_history[-100:])
    epsilon = get_new_epsilon(epsilon)
    print(f"\rEpisode {i+1} | Total point average of the last {100} episodes: {avg_points:.2f}", end="")

In [30]:
# Call the train function: 1000 episodes, each capped at 100 environment steps.
train(1000,100)

Episode 1000 | Total point average of the last 100 episodes: -102.52