# In [2]:
import numpy as np
import random
from constants import ALL_DESTINATIONS
# This agent is only for 'go to' command. It outputs the best action from 0 to 3.
class QLearningAgent:
    """Tabular Q-learning agent for the 'go to' command.

    The Q-table is indexed as ``q_table[state, action, destination]``, so a
    separate action-value slice is kept for every destination index.

    Args:
        env: Environment exposing ``observation_space.n`` (number of discrete
            states) and ``action_space.sample()`` (random action).
        learning_rate: Step size applied to each temporal-difference update.
        discount_factor: Weight given to the best next-state value.
        exploration_rate: Initial probability of taking a random action; it is
            also stored as the maximum used by the decay schedule.
        num_destinations: Size of the destination axis of the Q-table.
            Defaults to ``len(ALL_DESTINATIONS)``.
    """

    # Number of low-level actions the Q-table stores values for (actions 0..3).
    # NOTE(review): ``env.action_space.sample()`` is assumed to return only
    # values in this range -- confirm against the environment definition.
    NUM_ACTIONS = 4

    def __init__(self, env, learning_rate=0.1, discount_factor=0.99, exploration_rate=1.0,
                 num_destinations=None):
        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.max_exploration_rate = exploration_rate
        self.min_exploration_rate = 0.01
        self.exploration_decay_rate = 0.001
        if num_destinations is None:
            # The original passed the bound method ``ALL_DESTINATIONS.__len__``
            # (never called) as a dimension, which makes ``np.zeros`` raise.
            num_destinations = len(ALL_DESTINATIONS)
        # Initialize Q-table: states x actions x destinations, all zeros.
        self.q_table = np.zeros(
            (env.observation_space.n, self.NUM_ACTIONS, num_destinations)
        )

    def choose_action(self, state, destination):
        """Return a low-level action for ``state`` using an epsilon-greedy rule.

        With probability ``exploration_rate`` a random action is sampled from
        the environment; otherwise the action with the highest Q-value for the
        given state/destination pair is returned.
        """
        if random.uniform(0, 1) < self.exploration_rate:
            return self.env.action_space.sample()  # Explore
        return np.argmax(self.q_table[state, :, destination])  # Exploit learned values

    def learn(self, state, action, reward, next_state, destination):
        """Apply one Q-learning update for the observed transition.

        The entry ``q_table[state, action, destination]`` is moved towards
        ``reward + discount_factor * max_a q_table[next_state, a, destination]``
        by a fraction ``learning_rate`` of the difference.
        """
        predict = self.q_table[state, action, destination]
        target = reward + self.discount_factor * np.max(self.q_table[next_state, :, destination])
        self.q_table[state, action, destination] += self.learning_rate * (target - predict)

    def decay_exploration_rate(self, episode, total_episodes):
        """Set the exploration rate from an exponential decay in ``episode``.

        ``total_episodes`` is accepted for interface compatibility but is not
        used by the schedule.
        """
        span = self.max_exploration_rate - self.min_exploration_rate
        self.exploration_rate = (
            self.min_exploration_rate
            + span * np.exp(-self.exploration_decay_rate * episode)
        )



# In [None]:
# There is no need to train the agent. Simply call the go_to function.
def train_agent(env, episodes=1000, destination=None):
    """Train a fresh ``QLearningAgent`` on ``env`` and return it.

    Args:
        env: Environment used for training. It must provide ``step`` returning
            ``(observation, reward, done, info)`` and ``_get_obs``.
            NOTE(review): a gym-style ``reset()`` is assumed here -- confirm.
        episodes: Number of training episodes to run.
        destination: Destination index (third axis of the Q-table) to train
            for. When ``None`` a random index into ``ALL_DESTINATIONS`` is
            drawn at the start of every episode.
            NOTE(review): this presumes the environment's reward reflects the
            chosen destination -- verify against the environment.

    Returns:
        The trained ``QLearningAgent``.
    """
    agent = QLearningAgent(env)
    for episode in range(episodes):
        if destination is None:
            dest_index = random.randrange(len(ALL_DESTINATIONS))
        else:
            dest_index = destination

        # Start every episode from a fresh environment state; the original
        # read ``state`` before it was ever assigned.
        env.reset()
        state = compute_state_index(env, env._get_obs())
        done = False

        while not done:
            action = agent.choose_action(state, dest_index)
            observation, reward, done, info = env.step(action)
            # Observations must be flattened to an integer before they can
            # index the Q-table.
            next_state = compute_state_index(env, observation)
            agent.learn(state, action, reward, next_state, dest_index)
            state = next_state

        agent.decay_exploration_rate(episode, episodes)
    return agent

# After training, the agent's Q-table will be updated

# In [None]:
def compute_dest_index(dest):
    """Return the position of ``dest`` within ``ALL_DESTINATIONS``."""
    return ALL_DESTINATIONS.index(dest)

def compute_state_index(env, observation):
    """Flatten the agent's position into a single integer state index.

    The first two entries of ``observation["Agent"]`` are combined as
    ``first * env.size + second``.
    """
    grid_size = env.size
    position = observation["Agent"]
    return position[0] * grid_size + position[1]

# In [None]:
# For go to command, this function returns a list of actions that agent performs to get to the target
def go_to(env, target, agent=None, max_steps=None):
    """Step ``env`` until ``target`` is reached and return the actions taken.

    Args:
        env: Environment providing ``_get_obs``, ``_verify_go_to`` and ``step``
            (returning ``(observation, reward, done, info)``).
        target: Destination to reach; must be an element of
            ``ALL_DESTINATIONS``.
        agent: Optional existing ``QLearningAgent`` to act with. When omitted a
            new agent is created, which starts with an all-zero Q-table and an
            exploration rate of 1.0, i.e. it acts randomly.
        max_steps: Optional upper bound on the number of actions. ``None``
            keeps stepping until ``env._verify_go_to(target)`` is true.

    Returns:
        The list of actions performed, which is also printed.
    """
    dest_index = compute_dest_index(target)
    if agent is None:
        agent = QLearningAgent(env)
    action_taken = []
    observation = env._get_obs()
    while not env._verify_go_to(target):
        if max_steps is not None and len(action_taken) >= max_steps:
            break  # Give up instead of looping forever.
        state_index = compute_state_index(env, observation)
        action = agent.choose_action(state_index, dest_index)
        action_taken.append(action)
        # NOTE(review): ``done`` is ignored, as in the original; the loop is
        # governed only by ``_verify_go_to`` and ``max_steps``.
        observation, reward, done, info = env.step(action)
        next_state_index = compute_state_index(env, observation)
        # Keep learning online so that a caller-supplied agent improves.
        agent.learn(state_index, action, reward, next_state_index, dest_index)
    print(action_taken)
    return action_taken