In [3]:
# import dependencies
import gym
import random
import numpy as np

In [4]:
# Define the environment

env = gym.make("Taxi-v3")

# Initialize the q-table with zero values
state_size = env.observation_space.n
action_size = env.action_space.n
qtable = np.zeros((state_size, action_size))


In [5]:
# Hyperparameters
learning_rate  = 0.9  # Learning-rate
discount_rate = 0.8   # Discount-factor

epsilon = 1           # Exploration rate
max_epsilon = 1.0     # Exploration probability at start
min_epsilon = 0.01    # Minimum exploration probability 
decay_rate= 0.005     # Exponential decay rate for exploration prob

# training variables
total_episodes = 100000 # Total number of episodes
max_steps = 100       # Max steps per episode


In [6]:
# training
# Perform 100,000 episodes
for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()[0]
    terminated = False

    # print(f"step: {episode} out of {total_episodes}")

    # Decrease epsilon
    epsilon = np.exp(-decay_rate*episode) # Update epsilon
    
    # Loop 
    for step in range(max_steps):

        # exploration-exploitation tradeoff
        if random.uniform(0,1) < epsilon:
            # explore
            action = env.action_space.sample() # Explore the action space
        else:
            # exploit
            action = np.argmax(qtable[state,:]) # Exploit learned values

        # Apply the action and see what happens, observe reward
        new_state, reward, terminated, truncated, info = env.step(action) 
        
        current_value = qtable[state, action]  # current Q-value for the state/action couple
        next_max = np.max(qtable[new_state])  # next best Q-value
        
        # Compute the new Q-value with the Bellman equation
        qtable[state, action] = current_value + learning_rate * (reward + discount_rate * next_max - current_value )

        # Update our current state
        state = new_state
    
        # If terminated (if we're dead) : finish episode
        if terminated == True: 
            break

    # Update epsilon
    epsilon = max(epsilon - decay_rate, 0)

  if not isinstance(terminated, (bool, np.bool8)):


In [7]:
print(f"Training completed over {total_episodes} episodes")
print('Q-table after training:')
print(qtable)

Training completed over 100000 episodes
Q-table after training:
[[  0.           0.           0.           0.           0.
    0.        ]
 [ -3.78665092  -2.32697494  -3.77265608  -3.65840259  -1.6445568
  -11.31795219]
 [  0.20423483   1.53692134  -1.85451165   1.08309372   3.192
   -7.49618211]
 ...
 [ -2.3097276   -1.7983998   -2.24316     -2.3089896  -11.07936
  -11.36955744]
 [ -3.81037232  -3.57229073  -3.80735958  -3.84037219  -9.99
  -12.49217777]
 [ -1.7118       7.7998847    4.2566853   -0.999       -9.
   -9.        ]]


In [8]:

env = gym.make("Taxi-v3")
# env = gym.make("Taxi-v3", render_mode="human")
env.reset()

episodes = 100
success_rate = []
steps = []

# Evaluation
for episode in range(episodes):
    state = env.reset()[0]
    step = 0
    terminated = False
    
    # Until the agent gets stuck or reaches the goal, keep training it
    for step in range(max_steps):
        # Choose the action with the highest value in the current state
        action = np.argmax(qtable[state,:])

        # Implement this action and move the agent in the desired direction
        new_state, reward, terminated, truncated, info = env.step(action)
        
        if terminated:
            success_rate.append(int(reward == 20))
            steps.append(step)
            
            break

        # Update our current state
        state = new_state

env.close()

# Let's check our success rate!
print("Mean Step:", np.mean(steps))
print("Success rate:", np.mean(success_rate)*100, "%")

Mean Step: 12.36
Success rate: 100.0 %
