In [1]:
'''
    File: taxiproblem.ipynb
    
    The Taxi Problem using OpenAI Gym.
    Agent is made to solve the environment with a random policy and then with an "optimal" policy.
'''

from IPython.display import clear_output
import gym as g
import random as rnd
import numpy as np
import time

In [2]:
# Storing the Taxi v3 environment into env.
# `.env` takes the inner environment out of the object returned by make()
# (presumably to drop gym's per-episode step limit -- TODO confirm against the installed gym version).
env = g.make("Taxi-v3").env

In [3]:
''' Baseline: the agent acts with a random policy until the episode ends. '''

env.s = 328 # put the environment into one specific, known state (328)

frames = [] # one record per timestep, replayed later by the animation cell
penalties, reward = 0, 0
done = False

while not done:
    # pick a random action from the possible actions and advance one timestep with it
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # a reward of -10 means the agent tried to drop-off or pick-up in the
    # incorrect state; count each such attempt as a penalty
    penalties += int(reward == -10)

    # remember what happened at this timestep for animation purposes
    frames.append(dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward
    ))

# exactly one frame is stored per step, so the frame count is the step count
epochs = len(frames)

print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 2111
Penalties incurred: 731


In [4]:
''' Replaying the frames recorded during the random-policy run as a text animation '''

# timesteps are shown 1-based, so let enumerate start counting at 1
for timestep, record in enumerate(frames, start=1):
    clear_output(wait=True) # wait=True swaps the old frame out only when the new one is ready
    print(record['frame'])
    print("Timestep: {}".format(timestep))
    print("State: {}".format(record['state']))
    print("Action: {}".format(record['action']))
    print("Reward: {}".format(record['reward']))
    time.sleep(0.1) # short pause so each frame stays visible

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 2111
State: 0
Action: 5
Reward: 20


In [5]:
''' Having the agent solve the environment and learn by using and updating the q-table. '''

# init a q-table of zeros with one row per state and one column per action
# (a 500x6 matrix, as there's 500 states in the Taxi Problem and 6 actions);
# as the agent explores, the q-table will be modified to act as a kind of memory about the reward system
q_table = np.zeros([env.observation_space.n, env.action_space.n])

# hyperparameters, not optimized
alpha = 0.1    # learning rate: weight of the new estimate in each q-value update
gamma = 0.6    # discount factor applied to the best q-value of the next state
epsilon = 0.1  # probability of exploring (random action) instead of exploiting the table

# per-episode training history: timesteps taken and penalties incurred
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = env.reset()
    
    epochs, penalties, reward = 0, 0, 0
    done = False
    
    while not done:
        if rnd.uniform(0, 1) < epsilon:
            # if the uniform selection between 0 and 1 is less than epsilon then explore the action space
            action = env.action_space.sample()
        else:
            # otherwise exploit the values in the q-table
            action = np.argmax(q_table[state])
        
        # proceed to the next timestep, where the agent performs action and we progress to the new state
        # and the agent receives some reward
        next_state, reward, done, info = env.step(action)
        
        # q-value update: blend the old value with the observed reward plus the
        # discounted best value available from the next state
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value # write to q-table
        
        if reward == -10:
            # if the agent attempts to drop-off or pick-up in the incorrect state,
            # then a penalty is applied
            penalties += 1
        
        # next state and increment epoch
        state = next_state
        epochs += 1
    
    # record this episode's totals; previously these lists were created but never filled
    all_epochs.append(epochs)
    all_penalties.append(penalties)
    
    if i % 100 == 0:
        # refresh the progress line every 100 episodes instead of flooding the output
        clear_output(wait=True)
        print("Episode: {}".format(i))

print("Training complete.\n")

Episode: 100000
Training complete.



In [6]:
''' Evaluating the agent after Q-Learning by having the agent only exploit the table. '''

total_epochs, total_penalties = 0, 0
episodes = 100

# Safety cap on the length of one evaluation episode. The policy here is purely
# greedy, so if the q-table ever prefers an action that makes no progress the
# same choice is repeated forever. NOTE(review): env was taken via `.env`, which
# presumably removes gym's own step limit -- confirm for the installed version.
max_steps_per_episode = 1000
truncated_episodes = 0 # episodes stopped by the cap rather than by finishing

for _ in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done and epochs < max_steps_per_episode:
        action = np.argmax(q_table[state]) # exploits the table each time
        state, reward, done, info = env.step(action)

        if reward == -10:
            # pick-up or drop-off attempted in the incorrect state
            penalties += 1
        
        epochs += 1

    if not done:
        # the loop ended because of the step cap, not because the episode finished
        truncated_episodes += 1

    total_penalties += penalties
    total_epochs += epochs

print("Results after {} episodes -".format(episodes))
print("Average timesteps per episode: {}".format(total_epochs / episodes))
print("Average penalties per episode: {}".format(total_penalties / episodes))
if truncated_episodes:
    # only printed when the cap was hit, so a normal run's output is unchanged
    print("Episodes cut off at {} steps without finishing: {}".format(
        max_steps_per_episode, truncated_episodes))

Results after 100 episodes -
Average timesteps per episode: 13.33
Average penalties per episode: 0.0
