In [7]:
import gym
import numpy as np
import random
import os
from time import sleep
class bcolors:
    """ANSI escape sequences used to colour text printed to the terminal."""

    # Switches back to the terminal's default colours
    RESET = '\u001b[0m'
    # Foreground colours
    GREEN = '\u001b[32m'
    RED = '\u001b[31m'
# Build the 'Taxi-v3' environment the agent will learn in
env = gym.make('Taxi-v3')
# Q-table: one row per discrete state, one column per discrete action,
# every entry starting at zero
state_size, action_size = env.observation_space.n, env.action_space.n
qtable = np.zeros(shape=(state_size, action_size))
# Q-learning hyperparameters
learning_rate = 0.9   # scales each correction applied to a Q-value
discount_rate = 0.8   # weight given to the best Q-value of the next state
epsilon = 1.0         # starting probability of picking a random action
decay_rate = 0.01     # coefficient of the per-episode exponential epsilon decay
# How long the agent trains for
num_episodes = 1000   # number of training episodes
num_steps = 99        # upper bound on steps within a single episode
print("AGENT IS TRAINING...")
for episode in range(num_episodes):
    # Every episode begins from a freshly reset environment
    state = env.reset()
    step, done = 0, False

    for step in range(num_steps):
        # Epsilon-greedy choice: with probability `epsilon` take a random
        # action, otherwise take the action with the highest Q-value
        explore = random.uniform(0, 1) < epsilon
        if explore:
            action = env.action_space.sample()
        else:
            action = np.argmax(qtable[state, :])
        # Apply the action and observe what the environment returns
        new_state, reward, done, info = env.step(action)
        # Q-learning update: nudge Q(state, action) towards
        # reward + discount_rate * max_a Q(new_state, a)
        td_target = reward + discount_rate * np.max(qtable[new_state, :])
        td_error = td_target - qtable[state, action]
        qtable[state, action] += learning_rate * td_error
        # Continue from the state we just reached
        state = new_state
        # Stop stepping once the environment reports the episode is over
        if done:
            break
    # Shrink the exploration probability exponentially with the episode index
    epsilon = np.exp(-decay_rate * episode)
 
# Get ready to watch our trained agent
# 'cls' only exists on Windows; POSIX shells use 'clear'. Picking the command
# from os.name keeps the screen-clearing working on every platform.
clear_command = 'cls' if os.name == 'nt' else 'clear'
os.system(clear_command)
print(qtable)
print(f"Training completed over {num_episodes} episodes")
input("Press Enter to see our trained agent play")
sleep(1)
os.system(clear_command)
episodes_to_preview = 3
for episode in range(episodes_to_preview):
    # Start each preview episode from a freshly reset environment
    state = env.reset()
    episode_rewards = 0
    for step in range(num_steps):
        # Clear the screen so each step is drawn as a fresh frame
        os.system(clear_command)
        print("TRAINED AGENT")
        print("+++++EPISODE {}+++++".format(episode+1))
        print("Step {}".format(step+1))
        # Exploit only: always take the action with the highest Q-value
        action = np.argmax(qtable[state,:])
        # Take the action and observe the reward
        new_state, reward, done, info = env.step(action)
        # Accumulate the rewards collected so far in this episode
        episode_rewards += reward
        env.render()
        print("")
        # Negative running scores are shown in red, all others in green
        score_colour = bcolors.RED if episode_rewards < 0 else bcolors.GREEN
        print(f"Score: {score_colour}{episode_rewards}{bcolors.RESET}")
        # Pause so each frame stays visible
        sleep(0.5)

        # Continue from the state we just reached
        state = new_state
        # Stop stepping once the environment reports the episode is over
        if done:
            break
# Close the Taxi environment
env.close()

AGENT IS TRAINING...
[[  0.           0.           0.           0.           0.
    0.        ]
 [ -4.14887561  -3.81295393  -3.80905593  -4.00527816  -1.6445568
  -11.26461528]
 [ -3.24312852  -2.8472447   -3.10849016   1.09278353   3.192
   -7.62301929]
 ...
 [ -2.8000381    5.24        -2.79447286  -2.3603112   -9.9
   -6.1272    ]
 [ -3.26566656  -3.3192568   -3.44482249  -1.0478219  -10.78956
  -11.18959776]
 [ -1.7118      -1.638        9.61135199  14.99999983  -9.
   -9.        ]]
Training completed over 1000 episodes
Press Enter to see our trained agent play
TRAINED AGENT
+++++EPISODE 1+++++
Step 1
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[42mY[0m| : |[35mB[0m: |
+---------+
  (Pickup)

Score: [31m-1[0m
TRAINED AGENT
+++++EPISODE 1+++++
Step 2
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[42m_[0m| : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)

Score: [31m-2[0m
TRAINED AGENT
+++++EPISODE 1+++++
Step 3
+---------+
|R: | : :G|
| : | : : |
|[