In [None]:
import gym
import time
import random
import numpy as np
import pandas as pd
from scipy.stats import bernoulli

In [None]:
# Training environment. It is read as a module-level global by
# epsilon_greedy_agent and by both training functions defined below.
env = gym.make("Taxi-v3",render_mode="ansi")

In [None]:
# implementation of epsilon greedy action selection (multi-armed bandit)
def epsilon_greedy_agent(epsilon,state,QTable):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    A single Bernoulli draw with success probability ``1 - epsilon`` decides
    between the two modes: on success the action with the largest entry in
    ``QTable[state]`` is returned, otherwise an action is sampled from the
    action space of the module-level ``env``.
    """
    draw = bernoulli.rvs(1 - epsilon, size=1)
    exploring = draw[0] != 1
    if exploring:
        # probability epsilon: try a random action
        return env.action_space.sample()
    # probability 1 - epsilon: take the currently best-valued action
    return np.argmax(QTable[state])

# From Sutton and Barto (2018), p. 131
def train_Qlearning(alpha,epsilon,gamma,episodes,maxSteps,verbose=False):
    """Train a tabular Q-learning agent on the module-level ``env``.

    Parameters
    ----------
    alpha : float
        Step size (learning rate) of the Q update.
    epsilon : float
        Exploration probability handed to ``epsilon_greedy_agent``.
    gamma : float
        Discount factor applied to the bootstrapped next-state value.
    episodes : int
        Number of training episodes to run.
    maxSteps : int
        Upper bound on the number of environment steps per episode.
    verbose : bool, optional
        When True, print the episode index every 1000 episodes.

    Returns
    -------
    numpy.ndarray
        The learned Q-table, indexed as ``QTable[state, action]``.
    """
    print("Begin training...")
    # initialize Q-table
    QTable = np.zeros([env.observation_space.n,env.action_space.n])
    for ep in range(episodes):
        if verbose and ep % 1000 == 0:
            print("Episode %d" % ep)
        # initialize state
        state = env.reset()[0]
        for _step in range(maxSteps):
            # sample action from Q
            action = epsilon_greedy_agent(epsilon,state,QTable)
            # take action, observe reward and next state
            next_state,reward,terminated,truncated,_ = env.step(action)
            # a terminal state has no future return, so do not bootstrap from it;
            # a truncated (time-limited) episode still bootstraps
            bootstrap = 0.0 if terminated else gamma * np.max(QTable[next_state])
            # update Q and state
            QTable[state,action] += alpha * (reward + bootstrap - QTable[state,action])
            state = next_state
            # stop on a terminal state OR when the environment cuts the episode
            # short; stepping past either signal requires a reset first
            if terminated or truncated:
                break
    return QTable

# Implementation of Watkins's Q(lambda): Q-learning with eligibility traces
def train_QlearningLam(alpha,epsilon,gamma,lam,epsidoes,maxSteps,verbose=False):
    """Train a tabular agent with Watkins's Q(lambda) on the module-level ``env``.

    Parameters
    ----------
    alpha : float
        Step size (learning rate) of the Q update.
    epsilon : float
        Exploration probability handed to ``epsilon_greedy_agent``.
    gamma : float
        Discount factor.
    lam : float
        Trace-decay parameter (lambda).
    epsidoes : int
        Number of training episodes. The misspelled name is kept so existing
        keyword callers keep working.
    maxSteps : int
        Upper bound on the number of environment steps per episode.
    verbose : bool, optional
        When True, print the episode index every 1000 episodes.

    Returns
    -------
    numpy.ndarray
        The learned Q-table, indexed as ``QTable[state, action]``.
    """
    print("Begin training...")
    # initialize Q-table
    QTable = np.zeros([env.observation_space.n,env.action_space.n])
    # initialize eligibility traces
    E = np.zeros([env.observation_space.n,env.action_space.n])
    # use the function's own argument rather than a same-named notebook global
    for ep in range(epsidoes):
        if verbose and ep % 1000 == 0:
            print("Episode %d" % ep)
        # traces left over from the previous episode must not receive credit
        # for transitions of this one
        E[:] = 0
        # initialize state and action
        state = env.reset()[0]
        action = env.action_space.sample()
        for _step in range(maxSteps):
            # take action a, observe reward and next state (s')
            next_state,reward,terminated,truncated,_ = env.step(action)
            # choose a' from s' using Q
            next_action = epsilon_greedy_agent(epsilon,next_state,QTable)
            # select a*; if a' ties for the maximum it counts as the greedy action
            opt_action = np.argmax(QTable[next_state])
            if QTable[next_state,next_action] == QTable[next_state,opt_action]:
                opt_action = next_action
            # compute TD error: r + gamma * Q(s', a*) - Q(s, a),
            # with no bootstrapping from a terminal state
            bootstrap = 0.0 if terminated else gamma * QTable[next_state,opt_action]
            TD = reward + bootstrap - QTable[state,action]
            E[state,action] += 1
            # for all states and actions
            QTable += alpha * TD * E
            # build trace for greedy action
            if next_action == opt_action:
                E *= gamma * lam
            # zero out eligibility trace after non-greedy/exploratory action
            else:
                E[:] = 0
            state = next_state
            action = next_action
            # continue until the episode terminates or is truncated
            if terminated or truncated:
                break
    return QTable

### Training

In [None]:
# Seed every source of randomness used during training so that
# Restart Kernel -> Run All reproduces the same Q-table.
seed = 42
np.random.seed(seed)         # global NumPy RNG, used by bernoulli.rvs by default
env.action_space.seed(seed)  # RNG behind env.action_space.sample()
env.reset(seed=seed)         # environment RNG; later reset() calls continue from it

# Hyperparameters
episodes = 50000   # number of training episodes
maxSteps = 1000    # cap on steps per episode
alpha = 0.1        # learning rate
epsilon = 0.1      # exploration probability
gamma = 0.5        # discount factor
lam = 0.1          # eligibility-trace decay (lambda)
QTable = train_QlearningLam(alpha,epsilon,gamma,lam,episodes,maxSteps,verbose=True)

### Testing

In [None]:
# Roll out the greedy policy from QTable in a rendered environment and record,
# per episode, the total reward, the number of -10 rewards, and the step count.
test_env = gym.make("Taxi-v3",render_mode="human")
testEpisodes = 10
epRewards = []
epPenalities = []
epTimesteps = []
verbose = True
try:
    for ep in range(testEpisodes):
        if verbose:
            print("Episode %d" % (ep+1))
        start_time = time.time()
        state = test_env.reset()[0]
        test_env.render()
        isDone = False
        rewards = 0
        timesteps = 0
        penalities = 0
        while not isDone:
            # act greedily with respect to the learned Q-table
            action = np.argmax(QTable[state,:])
            next_state,reward,terminated,truncated,_ = test_env.step(action)
            # also stop on truncation: a greedy policy that never reaches the
            # goal would otherwise keep this loop running forever
            isDone = terminated or truncated
            state = next_state
            timesteps += 1
            # a reward of -10 is counted as a penalty
            if reward == -10:
                penalities += 1
            rewards += reward
        if verbose:
            print("--- Completed in %s seconds ---" % (time.time() - start_time))
        epRewards.append(rewards)
        epPenalities.append(penalities)
        epTimesteps.append(timesteps)
finally:
    # release the render window even if the evaluation is interrupted
    test_env.close()