In [1]:
import gym
import numpy as np

In [2]:
# Create the Taxi-v3 environment and reset it to obtain the starting state.
env = gym.make("Taxi-v3")
state = env.reset()

In [3]:
# Print the integer-encoded state, then draw the grid it corresponds to.
print(state)
env.render()

146
+---------+
|R: | : :[34;1mG[0m|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



## Possible Actions
down (0), up (1), right (2), left (3), pick-up (4), and drop-off (5)

In [4]:
# Sizes of the observation and action spaces; used below as the Q-table's dimensions.
n_states = env.observation_space.n
n_actions = env.action_space.n

## How well does behaving completely randomly do?

In [5]:
# Baseline: take sampled actions until a step returns a reward of 20,
# counting the steps and summing every reward collected along the way.
# NOTE(review): `done` is never consulted, so stepping continues regardless
# of what the environment reports — confirm this is intended.
state = env.reset()
counter, g = 0, 0
while True:
    random_action = env.action_space.sample()
    state, reward, done, info = env.step(random_action)
    counter += 1
    g += reward
    if reward == 20:
        break
print("Solved in {} Steps with a total reward of {}".format(counter,g))

Solved in 1239 Steps with a total reward of -4935


In [6]:
# Number of episodes for the first training run below, whose updates are recorded step by step.
episodes = 1

In [7]:
# Q-table: one row per state, one column per action, initialised to zero.
Q = np.zeros([n_states, n_actions])
Q.shape

(500, 6)

In [8]:
# This multidimensional array will keep a history of our Q-Values for all states
Q_hist = np.zeros([n_states, n_actions, 0])
Q_hist.shape

(500, 6, 0)

In [9]:
# Step size that scales each Q-value update in the training loop.
alpha = 0.618
# Running total of the rewards collected in the current episode.
G = 0

In [10]:
# Tabular Q-learning run that records a snapshot of the Q-table after every step.
# Snapshots are collected in a list and stacked onto Q_hist once at the end:
# calling np.dstack inside the step loop re-copies the whole history on every
# step, which is quadratic in the number of steps.
q_snapshots = []
for episode in range(1, episodes+1):
    G, reward = 0, 0
    state = env.reset()
    firstState = state
    print("Initial State = {}".format(state))
    # Keep stepping until a step returns a reward of 20; `done` is not consulted here.
    while reward != 20:
        # Greedy action; np.argmax resolves ties to the lowest action index.
        action = np.argmax(Q[state])
        state2, reward, done, info = env.step(action)
        # Move Q[state, action] towards reward + best next-state value (no discount factor).
        Q[state,action] += alpha * (reward + np.max(Q[state2]) - Q[state,action])
        G += reward
        state = state2

        # A copy is required because Q is mutated in place on every step.
        q_snapshots.append(Q.copy())
# Extend the existing history along the third axis in a single call.
Q_hist = np.dstack([Q_hist] + q_snapshots)
finalState = state
print("Final State = {}".format(finalState))

Initial State = 269


Final State = 85


In [11]:
# Q-values of the episode's starting state in the first recorded snapshot
# (taken right after the first update).
print(firstState)
Q_hist[firstState,:,0]

269


array([-0.618,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ])

## Let's look at the final step:

In [12]:
# The third axis holds one Q-table snapshot per step taken.
Q_hist.shape

(500, 6, 973)

In [13]:
# Index of the last snapshot along the history axis.
numSteps = Q_hist.shape[2]-1
print(numSteps)

972


In [14]:
# Q-values of the state the episode ended in, taken from the last snapshot.
print(finalState)
Q_hist[finalState,:,numSteps]

85


array([0., 0., 0., 0., 0., 0.])

Do you expect the Q-table to have a good value for the second-to-last step?

In [16]:
# Second-to-last snapshot. axis=0 reduces over states, so the result holds,
# for each action, the index of the state with the highest Q-value.
np.argmax(Q_hist[:,:,numSteps-1], axis = 0)

array([0, 0, 0, 0, 0, 0])

No, but it already knows which state-action pairs it thinks are the worst.

In [17]:
# Second-to-last snapshot. axis=0 reduces over states, so the result holds,
# for each action, the index of the state with the lowest Q-value.
np.argmin(Q_hist[:,:,numSteps-1], axis = 0)

array([ 81, 221, 281, 241,   9,   1])

In [18]:
# Full training run: start again from an all-zero Q-table and learn over many
# episodes, acting greedily and updating Q after every step. The return of each
# episode is stored so the running average can be reported every 100 episodes.
episodes = 1000
alpha = 0.618
Q = np.zeros((n_states, n_actions))
rewardTracker = []
G = 0

for episode in range(1, episodes + 1):
    state = env.reset()
    G, reward, done = 0, 0, False
    # Each episode runs until the environment reports that it is done.
    while not done:
        action = np.argmax(Q[state])
        state2, reward, done, info = env.step(action)
        # Target is the reward plus the best value of the next state (no discount factor).
        td_target = reward + np.max(Q[state2])
        Q[state, action] += alpha * (td_target - Q[state, action])
        G += reward
        state = state2
    rewardTracker.append(G)

    # Periodic progress report: latest return and the mean return so far.
    if episode % 100 == 0:
        print('Episode {} Reward: {}  Total Average Reward: {} '.format(episode, G, sum(rewardTracker)/len(rewardTracker)))

Episode 100 Reward: -53  Total Average Reward: -221.56 
Episode 200 Reward: -42  Total Average Reward: -141.095 
Episode 300 Reward: 6  Total Average Reward: -94.43333333333334 
Episode 400 Reward: 15  Total Average Reward: -69.54 
Episode 500 Reward: 5  Total Average Reward: -54.158 
Episode 600 Reward: 9  Total Average Reward: -43.92333333333333 
Episode 700 Reward: 7  Total Average Reward: -36.427142857142854 
Episode 800 Reward: 5  Total Average Reward: -30.96375 
Episode 900 Reward: 10  Total Average Reward: -26.654444444444444 
Episode 1000 Reward: 9  Total Average Reward: -23.24 


Now that we have learned good Q-values, the greedy policy they define performs well, so there is no need to train the agent any further.

In [19]:
# Roll out one episode with the learned Q-table and render every step.
state = env.reset()
done = False

while not done:
    # Act greedily: pick the action with the highest Q-value for this state.
    action = Q[state].argmax()
    state, reward, done, info = env.step(action)
    env.render()

+---------+
|R: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : :[42m_[0