### Import Libraries

In [1]:
import gym
import numpy as np
import random

### Create Game Environment

In [2]:
env = gym.make("Taxi-v3")

action_space = env.action_space.n
state_size = env.observation_space.n
print(action_space, state_size)

6 500


### Creating Q-Table

In [3]:
qtable = np.zeros((state_size, action_space))
print(qtable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


### Defining Hyper-parameters

In [4]:
# General parameters
total_episodes = 5000000
lr = 0.8
max_steps = 250
gamma = 0.95

# Exploration parameters
epsilon = 0.1
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005

### Q-Learning Algorithm

In [5]:
rewards = []

for episode in range(total_episodes):
    state = env.reset()
    step = 0
    done = False
    total_reward = 0
    
    for step in range(max_steps):
        
        if random.uniform(0, 1) > epsilon:
            action = np.argmax(qtable[state, :])
            
        else:
            action = env.action_space.sample()
            
        new_state, reward, done, info = env.step(action)
        
        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        qtable[state, action] = qtable[state, action] + lr*(reward + gamma*np.max(qtable[new_state, :]) - qtable[state, action])
        
        total_reward += reward
        state = new_state
        
        if done: 
            break
            
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
        rewards.append(total_reward)
        
print("Score over time: " + str(sum(rewards)/total_episodes))
print(qtable)

Score over time: -88.5561628
[[ 0.          0.          0.          0.          0.          0.        ]
 [ 2.75200369  3.94947757  2.75200369  3.94947757  5.20997639 -5.05052243]
 [ 7.93349184  9.40367562  7.93349184  9.40367562 10.9512375   0.40367562]
 ...
 [-3.98288634 12.58025     7.84004188  6.78119335 -1.54429903 -3.42182875]
 [-4.66056695 -5.05070569 -5.11149957  6.53681725 -5.68219033 -4.15391728]
 [16.1        14.295      16.1        18.          7.1         7.1       ]]


### Play game using Q-Table

In [6]:
env.reset()

for episode in range(10):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        # env.render()    # Un-comment to watch every game step
            
        if done:
            # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            env.render()
            
            # We print the number of step it took.
            print("Number of steps", step)
            break
        state = new_state
env.close()

****************************************************
EPISODE  0
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)
Number of steps 14
****************************************************
EPISODE  1
+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Number of steps 13
****************************************************
EPISODE  2
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)
Number of steps 8
****************************************************
EPISODE  3
+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Number of steps 11
****************************************************
EPISODE  4
+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---