# Q-Learning

# Import

In [1]:
import gym
import numpy as np
from time import sleep
from IPython.display import clear_output

# Environment

In [2]:
# Build the Taxi-v3 environment and keep the object stored in the `.env`
# attribute of what gym.make() returns.
# NOTE(review): presumably this drops Gym's episode step-limit wrapper (the
# random-agent run further down lasts thousands of steps) — confirm against
# the installed gym version.
env = gym.make('Taxi-v3').env

In [4]:
# Inspect the action space; its `.n` later sets the Q-table's column count.
env.action_space

Discrete(6)

In [5]:
# Inspect the observation space; its `.n` later sets the Q-table's row count.
env.observation_space

Discrete(500)

# Random Agent

In [6]:
%%time
done = False
steps, penalties = 0, 0
while not done:
    clear_output(wait=True)
    state, reward, done, info = env.step(env.action_space.sample())
    env.render()
    if reward==-10:
        penalties += 1
    steps += 1
print(f'Steps taken: {steps}, penalties incurred: {penalties}')

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Steps taken: 3131, penalties incurred: 1065
Wall time: 4.81 s


# Q-Learning

In [7]:
# Zero-initialised value table: one row per state, one column per action.
q_table = np.zeros(shape=(env.observation_space.n, env.action_space.n))

In [8]:
# Show the table as created by np.zeros, i.e. before any training update.
q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [9]:
# Hyperparameters
alpha = 0.1    # learning rate: weight of the new estimate in the Q-value update
gamma = 0.6    # discount factor applied to the best next-state value
epsilon = 0.1  # probability of picking a random (exploratory) action

In [10]:
%%time
for i in range(1,10001):
    state = env.reset()
    done = False
    while not done:
        if np.random.uniform() < epsilon:
            # exploration
            action = env.action_space.sample()
        else:
            # exploitation
            action = np.argmax(q_table[state])
        next_state, reward, done, info = env.step(action)
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        new_value = (1-alpha)*old_value + alpha*(reward+gamma*next_max)
        q_table[state, action] = new_value
        state = next_state
    if i%1000==0:
        print(f'Episode: {i}')

Episode: 1000
Episode: 2000
Episode: 3000
Episode: 4000
Episode: 5000
Episode: 6000
Episode: 7000
Episode: 8000
Episode: 9000
Episode: 10000
Wall time: 12.8 s


In [11]:
# Show the table again now that the training loop has updated it.
q_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-2.3930052 , -2.33616699, -2.37527098, -2.35314108, -2.27325184,
        -9.16157496],
       [-1.84909945, -1.49348636, -1.84927848, -1.50895688, -0.7504    ,
        -8.90552491],
       ...,
       [-1.08831672, -0.02591116, -1.08831672, -1.11746961, -2.82266006,
        -1.96      ],
       [-2.08280307, -2.07093967, -2.08379147, -2.07584331, -2.88086489,
        -3.62559153],
       [-0.196     , -0.196     ,  0.7058308 , 10.85889867, -1.93574214,
        -1.        ]])

# Evaluation

In [12]:
# Play one episode following the greedy policy read from the Q-table,
# rendering every step and counting steps and -10 rewards.
#
# The greedy policy is deterministic, so if the table ever sends it round a
# cycle the episode cannot end by itself; cap the rollout so the cell always
# terminates.
# NOTE(review): 200 is chosen to match the step limit Gym normally registers
# for Taxi-v3 — adjust if a different budget is wanted.
max_steps = 200
state = env.reset()
done = False
steps, penalties = 0, 0
while not done and steps < max_steps:
    clear_output(wait=True)  # replace the previous frame instead of appending
    action = np.argmax(q_table[state])
    next_state, reward, done, info = env.step(action)
    env.render()
    if reward == -10:  # a reward of -10 is what this notebook counts as a penalty
        penalties += 1
    steps += 1
    state = next_state
    sleep(0.1)  # slow the animation down enough to follow by eye
if not done:
    print(f'Stopped after {max_steps} steps without finishing the episode')
print(f'Steps taken: {steps}, penalties incurred: {penalties}')

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)
Steps taken: 16, penalties incurred: 0


# Comment

Because this environment has a finite state space and a finite action space, it can be solved quickly and easily with a simple Q-table. For any environment with small, finite state and action spaces, tabular Q-learning is a natural first method to try.