In [1]:
import gym
import numpy as np
import math
import random


from IPython.display import clear_output, display
from ipywidgets import Output
import time

| 符号 | | 描述 |
|------|---| :------|
|<img width=200 /> | | <img width=500 /> |
| $S$ | | 环境的状态空间 |
| $A$| | agent可选择的动作空间 |
| $R(s,a)$ | | 奖励函数，返回值表示在状态 $s$ 下执行动作 $a$ 所获得的奖励 |
| $T(s' \mid s,a)$ | | 状态转移概率函数，表示在状态 $s$ 下执行动作 $a$ 后环境转移至状态 $s'$ 的概率 |

<br>

目标：找到一个策略 $\pi$，使其最大化未来累计折扣奖励的期望 $E\left(\sum_{t=0}^{n} \gamma^{t} R_t\right)$，其中 $R_t$ 为 $t$ 时刻的奖励，$\gamma$ 为折扣因子（$0 \le \gamma \le 1$），表示越遥远的未来奖励在当前的价值越低。

----------------------

In [2]:
# Environment configuration.
# NOTE(review): MAX_STEPS and path are not referenced by any other cell visible
# in this notebook; the training and evaluation loops use the lowercase
# `max_steps` instead — confirm whether these two are still needed.
MAX_STEPS = 20
game = 'Taxi-v3'
path = '/tmp/gym-' + game

In [3]:
# Build the environment named by `game` and keep its `.unwrapped` form
# (presumably this strips gym's wrappers, e.g. an episode time limit —
# TODO confirm against the installed gym version).
env = gym.make(game).unwrapped

# Show the action space and the observation space, one per line.
print(env.action_space, env.observation_space, sep="\n")

Discrete(6)
Discrete(500)


In [4]:
# Render the environment's current state.
env.render()

+---------+
|R: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



## Q-Tables

In [5]:
# Q-table: one row per discrete observation, one column per discrete action,
# every entry initialised to zero.
qtable = np.zeros(shape=(env.observation_space.n, env.action_space.n))
print(qtable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


## Hyperparameters

In [6]:
# --- Training schedule ---
total_episodes = 5000   # number of training episodes
max_steps = 99          # upper bound on steps within a single episode

# --- Q-learning update ---
learning_rate = 0.8     # step size applied to the temporal-difference error
gamma = 0.618           # discount factor applied to future rewards

# --- Epsilon-greedy exploration ---
max_epsilon = 1.0       # exploration probability at the start of training
min_epsilon = 0.01      # floor for the exploration probability
decay_rate = 0.01       # exponential decay rate of the exploration probability
epsilon = max_epsilon   # current exploration rate; begins at its maximum

## Algo & Train

In [7]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# Reads:   env, qtable, total_episodes, max_steps, learning_rate, gamma,
#          epsilon, max_epsilon, min_epsilon, decay_rate
# Writes:  qtable (updated in place), epsilon (decayed after every episode),
#          rewards (one summed reward per episode)
#
# NOTE: this cell mutates the module-level `qtable` and `epsilon`. Re-running it
# continues from the already-trained table with the already-decayed epsilon;
# re-run the Q-table and hyperparameter cells first for a fresh training run.
# The unpacking below assumes the classic gym API in which reset() returns the
# observation and step() returns a 4-tuple.
rewards = []

for episode in range(total_episodes):
    state = env.reset()
    done = False
    total_rewards = 0

    for step in range(max_steps):
        # Draw from U(0, 1) to decide between exploiting and exploring.
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            # Exploit: take the action with the highest current Q-value.
            action = np.argmax(qtable[state, :])
        else:
            # Explore: take a uniformly sampled action.
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Q-learning update:
        #   Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
        # A terminal transition has no future return, so the bootstrap term is
        # dropped when `done` is set instead of reading Q(s', .) unconditionally.
        best_next = 0.0 if done else np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (
            reward + gamma * best_next - qtable[state, action]
        )

        total_rewards += reward
        state = new_state

        if done:
            break

    # Reduce epsilon after each episode (less and less exploration is needed).
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

# Mean summed reward per episode across the whole run, then the learned table.
print("Score over time: " + str(sum(rewards) / total_episodes))
print(qtable)

Score over time: -2.7242
[[  0.           0.           0.           0.           0.
    0.        ]
 [ -2.50483568  -2.47628218  -2.50344543  -2.43558422  -2.32039715
  -11.42409504]
 [ -2.34529997  -1.54053054  -2.19015506  -2.2323482   -0.57891593
  -10.84216566]
 ...
 [ -2.11157136   0.34405221  -2.1595745   -2.07001531  -9.92
   -8.        ]
 [ -2.41400084  -2.43996321  -2.41400084  -2.18964195  -8.
  -10.27016909]
 [ -0.8         -0.8         -0.8         11.35999956   0.
    0.        ]]


## Evaluate

In [9]:
# Roll out the greedy policy from the learned Q-table for two episodes,
# rendering every step into an Output widget so the frame is redrawn in place.
# Assumes the classic gym API in which reset() returns the observation and
# step() returns a 4-tuple.
out = Output()
display(out)
for episode in range(2):
    state = env.reset()
    steps_taken = 0  # number of actions actually executed in this episode
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Pick the action with the highest Q-value (largest expected future
        # reward) for the current state.
        action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)
        # `step` is a zero-based index, so the count of actions is step + 1.
        steps_taken = step + 1

        with out:
            # wait=True defers clearing until the next frame is ready.
            clear_output(wait=True)
            env.render()
            time.sleep(0.3)

        if done:
            break
        state = new_state
    # Report the count of actions taken rather than the last loop index.
    print("Number of steps", steps_taken)
env.close()

Output()

****************************************************
EPISODE  0
Number of steps 13
****************************************************
EPISODE  1
Number of steps 11


![Taxi](../assets/taxi.png)