In [1]:
import gym
import numpy as np
import math
import random

from IPython.display import clear_output, display
from ipywidgets import Output
import time

| 符号 | | 描述 |
|------|---| :------|
|<img width=200 /> | | <img width=500 /> |
| $S$ | | 环境的状态空间 |
| $A$| | agent可选择的动作空间 |
| $R(s，a)$ | | 奖励函数，返回的值表示在s状态下执行a动作的奖励 |
| $T(s'|s,a)$ | |  状态转移概率函数，表示从s状态执行a动作后环境转移至s′状态的概率|

<br>

目标：找到一个策略$\pi$能够最大化我们的对未来奖励的期望$E(\sum_{t=0}^n \gamma^tR_t)$ (未来收益总和），$R_t$为t时刻的奖励，$\gamma$为折扣因子，代表距离现在越遥远的奖励不如现在的奖励大

----------------------

![](assets/frozenlake.png)

In [2]:
MAX_STEPS = 20
game = 'FrozenLake-v0'
path = f'/tmp/gym-{game}'

In [3]:
env = gym.make(game)
env = env.unwrapped
print(env.action_space)
print(env.observation_space) 

Discrete(4)
Discrete(16)


In [4]:
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


## Q-Tables

In [5]:
qtable = np.zeros((env.observation_space.n, env.action_space.n))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


## Hypes

In [6]:
total_episodes = 15000        # Total episodes
learning_rate = 0.8           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.95                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.005            # Exponential decay rate for exploration prob

## Algo & Train

In [7]:
rewards = []

for episode in range(total_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        # 均匀分布, 从当前状态选择动作
        exp_exp_tradeoff = random.uniform(0, 1)
        
        if exp_exp_tradeoff > epsilon: # Greedy
            action = np.argmax(qtable[state,:])

        else: # Random
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Bellman Equations: Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * 
                            np.max(qtable[new_state, :]) - qtable[state, action])
        
        total_rewards += reward
        
        state = new_state
        
        if done == True: 
            break
        
    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
    rewards.append(total_rewards)

print ("Score over time: " +  str(sum(rewards)/total_episodes))
print ("          left          down          right          up")
print(qtable)

Score over time: 0.4835333333333333
          left          down          right          up
[[1.98653290e-01 1.85374851e-01 9.67267071e-02 1.12160278e-01]
 [6.00929756e-03 9.10543996e-04 5.44080872e-03 1.76304760e-01]
 [1.39336251e-02 2.89212246e-02 7.47053358e-03 7.73760269e-02]
 [4.47458509e-02 5.10942068e-04 2.06908451e-04 5.65650505e-02]
 [2.19188406e-01 5.32120391e-02 5.66851958e-02 5.03485451e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.46151233e-02 2.46667460e-09 1.89912329e-05 1.74778525e-07]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.44276017e-02 2.20195540e-02 6.73519038e-03 2.84193266e-01]
 [2.02794709e-02 4.73943087e-01 1.22406483e-01 8.71226470e-03]
 [8.86996913e-01 9.55843218e-03 1.59631417e-03 1.91359673e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.67731694e-02 1.51803611e-02 6.28038612e-01 5.53557700e-02]
 [4.61783499e-01 9.3246370

## Evaluate

In [11]:
out = Output()
display(out)
for episode in range(2):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)
    for step in range(max_steps):
        # 获取每个状态下未来奖励的期望(ER)最大的动作
        action = np.argmax(qtable[state,:])
        new_state, reward, done, info = env.step(action)
        with out:
            clear_output(wait=True)
            env.render()
            time.sleep(0.3)
        if done:
            break
        state = new_state
    print("Number of steps", step)
    time.sleep(1)
env.close()

Output()

****************************************************
EPISODE  0
Number of steps 28
****************************************************
EPISODE  1
Number of steps 98


![frozenlake](../assets/frozenlake.png)