# Qlearning

 $$
 Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \left[ r_{t+1} + \gamma \max_{a} Q(s_{t+1}, a) - Q(s_t, a_t) \right]
 $$
 
 其中：
 - $Q(s_t, a_t)$：当前状态$s_t$下采取动作$a_t$的Q值
 - $\alpha$：学习率，控制Q值的更新幅度
 - $r_{t+1}$：在状态$s_t$下采取动作$a_t$后获得的即时奖励
 - $\gamma$：折扣因子，权衡未来奖励的影响
 - $\max_{a} Q(s_{t+1}, a)$：在新状态$s_{t+1}$下所有可能动作的最大Q值

# Sarsa
 
  $$
  Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \left[ r_{t+1} + \gamma Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t) \right]
  $$
 
 其中：
  - $Q(s_t, a_t)$：当前状态$s_t$下采取动作$a_t$的Q值
  - $\alpha$：学习率，控制Q值的更新幅度
  - $r_{t+1}$：在状态$s_t$下采取动作$a_t$后获得的即时奖励
  - $\gamma$：折扣因子，权衡未来奖励的影响
  - $Q(s_{t+1}, a_{t+1})$：在新状态$s_{t+1}$下实际所选动作$a_{t+1}$的Q值



In [54]:
from os import stat
from random import uniform
import re
import numpy as np
import pandas as pd
import time

"""
state  1 2 3 4 5
left   0 0 0 0 0
right  0 0 0 1 0
"""

np.random.seed(0)

N_STATES = 6  # 1维世界宽度
ACTIONS = ["left", "right"] # 可用动作
EPSILON = 0.9 # 贪婪度 greedy
ALPHA = 0.1 # 学习率
GAMMA = 0.9 # 奖励衰减值
MAX_EPISODES = 10 # 最大回合数
FRESH_TIME = 0.2 # 移动间隔时间

def buildQTable(n_states: int, actions: list):
    """
    创建一个 Q 表（Q-Table）来存储每个状态下所有可能动作的价值。
    
    参数:
        n_states (int): 环境中的状态数量。
        actions (list): 可用的动作列表。
        
    返回:
        pd.DataFrame: 一个全零初始化的 Q 表，行表示状态，列表示动作。
    """
    table = pd.DataFrame(np.zeros((n_states, len(actions))), columns=actions)
    print(table)
    return table

def chooseAction(state: int, q_table: pd.DataFrame) -> str:
  state_actions = q_table.iloc[state, :]
  if (np.random.uniform() > EPSILON) or (state_actions.all() == 0):
    action_name = np.random.choice(ACTIONS)
  else:
    action_name = state_actions.idxmax()
  return action_name

def getEnvFeedback(S, A):
  if A == "right":
    if (S == N_STATES - 1):
      S_ = S
      R = 1
      Done = 1
    else:
      S_ = S + 1
      R = 0
      Done = 0
  else:
    R = 0
    Done = 0
    if S == 0:
      S_ = S
    else:
      S_ = S - 1

  return S_, R, Done

def UpdateEnv(S, Done, episode, step_counter):
  env_list = ["-"]*(N_STATES-1) + ["T"]
  if Done == 1:
    interaction = "Episode %s: total_steps=%s" % (episode+1, step_counter)
    print("\r{}".format(interaction), end="")
    time.sleep(1)
    print("\r                             ", end="")
  else:
    env_list[S] = "O"
    interaction = "".join(env_list)
    print("\r{}".format(interaction), end="")
    time.sleep(FRESH_TIME)
  
def RLLoop(learner: str = "Qlearn"):
  print("learner: ", learner)
  q_table = buildQTable(N_STATES, ACTIONS)
  for episode in range(MAX_EPISODES):
    step_counter = 0
    S = 0
    A = chooseAction(S, q_table=q_table)
    Done = 0
    is_terminated = False
    UpdateEnv(S, Done, episode, step_counter)

    while not is_terminated:
      if learner == "Qlearn":  # (s, a, r, s_)
        A = chooseAction(S, q_table=q_table)
        S_, R, Done = getEnvFeedback(S, A)
        q_predict = q_table.loc[S, A]
        if Done != 1:
            q_target = R+GAMMA*q_table.iloc[S_, :].max()
        else:
            q_target = R
            is_terminated = True
        q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # q_table 更新
        S = S_  # 探索者移动到下一个 state
      elif learner == "Sarsa":  # (s, a, r, s_, a_)
        S_, R, Done = getEnvFeedback(S, A)
        A_ = chooseAction(S_, q_table=q_table)
        q_predict = q_table.loc[S, A]
        if Done != 1:
          q_target = R+GAMMA*q_table.loc[S_, A_]
        else:
          q_target = R
          is_terminated = True
        q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # q_table 更新
        S = S_  # 探索者移动到下一个 state
        A = A_
      else:
        print("\n error: learner")
        return
      UpdateEnv(S, Done, episode, step_counter+1)

      step_counter += 1
    # print(q_table)
  return q_table




In [55]:
q_table = RLLoop("Qlearn")
print('\r\nQ-table:\n')
print(q_table)

learner:  Qlearn
   left  right
0   0.0    0.0
1   0.0    0.0
2   0.0    0.0
3   0.0    0.0
4   0.0    0.0
5   0.0    0.0
                             
Q-table:

           left     right
0  1.488035e-07  0.000209
1  4.475419e-06  0.001912
2  5.904900e-07  0.013794
3  6.631859e-05  0.080937
4  8.695844e-03  0.244485
5  2.268000e-03  0.651322


In [56]:
q_table = RLLoop("Sarsa")
print('\r\nQ-table:\n')
print(q_table)

learner:  Sarsa
   left  right
0   0.0    0.0
1   0.0    0.0
2   0.0    0.0
3   0.0    0.0
4   0.0    0.0
5   0.0    0.0
                             
Q-table:

           left     right
0  5.314410e-08  0.000087
1  2.375749e-07  0.001026
2  1.501268e-05  0.008937
3  2.979772e-04  0.056510
4  5.904900e-05  0.237124
5  7.290000e-04  0.651322
