## A simple game

We need to define actions, states, state transitions, rewards and the discount factor.

An MDP is a 5-tuple, $\langle S, A, R, P, \gamma \rangle$

--16 states

--4 actions



In [1]:
import numpy as np
np.set_printoptions(threshold=np.inf)
# Define the gridworld environment
n_states = 16
n_actions = 4
P = np.zeros((n_states, n_actions, n_states))  # transition probabilities
R = np.zeros((n_states, n_actions, n_states))  # rewards
gamma = 0.9  # discount factor

# Fill in the transition probabilities and rewards
for s in range(n_states):
    for a in range(n_actions):
        if s == 0 or s == 15:
            P[s, a, s] = 1
        else:
            if a == 0:  # up
                s_prime = s - 4
            elif a == 1:  # down
                s_prime = s + 4
            elif a == 2:  # left
                s_prime = s - 1
            else:  # right
                s_prime = s + 1
            if s_prime < 0:
              s_prime = 0
            if s_prime > 15:
              s_prime = 15

            if s_prime == 0:
                R[s, a, s_prime] = -1  # start state
            elif s_prime == 15:
                R[s, a, s_prime] = 10  # goal state
            else:
                R[s, a, s_prime] = -1  # other states
            P[s, a, s_prime] = 1 # 在这个特定的环境中，无论你在哪个状态执行什么动作，你都会确定地转移到一个新的状态，所以状态转移概率总是1

print("P", P)
print("-----------------------------")
print("R", R)
# P[s, a, s_prime]


P [[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  

## Solution - DP

In [2]:
# Define the policy (arbitrary for now)
policy = np.ones((n_states, n_actions)) / n_actions # 初始化一个均匀策略，即在每个状态下，所有动作的选择概率都是相等的。

print(policy)
for s in range(n_states):
  print("(State ", s, ") Actions", policy[s])

# Policy evaluation algorithm
V = np.zeros(n_states)  # initial value function estimate 初始化状态价值函数为零。
print("Value function:")
print(V.reshape(4, 4))
tolerance = 1e-6  # convergence tolerance 设置收敛阈值。
while True: # 开始策略评估
    delta = 0
    for s in range(n_states):
        v = V[s]
        bellman_update = 0 # 初始化贝尔曼更新为零。
        for a in range(n_actions):
            for s_prime in range(n_states):
                bellman_update += policy[s, a] * P[s, a, s_prime] * (R[s, a, s_prime] + gamma * V[s_prime]) # 根据贝尔曼期望方程进行更新
        V[s] = bellman_update
        delta = max(delta, abs(v - V[s]))
    if delta < tolerance: # 如果最大差值小于阈值，则停止迭代。
        break

print("Value function:")
print(V.reshape(4, 4))

[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
(State  0 ) Actions [0.25 0.25 0.25 0.25]
(State  1 ) Actions [0.25 0.25 0.25 0.25]
(State  2 ) Actions [0.25 0.25 0.25 0.25]
(State  3 ) Actions [0.25 0.25 0.25 0.25]
(State  4 ) Actions [0.25 0.25 0.25 0.25]
(State  5 ) Actions [0.25 0.25 0.25 0.25]
(State  6 ) Actions [0.25 0.25 0.25 0.25]
(State  7 ) Actions [0.25 0.25 0.25 0.25]
(State  8 ) Actions [0.25 0.25 0.25 0.25]
(State  9 ) Actions [0.25 0.25 0.25 0.25]
(State  10 ) Actions [0.25 0.25 0.25 0.25]
(State  11 ) Actions [0.25 0.25 0.25 0.25]
(State  12 ) Actions [0.25 0.25 0.25 0.25]
(State  13 ) Actions [0.25 0.25 0.25 0.25]
(State  14 ) Actions [0.25 0.25 0.25 0.

## Q-learning algorithm

In [2]:
# Environment
E = np.zeros(n_states).reshape(4, 4)
E

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [3]:
# Initialize Q-values with zeros 16*4
Q = np.zeros((n_states, n_actions))
n_episodes = 10000  # Number of episodes for the learning
alpha = 0.03  # Learning rate 设置学习率。
epsilon = 0.6  # Epsilon-greedy exploration probability 探索概率
print(Q)
for episode in range(n_episodes):
    s = np.random.randint(n_states)  # Randomly select an initial state
    print(f"\nEpisode {episode + 1}: Starting at state {s}")
    while s not in [0, 15]:  # Continue until reaching terminal states (0 or 15)
        prob = np.random.uniform()  # Generate a probability for epsilon-greedy 生成一个随机概率
        print(f"Random probability for epsilon-greedy: {prob:.4f}")
        if prob < epsilon: # 如果随机概率小于ε，则执行探索操作，即随机选择一个动作。
            a = np.random.randint(n_actions)  # Explore: select a random action
            print(f"Exploring: Selected action {a}")
        else: # 否则，执行利用操作，即选择当前状态下Q值最大的动作。
            a = np.argmax(Q[s, :])  # Exploit: select the action with the highest Q-value
            print(f"Exploiting: Selected action {a}")
        # Take the selected action and observe the next state and reward
        s_prime = np.random.choice(range(n_states), p=P[s, a, :]) # 根据当前状态和选择的动作，按照状态转移概率选择下一个状态。
        r = R[s, a, s_prime] # 获取从当前状态执行选择的动作转移到下一个状态的奖励。

        # Print details of Q-learning update equation
        q_old = Q[s, a]  # The old Q-value
        max_q_next = np.max(Q[s_prime, :])  # The maximum Q-value for the next state
        q_target = r + gamma * max_q_next  # The target Q-value
        q_update = q_old + alpha * (q_target - q_old)  # The updated Q-value 根据Q学习的更新公式更新Q值。

        # Update Q-value for the current state-action pair
        Q[s, a] = q_update
        print(f"State: {s}, Action: {a}, Next State: {s_prime}, Reward: {r}")
        print(f"Q(s,a) old: {q_old}, Reward: {r}, Max Q(s',a'): {max_q_next}, Q target: {q_target}, Q(s,a) updated: {q_update}")
        print("Updated Q-table:\n", Q)
        s = s_prime  # Move to the next state

# Extract the optimal policy from Q-values
policy = np.argmax(Q, axis=1) # 从Q值表中提取最优策略，即对每个状态，选择Q值最大的动作。

print("\nOptimal policy:")
print(policy.reshape(4, 4))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 [ 1.60019611  6.19999996  3.89072914  4.58      ]
 [ 3.11486304  6.2         3.94435622  6.18241759]
 [ 4.55608481  8.          4.35942587  6.10178206]
 [ 4.45663836  8.          6.0964741   6.11090492]
 [ 3.6027782   8.          5.34492221  5.73840682]
 [ 4.57984019  7.9984523   6.19574178  8.        ]
 [ 6.19999654 10.          6.19991243  7.99995959]
 [ 6.17996331 10.          7.93597928  7.90281173]
 [ 6.14959491 10.          7.88893684  7.90814871]
 [ 5.01230676 10.          6.66582545  8.78423345]
 [ 0.          0.          0.          0.        ]]
Random probability for epsilon-greedy: 0.5575
Exploiting: Selected action 3
State: 10, Action: 3, Next State: 11, Reward: -1.0
Q(s,a) old: 7.999999999999991, Reward: -1.0, Max Q(s',a'): 9.999999999999993, Q target: 7.999999999999995, Q(s,a) updated: 7.999999999999991
Updated Q-table:
 [[ 0.          0.          0.          0.        ]
 [-0.87842335  3.04385973 -0.8905810