本代码实现了如下功能：
- 环境为CartPole-v1，详见https://gymnasium.farama.org/environments/classic_control/cart_pole/
- 原始Q学习，Q学习+目标网络，双Q学习
- 对决网络
- 有限经验回放
- 加噪网络
- 改进的奖励计算

In [5]:
import torch

import gymnasium as gym
import torch.nn as nn
import random
import torch
from tqdm.notebook import tqdm
import numpy as np
import time
from torch.utils.tensorboard import SummaryWriter
import sys

#定义网络
"""
输入：
1. Cart Position: -4.8-4.8, 超过2.4时小车会停止运动
2. Cart Velocity：-Inf-Inf, 
3. Pole Angle：-.418, .418
4. Pole Angular Velocity：-Inf-Inf, 
"""
#加噪网络
class NoiseLinear(nn.Linear):
    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features)
        self.p_noise_weight = nn.Parameter(torch.full(self.weight.size(), 0.))
        self.p_noise_bias = nn.Parameter(torch.full(self.bias.size(), 0.))
    def forward(self, x):
        weight_noise = torch.randn_like(self.p_noise_weight,device=x.device)
        bias_noise = torch.randn_like(self.p_noise_bias,device=x.device)
        out = super().forward(x)
        if self.training:
            out = out + torch.mm(x,(self.p_noise_weight * weight_noise).T) + self.p_noise_bias * bias_noise
        return out

#原始 DQN
class DQN(nn.Module):
    def __init__(self, input_dim=4, hidden_dim = 128, output_dim = 2):
        super().__init__()
        self.model = nn.Sequential(NoiseLinear(input_dim,hidden_dim),nn.LeakyReLU(),
                                   NoiseLinear(hidden_dim,output_dim))
    
    def forward(self, x):
        return self.model(x)
    
class DuelingNetwork(nn.Module):
    def __init__(self, input_dim=4, hidden_dim = 128, output_dim = 2):
        super().__init__()
        self.model = nn.Sequential(NoiseLinear(input_dim,hidden_dim),nn.LeakyReLU(),
                                   NoiseLinear(hidden_dim,hidden_dim//2),nn.LeakyReLU())
        self.v_head = nn.Sequential(NoiseLinear(hidden_dim//2,hidden_dim//4),nn.LeakyReLU(),NoiseLinear(hidden_dim//4,1))#近似V*
        self.d_head = nn.Sequential(NoiseLinear(hidden_dim//2,hidden_dim//4),nn.LeakyReLU(),NoiseLinear(hidden_dim//4,output_dim))#近似D*
    
    def forward(self, x):
        hidden = self.model(x)
        v = self.v_head(hidden)
        d = self.d_head(hidden)
        q = v + d - d.mean(dim=1,keepdim=True) #采用mean而不是max实现
        return q
    
#定义Agent
class DQNAgent:
    def __init__(self, device, buffer_size, game_name, fea_dim, hidden_dim, action_dim, gamma = 0.98):
        self.model = DuelingNetwork(fea_dim, hidden_dim, action_dim).to(device)
        self.target_model = DuelingNetwork(fea_dim, hidden_dim, action_dim).to(device) # 为保证TD目标尽量不变， 新建目标网络，更新较慢
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.requires_grad_(False) # 目标网络不需要梯度
        self.replay_buffer = []#(s_t:,a_t,r_t,s_t+1)
        self.replay_buffer_weight = [] # 权重
        self.action_dim = action_dim
        self.fea_dim = fea_dim
        self.game_name = game_name
        self.hidden_dim = hidden_dim
        
        self.env = gym.make(self.game_name)
        
        self.buffer_size = buffer_size
        self.device = device
        self.gamma = gamma # 回报率
        self.loss = torch.nn.MSELoss()
        
    def reset(self):
        observation, info = self.env.reset()
        observation = torch.from_numpy(observation).unsqueeze(0)
        return observation
    
    def take_action(self, state: torch.Tensor | np.ndarray, epsilon: float, verbose = False) -> torch.Tensor:
        bs = 1
        if isinstance(state, np.ndarray):
            state = torch.from_numpy(state).unsqueeze(0)
        state = state.to(self.device)
        # epsilon-greedy策略
        if random.random() < epsilon:
            # 随机动作
            action = torch.randint(0,self.action_dim,(bs,))
        else:
            #最优动作价值函数
            with torch.no_grad():
                Q = self.model(state)
            action = torch.argmax(Q,dim=1)
            if verbose:
                print(f"Q:{Q[:,action].item():.2f}")
        return action
    
    def replay(self, state: torch.Tensor, epsilon: float)->bool:
        action = self.take_action(state, epsilon)
        next_state, reward, terminated,truncated, _ = self.env.step(action.item()) #
        next_state = torch.from_numpy(next_state).unsqueeze(0)
        next_state = next_state
        
        done = terminated or truncated
        done = torch.tensor([done],dtype=torch.bool)
        
        reward = self.compute_reward(next_state, done)
        reward = torch.tensor([reward])
        
        #弹出不好的的策略
        if len(self.replay_buffer) > self.buffer_size:
            self.replay_buffer.pop(0)
            self.replay_buffer_weight.pop(0)
        self.replay_buffer.append((state.cpu(), action.cpu(), reward.cpu(), next_state.cpu(), done.cpu())) 
        self.replay_buffer_weight.append(1.0)#默认权重1.0
        return next_state, action.item(), reward.item(), done.item()
    
    def sample_data(self, batch_size=32):
        #随机采样batch_size个数据
        assert len(self.replay_buffer) == len(self.replay_buffer_weight)
        total_weight = sum(self.replay_buffer_weight)
        sample_prob = [item/total_weight for item in self.replay_buffer_weight]   # 采样概率
        
        indexs = np.random.choice(range(len(self.replay_buffer)), batch_size,replace=False,p=sample_prob)#不放回采样
        indexs = indexs.tolist()
        batch = [self.replay_buffer[i] for i in indexs]
        state, action, reward, next_state, done = map(torch.cat, zip(*batch))
        state = state.to(self.device)
        action = action.to(self.device)
        reward = reward.to(self.device)
        next_state = next_state.to(self.device)
        done = done.to(self.device)
        
        return state, action, reward, next_state, done, indexs

    @torch.no_grad()
    def test(self):
        self.model.eval()
        self.target_model.eval()
        env_test = gym.make(self.game_name,render_mode = 'human')
        state, _ = env_test.reset()
        done = False
        step = 0 
        input('TESTING, press enter to continue')
        while not done:
            action = self.take_action(state, epsilon=0, verbose=True)
            next_state, reward, terminated,truncated, _ = env_test.step(action.item())
            done = terminated or truncated
            state = next_state
            step += 1
            time.sleep(0.01)
        print(f"Running {step} steps, last_state:{state}")
        
    @torch.no_grad()
    def valid(self, ):
        self.model.eval()
        env_test = gym.make(self.game_name)
        result = 0
        for i in range(5):
            state, _ = env_test.reset()
            done = False
            step = 0 
            total_reward = 0
            while not done:
                action = self.take_action(state, epsilon=0)
                next_state, reward, terminated,truncated, _ = env_test.step(action.item())
                done = terminated or truncated
                state = next_state
                total_reward += reward
                step += 1
            result += total_reward
        self.model.train()
        return result/5

    @torch.no_grad()
    def update_target_model(self, decay = 0.99):
        for param, target_param in zip(self.model.parameters(), self.target_model.parameters()):
            newparam = decay * target_param.data + (1 - decay) * param.data
            target_param.data.copy_(newparam)
            
    def compute_reward(self, state, done):
        x, x_dot, theta, theta_dot = state[0]
        angle_penalty = theta ** 2  # 杆子角度惩罚（直立时θ≈0）
        position_penalty = 0.1 * x ** 2  # 小车位置惩罚（中心时x=0）
        if done:
            return -10.0  # 倒下强惩罚
        return 1.0 - angle_penalty - position_penalty  # 奖励函数`
    
def compute_epsilon(cur_value, decay = 0.999, min_value= 0.01):
    if cur_value < min_value:
        return min_value
    else:
        return decay * cur_value
    

In [6]:
writer = SummaryWriter()    
lr = 1e-4
batch_size = 128
replay_step = 100
buffer_size = 10000
num_round = 2000
epsilon = 1
gamma = 0.9
device = 'cuda' 
game_name = 'CartPole-v1'
fea_dim = 4
hidden_dim = 128
action_dim = 2
agent = DQNAgent(device, buffer_size,game_name,fea_dim,hidden_dim,action_dim,gamma)
optimizer = torch.optim.AdamW(agent.model.parameters(), lr=lr)
tqdm_bar = tqdm(total=num_round)
train_step = 0
for cur_round in range(num_round):
    epsilon = compute_epsilon(epsilon)
    cur_time = 0
    cur_reward = 0
    state = agent.reset()
    done = False
    losses = []
    while not done:
        next_state, action, reward, done = agent.replay(state, epsilon=epsilon)
        if done:
            state = agent.reset()
        #有足够量数据才开始训练
        if cur_round > replay_step:
            #training
            b_state, b_action, b_reward, b_next_state, b_done, indexs = agent.sample_data(batch_size)
            q_values = agent.model(b_state).gather(1, b_action[:,None]).squeeze(1)
            #原始Q学习，Q学习+目标网络，双Q学习
            #选择动作和计算Q_next都用DQN——原始Q学习
            #选择动作和计算Q_next都用目标网络——Q学习+目标网络
            #选择动作用DQN，计算Q_next用目标网络——双Q学习,效果最好，本代码实现方式
            #compute
            q_values_next = agent.target_model(b_next_state)
            #choose
            with torch.no_grad():
                next_action = agent.model(b_next_state).max(1)[1]
                
            q_values_next_max = q_values_next.gather(1, next_action[:,None]).squeeze(1)
            td_target = b_reward + q_values_next_max * agent.gamma
            loss = agent.loss(q_values, td_target)# 如果td_target为常数，应该detach，但torch自动求导，所以不detach
            #更新采样权重
            weight = torch.clip((q_values - td_target).abs(), 1e-2).tolist()
            for i,w in zip(indexs,weight):
                agent.replay_buffer_weight[i] = w
            
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.model.parameters(), 1)
            optimizer.step()
            train_step += 1
            losses.append(loss.item())
            agent.update_target_model()
        cur_reward += reward
        cur_time += 1
        state = next_state
    
    # valid
    valid_reward = agent.valid()
    
    loss = np.mean(losses) if len(losses) > 0 else 0
    writer.add_scalar('train/loss', loss, cur_round)
    writer.add_scalar('train/reward', cur_reward, cur_round)
    writer.add_scalar('train/epsilon', epsilon, cur_round)
    writer.add_scalar('valid/reward', valid_reward, cur_round)
    tqdm_bar.set_postfix_str(f'Round {cur_round}/{num_round}, Reward {cur_reward}, Time {cur_time}, loss {loss}, traing step {train_step}')
    tqdm_bar.update(1)
    if valid_reward == 500:
        break
writer.close()

  0%|          | 0/2000 [00:00<?, ?it/s]

In [7]:
agent.test()

Q:10.96
Q:10.71
Q:11.09
Q:10.65
Q:11.21
Q:10.66
Q:11.33
Q:10.68
Q:10.51
Q:10.96
Q:11.68
Q:13.41
Q:11.62
Q:12.17
Q:11.52
Q:10.84
Q:9.74
Q:9.60
Q:9.83
Q:10.84
Q:11.56
Q:11.43
Q:11.36
Q:11.55
Q:11.78
Q:11.74
Q:11.71
Q:12.25
Q:11.00
Q:9.91
Q:8.89
Q:9.55
Q:9.76
Q:10.67
Q:10.27
Q:10.58
Q:11.77
Q:12.06
Q:12.33
Q:12.75
Q:12.35
Q:13.40
Q:12.94
Q:13.71
Q:14.68
Q:13.37
Q:13.22
Q:16.46
Q:15.32
Q:15.10
Q:14.24
Q:13.02
Q:12.80
Q:12.14
Q:11.38
Q:12.01
Q:11.88
Q:12.01
Q:12.18
Q:12.07
Q:12.19
Q:12.31
Q:11.79
Q:12.74
Q:11.96
Q:11.80
Q:12.27
Q:11.54
Q:12.36
Q:11.82
Q:11.30
Q:12.27
Q:11.13
Q:11.98
Q:11.59
Q:11.08
Q:12.04
Q:10.89
Q:11.59
Q:11.16
Q:11.02
Q:11.46
Q:10.83
Q:11.13
Q:10.68
Q:10.76
Q:10.85
Q:10.47
Q:10.83
Q:10.86
Q:10.80
Q:11.27
Q:10.75
Q:10.83
Q:10.96
Q:10.86
Q:11.24
Q:10.94
Q:10.94
Q:11.11
Q:11.55
Q:11.54
Q:11.15
Q:11.39
Q:11.51
Q:11.73
Q:9.18
Q:9.41
Q:9.16
Q:9.90
Q:9.87
Q:9.98
Q:10.76
Q:9.80
Q:10.04
Q:10.30
Q:9.38
Q:9.24
Q:10.27
Q:8.87
Q:9.19
Q:8.96
Q:8.48
Q:9.36
Q:8.66
Q:8.13
Q:9.45
Q:10.56
