# 前期准备

## 创建环境

In [1]:
import gymnasium as gym

# 创建Lunar Lander环境，这里使用离散动作空间的版本
env = gym.make("LunarLander-v2")


## 探索环境
了解 动作空间, 观察空间

后续 代理设计 需要

In [3]:
print("Action Space:", env.action_space)
print("Observation Space:", env.observation_space)

Action Space: Discrete(4)
Observation Space: Box([-1.5       -1.5       -5.        -5.        -3.1415927 -5.
 -0.        -0.       ], [1.5       1.5       5.        5.        3.1415927 5.        1.
 1.       ], (8,), float32)


## 示例代码

随机动作

In [158]:
import gymnasium as gym
env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset()



for _ in range(1000):
    action = env.action_space.sample()  # agent policy that uses the observation and info
    cumulative_reward = 0
    
    observation, reward, terminated, truncated, info = env.step(action)
    cumulative_reward += reward
    
    # print("{}, {}, {}, {}".format(observation, reward, terminated, truncated))

    if terminated or truncated:
        print("Episode finished after {} timesteps".format(_+1), "Cumulative Reward: ", cumulative_reward)
        observation, info = env.reset()

env.close()

Episode finished after 77 timesteps Cumulative Reward:  -100
Episode finished after 149 timesteps Cumulative Reward:  -100
Episode finished after 252 timesteps Cumulative Reward:  -100
Episode finished after 343 timesteps Cumulative Reward:  -100
Episode finished after 438 timesteps Cumulative Reward:  -100
Episode finished after 530 timesteps Cumulative Reward:  -100
Episode finished after 618 timesteps Cumulative Reward:  -100
Episode finished after 745 timesteps Cumulative Reward:  -100
Episode finished after 825 timesteps Cumulative Reward:  -100
Episode finished after 960 timesteps Cumulative Reward:  -100


## 显卡测试

pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

pip install torch==2.2.2+cu118 torchvision==0.14.2+cu118 torchaudio==0.12.2+cu118 -f https://download.pytorch.org/whl/torch_stable.html


In [4]:
import torch

print("PyTorch version: ", torch.__version__)
print("CUDA version: ", torch.version.cuda)

# 输出是否可以使用 CUDA
print("CUDA available: ", torch.cuda.is_available())


PyTorch version:  2.2.2+cu118
CUDA version:  11.8
CUDA available:  True


# 正式工作 (使用DQN)

训练一个Agent, 根据Observation和info, 决定最佳Action

## 01 导入并初始化环境

In [12]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

env = gym.make("LunarLander-v2")


## 02 定义DQN网络模型

通过PyTorch的nn模块, 来构建神经网络

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim

class DQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        # 定义网络层
        self.fc1 = nn.Linear(state_size, 8)  # 第一个隐藏层，8个神经元
        self.fc2 = nn.Linear(8, action_size)  # 输出层
        self.relu = nn.ReLU()  # ReLU激活函数

    def forward(self, x):
        # 前向传播流程
        x = self.relu(self.fc1(x))  # 通过隐藏层并应用ReLU激活函数
        x = self.fc2(x)  # 通过输出层
        return x




In [23]:
import torch
import torch.nn as nn
import torch.optim as optim

class DQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        # Define network layers
        self.fc1 = nn.Linear(state_size, 7)  # First hidden layer with 7 neurons
        self.fc2 = nn.Linear(7, 6)  
        self.fc3 = nn.Linear(6, 5)  
        self.fc4 = nn.Linear(5, 4)  
        self.fc5 = nn.Linear(4, action_size)  # Output layer
        self.relu = nn.ReLU()  # ReLU activation function

    def forward(self, x):
        # Forward propagation
        x = self.relu(self.fc1(x))  
        x = self.relu(self.fc2(x))  
        x = self.relu(self.fc3(x))  
        x = self.relu(self.fc4(x))  
        x = self.fc5(x)  
        return x


## 03 实现DQN Agent

创建一个Agent类, 用来实现DQN的训练 (包括经验回放)

In [24]:
import random
import numpy as np
from collections import deque

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.9 # gamma 越大, 学习率衰减越快
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = DQN(state_size, action_size)  
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.scheduler = StepLR(self.optimizer, step_size=100, gamma=self.gamma)  # Decay every 100 steps, 

    # 定义 Agent如何根据State选择Action
    def act(self, state):
        # state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) # 将状态转换为张量 (还要添加一个维度), 以便输入到网络中
        
        # 使用 ε-greedy 策略选择动作
        if np.random.rand() <= self.epsilon: # 如果随机数小于 ε, 则随机选择一个动作, 用于探索
            return random.randrange(self.action_size)
        
        # 得到Q值, 用于选择动作
        self.model.eval() # 将模型设置为评估模式, 这样可以避免在评估模型时进行梯度更新
        with torch.no_grad(): # 不需要计算梯度, 因为我们只是在评估模型
            action_values = self.model(state) # 用当前状态获取每个动作的Q值
            
        self.model.train() # 修改回训练模式, 以便在训练模型时进行梯度更新 (模型的参数可以继续更新)
        # print("Q Table: ", action_values.cpu().data.numpy())
        return np.argmax(action_values.cpu().data.numpy()) # 根据Q值选择最佳动作
    
    # 用于 经验回放 (Experience Replay)
    # 当Agent在Environment中""执行Action 并观察到新的状态和奖励时, 将这些信息存储, 之后用于训练网络模型
    def store(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    # 经验回放 (Experience Replay)
    # 打破数据之间的相关性, 提高训练的稳定性
    def replay(self, batch_size):
        if len(self.memory) < batch_size: # 如果记忆库中的样本数量小于批量大小, 则不执行
            return
        
        minibatch = random.sample(self.memory, batch_size) # 从记忆库中随机选择一个批量的经验
        states, actions, rewards, next_states, dones = zip(*minibatch) # 将批量经验拆分为状态, 动作, 奖励, 下一个状态, 完成标志
        # 将拆分的经验转换为张量, 以便输入到网络中
        states = torch.tensor(states, dtype =torch.float32).to(self.device).squeeze(1)
        actions = torch.tensor(actions, dtype=torch.long).to(self.device).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(self.device).unsqueeze(1)
        next_states = torch.tensor(next_states, dtype=torch.float32).to(self.device).squeeze(1)
        dones = torch.tensor(dones, dtype=torch.bool).to(self.device).unsqueeze(1)

        # 打印所有张量的形状
        # print("states: ", states.shape)
        # print("actions: ", actions.shape)
        # print("rewards: ", rewards)
        # print("next_states: ", next_states.shape)
        # print("dones: ", dones.shape)

        Q_targets_next = self.model(next_states).detach().max(1)[0].unsqueeze(1) # 使用目标网络计算下一个状态的Q值, 用于计算目标Q值
        Q_targets = rewards + (self.gamma * Q_targets_next * (~dones)) # 计算目标Q值, 用于更新当前状态的Q值
        Q_expected = self.model(states).gather(1, actions) # 计算预期Q值, 用于计算损失

        loss = nn.MSELoss()(Q_expected, Q_targets) # 计算均方误差损失
        self.optimizer.zero_grad() # 梯度清零, 以便在每次迭代中重新计算梯度
        loss.backward() # 反向传播, 计算梯度
        self.optimizer.step() # 更新网络参数

        if self.epsilon > self.epsilon_min: # 更新 ε, 以便在训练过程中逐渐减小探索的概率, 以便在初期更多地探索, 在后期更多地利用经验
            self.epsilon *= self.epsilon_decay


## 04 训练Agent

In [25]:
import os
import os
from torch.optim.lr_scheduler import StepLR

batch_size = 64
env = gym.make("LunarLander-v2")

current_dir = os.getcwd()
model_dir = os.path.join(current_dir, "models")
os.makedirs(model_dir, exist_ok=True) # 创建模型保存目录


agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
agent.model.to(device)  # 移动模型到GPU

# 定义训练参数
e_range = 3000
time_range = 100

for e in range(e_range): # 训练1000个episode
    full_state = env.reset()
    state = full_state[0] # 提取向量
    # print("初始状态", state)
    state = torch.from_numpy(np.reshape(state, [1, -1])).float().to(device) # 把state转换为 网络模型 接受的形状
    
    cumulative_reward = 0  # 初始化累积奖励为0
    
    for time in range(time_range): # 一个episode最多执行500个时间步
        action = agent.act(state)
        next_state, reward, done, truncated, info = env.step(action)  # 执行动作, 获取下一个状态, 奖励, 完成标志, 和
        cumulative_reward += reward # 累积奖励
        
        # print("下一个状态", next_state)
        next_state = torch.from_numpy(np.reshape(next_state, [1, -1])).float().to(device) # 把next_state转换为 网络模型 接受的形状
        reward = torch.tensor([reward], device=device) # 把reward转换为张量
        done = torch.tensor([done], device=device)
        
        # agent.store(state, action, reward, next_state, done) # 存储经验
        agent.store(state.cpu().numpy(), action, reward.cpu().numpy(), next_state.cpu().numpy(), done.cpu().numpy()) # 存储经验
        state = next_state # 更新状态
        
        if done:
            print("Episode: {}/{}, score: {}, cumulative reward: {}".format(e, e_range, time, cumulative_reward)) # 打印每个episode的时间步数和累积奖励
            break
        if len(agent.memory) > batch_size: # 当记忆库中的样本数量大于32时, 开始经验回放
            agent.replay(batch_size)
            
            
    agent.scheduler.step()  # 学习速率调度器更新
    
    # 每100个episode结束后, 保存模型的状态字典
    if e % 20 == 0:
        model_path = os.path.join(model_dir, f"model_0.pth")  # 使用 f-string 包含 episode 数
        torch.save(agent.model.state_dict(), model_path)
        print("模型状态字典已保存至", model_path)

    


Using device: cuda
Episode: 0/3000, score: 67, cumulative reward: -95.80954624123588
模型状态字典已保存至 c:\Users\isrya\#MyFiles\#MyCode\GithubUoL\COMP532\AS02\models\model_0.pth


  dones = torch.tensor(dones, dtype=torch.bool).to(self.device).unsqueeze(1)
  return F.mse_loss(input, target, reduction=self.reduction)


Episode: 1/3000, score: 81, cumulative reward: -150.93571883780425
Episode: 3/3000, score: 56, cumulative reward: -296.1826519179241
Episode: 4/3000, score: 51, cumulative reward: -334.4605762490487
Episode: 5/3000, score: 58, cumulative reward: -137.91389478289332
Episode: 6/3000, score: 87, cumulative reward: -340.0625874668472
Episode: 7/3000, score: 70, cumulative reward: -41.424303969942684
Episode: 9/3000, score: 79, cumulative reward: -603.5459799491896
Episode: 10/3000, score: 95, cumulative reward: -522.4188765948097
Episode: 11/3000, score: 59, cumulative reward: -99.7004614266726
Episode: 12/3000, score: 55, cumulative reward: -117.03665666605909
Episode: 14/3000, score: 72, cumulative reward: -152.46315329583047
Episode: 15/3000, score: 57, cumulative reward: -490.89360495708286
Episode: 16/3000, score: 84, cumulative reward: -488.62702214695184
Episode: 17/3000, score: 85, cumulative reward: -238.31608879068867
Episode: 18/3000, score: 54, cumulative reward: -171.896354304

KeyboardInterrupt: 

### TensorBoard

# Test

In [26]:
import gymnasium as gym
import numpy as np

import torch# 确保从包含DQN类定义的文件中导入
import os

# 假设状态空间和动作空间的维度已知
state_size = 8  # 根据你的环境设置
action_size = 4  # 根据你的环境设置

current_dir = os.getcwd()
model_dir = os.path.join(current_dir, "models")
model_path = os.path.join(model_dir, "model_0.pth")

model = DQN(state_size, action_size)
model.load_state_dict(torch.load(model_path))
model.eval()  # 将模型设置为评估模式

env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset()

cumulative_reward = 0  # 初始化累积奖励为0

for _ in range(1000):
    # 将观测转换为适合模型的格式
    state = torch.tensor([observation], dtype=torch.float32)
    
    with torch.no_grad():  # 禁止torch追踪此处的梯度计算，因为我们在推理而不是训练
        action = model(state).max(1)[1].item()  # 获取最大Q值对应的动作

    observation, reward, terminated, truncated, info = env.step(action)
    cumulative_reward += reward # 累积奖励
    
    if terminated or truncated:
        print("Episode finished, reward: ", cumulative_reward)
        observation, info = env.reset()
        cumulative_reward = 0  # 重置累积奖励为0

env.close()


Episode finished, reward:  -148.33966024162137
Episode finished, reward:  -129.56642250217718
Episode finished, reward:  -152.88066544844287
Episode finished, reward:  -140.5931593800495
Episode finished, reward:  -113.11040461008781
Episode finished, reward:  -128.4038435086263
Episode finished, reward:  -149.8433896797823
Episode finished, reward:  -105.58485566217254
Episode finished, reward:  -124.97611273002431
Episode finished, reward:  -123.97436145284013
Episode finished, reward:  -139.27758897290153
Episode finished, reward:  -106.00463601449772
Episode finished, reward:  -157.23882840667795
Episode finished, reward:  -157.5075288521311


# Analysis
