# 前期准备

## 创建环境

In [1]:
import gymnasium as gym

# 创建Lunar Lander环境，这里使用离散动作空间的版本
env = gym.make("LunarLander-v2")


## 探索环境
了解 动作空间, 观察空间

后续 代理设计 需要

In [3]:
print("Action Space:", env.action_space)
print("Observation Space:", env.observation_space)

Action Space: Discrete(4)
Observation Space: Box([-1.5       -1.5       -5.        -5.        -3.1415927 -5.
 -0.        -0.       ], [1.5       1.5       5.        5.        3.1415927 5.        1.
 1.       ], (8,), float32)


## 示例代码

随机动作

In [158]:
import gymnasium as gym
env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset()



for _ in range(1000):
    action = env.action_space.sample()  # agent policy that uses the observation and info
    cumulative_reward = 0
    
    observation, reward, terminated, truncated, info = env.step(action)
    cumulative_reward += reward
    
    # print("{}, {}, {}, {}".format(observation, reward, terminated, truncated))

    if terminated or truncated:
        print("Episode finished after {} timesteps".format(_+1), "Cumulative Reward: ", cumulative_reward)
        observation, info = env.reset()

env.close()

Episode finished after 77 timesteps Cumulative Reward:  -100
Episode finished after 149 timesteps Cumulative Reward:  -100
Episode finished after 252 timesteps Cumulative Reward:  -100
Episode finished after 343 timesteps Cumulative Reward:  -100
Episode finished after 438 timesteps Cumulative Reward:  -100
Episode finished after 530 timesteps Cumulative Reward:  -100
Episode finished after 618 timesteps Cumulative Reward:  -100
Episode finished after 745 timesteps Cumulative Reward:  -100
Episode finished after 825 timesteps Cumulative Reward:  -100
Episode finished after 960 timesteps Cumulative Reward:  -100


## 显卡测试

pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

pip install torch==2.2.2+cu118 torchvision==0.14.2+cu118 torchaudio==0.12.2+cu118 -f https://download.pytorch.org/whl/torch_stable.html


In [4]:
import torch

print("PyTorch version: ", torch.__version__)
print("CUDA version: ", torch.version.cuda)

# 输出是否可以使用 CUDA
print("CUDA available: ", torch.cuda.is_available())


PyTorch version:  2.2.2+cu118
CUDA version:  11.8
CUDA available:  True


# 正式工作 (使用DQN)

训练一个Agent, 根据Observation和info, 决定最佳Action

## 01 导入并初始化环境

In [2]:
import random
import numpy as np
import torch
import torch.optim as optim
import torch.nn.functional as F
import gymnasium as gym
from collections import deque
from torch.optim.lr_scheduler import StepLR
from torch import nn

env = gym.make("LunarLander-v2")


## 02 定义DQN网络模型

通过PyTorch的nn模块, 来构建神经网络

In [3]:
class DQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, action_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)


## 03 实现DQN Agent

创建一个Agent类, 用来实现DQN的训练 (包括经验回放)

In [4]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = DQN(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.scheduler = StepLR(self.optimizer, step_size=100, gamma=0.95)

    def act(self, state):
        if isinstance(state, np.ndarray):
            state = torch.from_numpy(state).float().to(self.device)
        elif isinstance(state, torch.Tensor) and state.device != self.device:
            state = state.to(self.device)

        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        self.model.eval()
        with torch.no_grad():
            action_values = self.model(state.unsqueeze(0))
        self.model.train()

        return action_values.argmax().item()

    def store(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = torch.tensor(states, dtype=torch.float32).to(self.device)
        actions = torch.tensor(actions, dtype=torch.long).to(self.device).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(self.device).unsqueeze(1)
        next_states = torch.tensor(next_states, dtype=torch.float32).to(self.device)
        dones = torch.tensor(dones, dtype=torch.bool).to(self.device).unsqueeze(1)
        
        rewards *= rewards

        self.model.eval()
        next_q_values = self.model(next_states).detach().max(1)[0].unsqueeze(1)
        self.model.train()
        q_targets = rewards + self.gamma * next_q_values * (~dones)
        q_expected = self.model(states).gather(1, actions)

        loss = F.mse_loss(q_expected, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            self.scheduler.step()

## 04 训练Agent

In [5]:
import os
import os
from torch.optim.lr_scheduler import StepLR



# 设置环境和代理
env = gym.make("LunarLander-v2")
agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)

# 检查和设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
agent.model.to(device)  # 确保模型在正确的设备上

# 创建模型保存目录
current_dir = os.getcwd()
model_dir = os.path.join(current_dir, "models")
os.makedirs(model_dir, exist_ok=True)

# 训练参数
e_range = 10000
time_range = 500
batch_size = 64

for e in range(e_range):
    full_state = env.reset()
    if isinstance(full_state, tuple):  # 检查是否是元组
        state = full_state[0]  # 假设实际状态在元组的第一位置
    else:
        state = full_state

    state = torch.from_numpy(state).float().to(device)  # 现在确保它是NumPy数组后转换为适合模型的张量形式，并直接移动到GPU

    cumulative_reward = 0

    for time in range(time_range):
        action = agent.act(state)
        next_state, reward, done, truncated, info = env.step(action)
        cumulative_reward += reward
        
        if done:
            agent.store(state.cpu().numpy(), action, reward, next_state, done) 

        state = torch.from_numpy(np.array(next_state)).float().to(device)

        if done:
            print(f"Episode: {e}/{e_range}, time: {time}, terminated: {done}, truncated: {truncated}, cumulative reward: {cumulative_reward}")
            break

        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
    
    agent.scheduler.step()  # 更新学习率调度器
    
    if e % 100 == 0:
        model_path = os.path.join(model_dir, f"model_0.pth")
        torch.save(agent.model.state_dict(), model_path)
        print("模型状态字典已保存至", model_path)

Using device: cuda
Episode: 0/10000, time: 67, terminated: True, truncated: False, cumulative reward: -72.9865305831216
模型状态字典已保存至 c:\Users\isrya\#MyFiles\#MyCode\GithubUoL\COMP532\AS02\models\model_0.pth
Episode: 1/10000, time: 106, terminated: True, truncated: False, cumulative reward: -156.9368802191059
Episode: 2/10000, time: 106, terminated: True, truncated: False, cumulative reward: -375.5484233366329
Episode: 3/10000, time: 61, terminated: True, truncated: False, cumulative reward: -78.2824751487159
Episode: 4/10000, time: 64, terminated: True, truncated: False, cumulative reward: -80.96136015421865
Episode: 5/10000, time: 77, terminated: True, truncated: False, cumulative reward: -402.9110318951411
Episode: 6/10000, time: 82, terminated: True, truncated: False, cumulative reward: -319.67915004648046
Episode: 7/10000, time: 91, terminated: True, truncated: False, cumulative reward: -202.80990276230744
Episode: 8/10000, time: 71, terminated: True, truncated: False, cumulative rew



Episode: 21/10000, time: 87, terminated: True, truncated: False, cumulative reward: -158.55516312515638
Episode: 22/10000, time: 90, terminated: True, truncated: False, cumulative reward: -376.2358588733045
Episode: 23/10000, time: 115, terminated: True, truncated: False, cumulative reward: -353.8014862714436
Episode: 24/10000, time: 105, terminated: True, truncated: False, cumulative reward: -402.0859588609971
Episode: 25/10000, time: 96, terminated: True, truncated: False, cumulative reward: -224.57475020188394
Episode: 26/10000, time: 76, terminated: True, truncated: False, cumulative reward: -93.25150364667252
Episode: 27/10000, time: 74, terminated: True, truncated: False, cumulative reward: -180.68946160143588
Episode: 28/10000, time: 147, terminated: True, truncated: False, cumulative reward: -124.27742852819289
Episode: 29/10000, time: 82, terminated: True, truncated: False, cumulative reward: -102.9499051608406
Episode: 30/10000, time: 115, terminated: True, truncated: False, 

  states = torch.tensor(states, dtype=torch.float32).to(self.device)


Episode: 65/10000, time: 97, terminated: True, truncated: False, cumulative reward: -376.17617060351495
Episode: 66/10000, time: 83, terminated: True, truncated: False, cumulative reward: -441.4756771871382
Episode: 67/10000, time: 82, terminated: True, truncated: False, cumulative reward: -586.8818937553797
Episode: 68/10000, time: 74, terminated: True, truncated: False, cumulative reward: -130.77436811744352
Episode: 69/10000, time: 90, terminated: True, truncated: False, cumulative reward: -351.63202546469176
Episode: 70/10000, time: 110, terminated: True, truncated: False, cumulative reward: -568.3716242379423
Episode: 71/10000, time: 259, terminated: True, truncated: False, cumulative reward: -1738.5507644225506
Episode: 72/10000, time: 68, terminated: True, truncated: False, cumulative reward: -126.67933045394986
Episode: 73/10000, time: 69, terminated: True, truncated: False, cumulative reward: -136.6416946900633
Episode: 74/10000, time: 54, terminated: True, truncated: False, c

### TensorBoard

# Test

In [65]:
import gymnasium as gym
import numpy as np

import torch# 确保从包含DQN类定义的文件中导入
import os

# 假设状态空间和动作空间的维度已知
state_size = 8  # 根据你的环境设置
action_size = 4  # 根据你的环境设置

current_dir = os.getcwd()
model_dir = os.path.join(current_dir, "models")
model_path = os.path.join(model_dir, "model_0.pth")

model = DQN(state_size, action_size)
model.load_state_dict(torch.load(model_path))
model.eval()  # 将模型设置为评估模式

env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset()

cumulative_reward = 0  # 初始化累积奖励为0

for _ in range(1000):
    # 将观测转换为适合模型的格式
    state = torch.tensor([observation], dtype=torch.float32)
    
    with torch.no_grad():  # 禁止torch追踪此处的梯度计算，因为我们在推理而不是训练
        action = model(state).max(1)[1].item()  # 获取最大Q值对应的动作

    observation, reward, terminated, truncated, info = env.step(action)
    cumulative_reward += reward # 累积奖励
    
    if terminated or truncated:
        print("Episode finished, reward: ", cumulative_reward)
        observation, info = env.reset()
        cumulative_reward = 0  # 重置累积奖励为0

env.close()


Episode finished, reward:  -154.83341714363897
Episode finished, reward:  -142.76265360308227
Episode finished, reward:  -164.7293732005262
Episode finished, reward:  -181.78712381957925
Episode finished, reward:  -199.46039985945043
Episode finished, reward:  -95.43729969176921
Episode finished, reward:  -59.1614729097641
Episode finished, reward:  -81.88810152982936


# Analysis
