# 前期准备

## 创建环境

In [1]:
import gymnasium as gym

# Create the Lunar Lander environment; this variant uses a discrete action space.
env = gym.make("LunarLander-v2")


## 探索环境
先了解环境的动作空间（action space）和观察空间（observation space）。

后续设计智能体（Agent）时需要用到这些信息。

In [3]:
# Show both spaces of the environment created earlier in the notebook.
print(f"Action Space: {env.action_space}")
print(f"Observation Space: {env.observation_space}")

Action Space: Discrete(4)
Observation Space: Box([-1.5       -1.5       -5.        -5.        -3.1415927 -5.
 -0.        -0.       ], [1.5       1.5       5.        5.        3.1415927 5.        1.
 1.       ], (8,), float32)


## 示例代码

随机动作

In [1]:
import gymnasium as gym
N_RANDOM_STEPS = 1000  # total number of environment steps to run in this demo

env = gym.make("LunarLander-v2", render_mode="human")
try:
    observation, info = env.reset()

    for _ in range(N_RANDOM_STEPS):
        # Random baseline: the action is sampled from the action space and
        # does not depend on the observation or info.
        action = env.action_space.sample()

        observation, reward, terminated, truncated, info = env.step(action)

        # Start a new episode as soon as the current one ends for either reason.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Close the environment even if the loop raises or the cell is interrupted,
    # so the "human" render window is not left open.
    env.close()

## 显卡测试

pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu118


In [4]:
import torch

# Report the PyTorch version string and the value of torch.version.cuda.
print(f"PyTorch version:  {torch.__version__}")
print(f"CUDA version:  {torch.version.cuda}")

# Report whether CUDA can be used.
print(f"CUDA available:  {torch.cuda.is_available()}")


PyTorch version:  2.2.2+cu118
CUDA version:  11.8
CUDA available:  True


# 正式工作 (使用DQN)

训练一个Agent, 根据Observation和info, 决定最佳Action

## 01 导入并初始化环境

In [2]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

# Environment instance for the DQN work below (created without a render_mode argument).
env = gym.make("LunarLander-v2")


## 02 定义DQN网络模型

通过PyTorch的nn模块, 来构建神经网络

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

class DQN(nn.Module):
    """Fully connected Q-network: state_size -> 128 -> 64 -> action_size.

    Args:
        state_size: dimensionality of the state (observation) vector.
        action_size: number of actions; also the width of the output layer.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        first_hidden, second_hidden = 128, 64
        # Three linear layers: the input width matches the state vector and
        # the output width matches the number of actions.
        self.fc1 = nn.Linear(state_size, first_hidden)
        # ReLU activation, reused after each hidden layer to add non-linearity.
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.fc3 = nn.Linear(second_hidden, action_size)

    def forward(self, x):
        """Return the network output for ``x``: one Q-value per action."""
        hidden = self.fc1(x)
        hidden = self.relu(hidden)
        hidden = self.fc2(hidden)
        hidden = self.relu(hidden)
        # No activation on the output layer; its last dimension has
        # action_size entries (the estimated value of each action).
        return self.fc3(hidden)
    

## 03 实现DQN Agent

创建一个Agent类, 用来实现DQN的训练 (包括经验回放)

In [None]:
import random
import numpy as np
from collections import deque

class DQNAgent:
    """Epsilon-greedy DQN agent with an experience-replay buffer.

    Bootstrapped targets are computed with the same network that is being
    trained (there is no separate target network).

    Args:
        state_size: dimensionality of the state vector fed to the network.
        action_size: number of discrete actions.
        gamma: discount factor applied to the bootstrapped next-state value.
        epsilon: initial probability of taking a random action.
        epsilon_min: lower bound for ``epsilon``.
        epsilon_decay: multiplicative decay applied to ``epsilon`` after each
            training step performed by :meth:`replay`.
        memory_size: maximum number of transitions kept in the replay buffer.
    """

    def __init__(self, state_size, action_size, gamma=0.95, epsilon=1.0,
                 epsilon_min=0.01, epsilon_decay=0.995, memory_size=2000):
        self.state_size = state_size
        self.action_size = action_size
        # Bounded FIFO buffer: the oldest transitions are dropped once it is full.
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters())

    def store(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        Returns:
            int: a random action index with probability ``epsilon``, otherwise
            the index of the largest Q-value predicted by the model.
        """
        # Strict "<" so that epsilon == 0 is purely greedy.
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)

        # Add a leading batch dimension of 1 before calling the network.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        self.model.eval()
        with torch.no_grad():
            action_values = self.model(state)
        self.model.train()
        return int(action_values.argmax(dim=1).item())

    def replay(self, batch_size):
        """Run one optimisation step on a random minibatch from the buffer.

        Does nothing when ``batch_size`` is not positive or the buffer holds
        fewer than ``batch_size`` transitions. After a training step,
        ``epsilon`` is decayed and clamped to ``epsilon_min``.
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Stack through numpy with explicit dtypes, and give every per-sample
        # scalar the shape (batch, 1) so it lines up with the gathered Q-values
        # instead of broadcasting to (batch, batch).
        states = torch.from_numpy(np.asarray(states, dtype=np.float32))
        next_states = torch.from_numpy(np.asarray(next_states, dtype=np.float32))
        actions = torch.from_numpy(np.asarray(actions, dtype=np.int64)).unsqueeze(1)
        rewards = torch.from_numpy(np.asarray(rewards, dtype=np.float32)).unsqueeze(1)
        # Stored as float (1.0 = episode ended) because "1 - tensor" is not
        # supported for bool tensors.
        dones = torch.from_numpy(np.asarray(dones, dtype=np.float32)).unsqueeze(1)

        # Targets are constants for the loss: no gradient flows through them.
        with torch.no_grad():
            Q_targets_next = self.model(next_states).max(dim=1, keepdim=True)[0]
        # The bootstrapped term is zeroed for transitions flagged as done.
        Q_targets = rewards + self.gamma * Q_targets_next * (1.0 - dones)
        # Q-value of the action that was actually taken in each transition.
        Q_expected = self.model(states).gather(1, actions)

        loss = nn.functional.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Decay exploration, never going below the configured floor.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
