In [1]:
from gym_interface import Agent, State
import copy
import math
import random
import socket
import time
from collections import deque, namedtuple
from typing import Dict, Iterable, List, Literal, Optional, Union

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from distutils.util import strtobool
# from rlmodel.utils.utils import print_args, print_box, connected_to_internet
import wandb
import setproctitle
from pathlib import Path

import os, sys
from PPO import PPO

✓ 使用 Gymnasium
Device set to : NVIDIA GeForce RTX 3060


*自定义处理函数*

In [4]:
def parse_Input(action: int) -> str:
    """Encode a tactic id as the XML command string.

    This function is passed to ``Agent`` as its ``process_input`` callback.

    Args:
        action: Tactic identifier. The sentinel ``-1`` produces no command.

    Returns:
        ``""`` when ``action`` is ``-1``; otherwise the id wrapped as
        ``<c><AgentCMD><uint64_t>ID</uint64_t></AgentCMD></c>``.
    """
    is_noop = action == -1
    return "" if is_noop else "<c><AgentCMD><uint64_t>{}</uint64_t></AgentCMD></c>".format(action)

In [5]:
def parse_Output(state: Dict[str, any]) -> dict:
    """Flatten a sequence of per-message dicts into one observation dict.

    This function is passed to ``Agent`` as its ``process_output`` callback.
    Despite the annotation, ``state`` is iterated and every element must
    provide ``.items()`` (i.e. it is an iterable of dicts).

    Args:
        state: Iterable of dicts.

    Returns:
        A dict holding every non-``'PlaneInfo'`` key (later occurrences
        overwrite earlier ones) plus a ``'PlaneInfo'`` key, always present and
        always last, whose value is the list of all ``'PlaneInfo'`` values in
        the order they were encountered.
    """
    merged = {}
    plane_records = []
    for record in state:
        for key, value in record.items():
            if key != 'PlaneInfo':
                merged[key] = value
                continue
            # PlaneInfo may appear in several records; keep all of them.
            plane_records.append(value)
    merged['PlaneInfo'] = plane_records
    return merged

In [6]:
def outputToTensor(state: Dict[str, any]) -> np.array:
    """Convert a parsed observation into an 18-element feature vector.

    Nine features are read from the first plane's ``entity_motion_state`` and
    the same nine from the target's, in this order: pitch, yaw, roll,
    longitude, latitude, altitude, vx, vy, vz. Every value is divided by 10
    except altitude, which is divided by 100.

    Args:
        state: Dict with ``state['PlaneInfo'][0]['entity_motion_state']`` and
            ``state['TargetMessage']['entity_motion_state']``, each containing
            ``AttitudeInfo``, ``PositionInfo`` and ``ECEFVelocity`` sub-dicts.

    Returns:
        A 1-D numpy array: own-plane features followed by target features.
    """
    # (section, field, divisor) in output order.
    layout = (
        ('AttitudeInfo', 'pitch', 10),
        ('AttitudeInfo', 'yaw', 10),
        ('AttitudeInfo', 'roll', 10),
        ('PositionInfo', 'longitude', 10),
        ('PositionInfo', 'latitude', 10),
        ('PositionInfo', 'altitude', 100),
        ('ECEFVelocity', 'vx', 10),
        ('ECEFVelocity', 'vy', 10),
        ('ECEFVelocity', 'vz', 10),
    )
    own_motion = state['PlaneInfo'][0]['entity_motion_state']
    target_motion = state['TargetMessage']['entity_motion_state']
    features = [
        motion[section][field] / divisor
        for motion in (own_motion, target_motion)
        for section, field, divisor in layout
    ]
    return np.array(features)

In [7]:
def cal_Reward(state:Dict[str, any]):
    """Sparse win/lose reward computed from the entity status codes.

    This function is passed to ``Agent`` as its ``reward_func`` callback.
    Only ``entity_state`` is consulted; the position and altitude lookups
    that used to precede the reward were never used, so they were removed
    and a state lacking ``entity_motion_state`` no longer raises ``KeyError``.
    A distance-based shaping term and a per-step penalty existed only as
    commented-out experiments and are not part of the reward.

    Args:
        state: Parsed observation with ``state['PlaneInfo']`` (a list) and
            ``state['TargetMessage']``.

    Returns:
        ``0`` when ``PlaneInfo`` is empty; ``500`` when the target's
        ``entity_state`` is 5 (treated as "enemy destroyed"); ``-500`` when
        only our own ``entity_state`` is 5 ("we were destroyed"); else ``0``.
    """
    if not state["PlaneInfo"]:
        return 0
    self_state = state['PlaneInfo'][0]['entity_state']
    enemy_state = state['TargetMessage']['entity_state']
    # The enemy check comes first: if both are destroyed it counts as a win.
    if enemy_state == 5:
        return 500
    if self_state == 5:
        return -500
    return 0

In [8]:
def cal_Termination(state:Dict[str, any]) -> bool:
    """Report whether the engagement is over.

    This function is passed to ``Agent`` as its ``end_func`` callback.

    Args:
        state: Parsed observation with ``state['PlaneInfo'][0]`` and
            ``state['TargetMessage']``, each carrying ``entity_state``.

    Returns:
        ``True`` when either our plane's or the target's ``entity_state``
        equals 5, otherwise ``False``.
    """
    own_status = state['PlaneInfo'][0]['entity_state']
    target_status = state['TargetMessage']['entity_state']
    return bool(own_status == 5 or target_status == 5)


_算法参数配置_

In [9]:
# Prefer the GPU but fall back to CPU so the notebook still runs on machines
# without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sys.path.append(os.path.abspath(os.getcwd()))
num_agents = 1
num_enemies = 1
episode_length = 100
save_interval = 1000
log_interval = 10

# "<two directories above the working directory>/results"; shared by
# model_dir and run_dir so both always point under the same root.
results_root = Path(os.path.dirname(os.path.dirname(os.getcwd())) + "/results")
model_dir = results_root / "save"

# Hyperparameter dictionary. In this notebook it is passed to wandb.init as
# the run config and read for "algorithm_name", "seed" and "use_wandb".
# NOTE(review): the PPO agent is constructed from separate variables
# (gamma=0.99, lr_actor=0.0003, ...) that differ from the values below --
# confirm which set is authoritative.
all_args = {
    "algorithm_name": "ppo",
    "use_recurrent_policy": False,
    "use_naive_recurrent_policy": False,
    "share_policy": True,
    "use_wandb": True,
    "seed": 0,
    "use_centralized_V": True,
    "use_linear_lr_decay": True,
    "hidden_size": 16,
    "recurrent_N": 1,
    "act_space": 6,
    "obs_space": 18,
    "shared_obs_space": 18*num_agents,
    "model_dir": None,
    "episode_length": episode_length,
    "gamma": 0.98,
    "gae_lambda": 0.95,
    "use_gae": True,
    "clip_param": 0.2,
    "ppo_epoch": 15,
    "num_mini_batch": 1,
    "data_chunk_length": 10,
    "value_loss_coef": 0.5,
    "entropy_coef": 0.01,
    "max_grad_norm": 10.0,
    "huber_delta": 10.0,
    "use_max_grad_norm": True,
    "use_clipped_value_loss": True,
    "use_huber_loss": True,
    "use_popart": True,
    "use_valuenorm": False,
    "use_value_active_masks": True,
    "use_policy_active_masks": True,
    "lr": 7e-5,
    "critic_lr": 7e-4,
    "opti_eps": 1e-5,
    "weight_decay": 0,
    "gain": 0.01,
    "use_orthogonal": True,
    "use_feature_normalization": True,
    "use_ReLU": False,
    "stacked_frames": 1,
    "layer_N": 1,
    "n_rollout_threads": 1,
}


# Per-algorithm output directory (passed to wandb.init as `dir`).
run_dir = results_root / all_args["algorithm_name"]
config = {
    "all_args": all_args,
    "num_agents": num_agents,
    "num_enemies":num_enemies,
    "device": device,
    "run_dir": run_dir
}

*W&B记录训练日志*

In [None]:
if all_args["use_wandb"]:
    # Offline use (no internet): the previous, disabled recipe loaded the API
    # key from "<results parent>/keys.json" into WANDB_API_KEY and set
    # WANDB_MODE="dryrun" / WANDB_SAVE_CODE="true"; logs then have to be
    # uploaded later with `wandb sync wandb/run_name`.

    # Make sure the run directory exists before handing it to wandb.
    # NOTE(review): wandb is expected to fall back to a temp directory when
    # `dir` is missing or not writable -- confirm against the wandb docs.
    run_dir.mkdir(parents=True, exist_ok=True)
    run = wandb.init(
        config=all_args,
        project="simplecq",
        # project=all_args.env_name,
        # entity="cc",
        notes=socket.gethostname(),
        name=str(all_args["algorithm_name"])
        + "_seed"
        + str(all_args["seed"]),
        # group=all_args.scenario_name,
        dir=str(run_dir),
        # job_type="training",
        reinit='return_previous',
    )

# Process title shown in tools such as `ps`/`top`: "<algorithm>@<user tag>".
setproctitle.setproctitle(
    str(all_args["algorithm_name"])
    + "@"
    + str("lapluma030")
)

# Seed every RNG the notebook imports so runs are repeatable: Python's
# `random` (previously left unseeded), torch (CPU and all CUDA devices)
# and numpy.
random.seed(all_args["seed"])
torch.manual_seed(all_args["seed"])
torch.cuda.manual_seed_all(all_args["seed"])
np.random.seed(all_args["seed"])

[34m[1mwandb[0m: (1) Create a W&B account


[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Create a W&B account'
[34m[1mwandb[0m: Create an account here: https://wandb.ai/authorize?signup=true&ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\15961\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mwstybh[0m ([33mwstybh-beihang-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [88]:
# PPO hyperparameters
state_dim, action_dim = 18, 6           # observation size, number of discrete actions
lr_actor, lr_critic = 0.0003, 0.001     # actor / critic learning rates
gamma = 0.99                            # discount factor
K_epochs = 5                            # optimisation epochs per update
eps_clipping = 0.2                      # PPO clipping parameter
is_continuous_action_space = False      # discrete action space

# Build the PPO agent from the values above.
ppo_agent = PPO(
    state_dim=state_dim, action_dim=action_dim,
    lr_actor=lr_actor, lr_critic=lr_critic,
    gamma=gamma, K_epochs=K_epochs, eps_clipping=eps_clipping,
    is_continuous_action_space=is_continuous_action_space,
)


ModuleNotFoundError: No module named 'torch._dynamo'

*端口及输出类型指定*

In [None]:
# Port number handed to Agent(port=...) when the agent is constructed.
port = 40029
# Output type handed to Agent(outputs_type=...); matches the <uint64_t> tag
# that parse_Input emits. The commented-out dict below is an alternative
# form with one type per named command.
outputs_type = "uint64_t"
# outputs_type = {
#     "AgentCMD1": "uint64_t",
#     "AgentCMD2": "uint64_t"
# }

In [None]:
# Build the environment wrapper from gym_interface, wiring in the callbacks
# defined earlier in this notebook:
#   process_input  -> parse_Input      (action id -> command string)
#   process_output -> parse_Output     (raw messages -> observation dict)
#   reward_func    -> cal_Reward       (observation -> scalar reward)
#   end_func       -> cal_Termination  (observation -> episode finished?)
agent = Agent(port=port, 
              outputs_type=outputs_type,
              process_input=parse_Input,
              process_output=parse_Output,
              reward_func=cal_Reward,
              end_func=cal_Termination)

In [None]:
# s = agent.reset()
# a, b, c, _ = agent.step(0)
# print(s)
# print(a)
# print(b)   

In [None]:
# Curiosity model definition
class CuriosityModule(nn.Module):
    """MLP that predicts a state vector from a (state, action) pair.

    The state and action tensors are concatenated along the last dimension,
    passed through two ``Linear`` + ``ReLU`` layers of width ``hidden_size``,
    and projected back to ``state_dim`` outputs.

    Args:
        state_dim: Width of the state vector (also the output width).
        action_dim: Width of the action vector.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden_size=128):
        super().__init__()
        joint_width = state_dim + action_dim
        encoder_layers = [
            nn.Linear(joint_width, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
        ]
        self.state_action_encoder = nn.Sequential(*encoder_layers)
        self.state_predictor = nn.Linear(hidden_size, state_dim)

    def forward(self, state, action):
        """Return the predicted state for ``state`` and ``action``.

        Both tensors must agree on every dimension except the last.
        """
        joint = torch.cat((state, action), dim=-1)
        return self.state_predictor(self.state_action_encoder(joint))

# Instantiate the curiosity module, its optimizer and its loss.
# The sizes come from the PPO settings (state_dim / action_dim) rather than
# repeated literals, so the model always matches the one-hot action width
# that train() builds from `action_dim`.
curiosity_model = CuriosityModule(state_dim=state_dim, action_dim=action_dim)
curiosity_optimizer = optim.Adam(curiosity_model.parameters(), lr=1e-4)
mse_loss = nn.MSELoss()

*训练函数*

In [None]:
# Part 3: revised train function
def _initial_observation():
    """Return a fresh copy of the hard-coded 18-element starting state vector."""
    return np.array([0.0, 0.0, 0.0, 11.5, 3.5, 25.0,
                     0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                     0.0, 0.0, 0.0, 0.0, 0.0, 0.0])


def _curiosity_inputs(state, action_idx):
    """Build the (1, state_dim) state tensor and (1, action_dim) one-hot action tensor."""
    state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    action_one_hot = torch.zeros(action_dim, dtype=torch.float32)
    action_one_hot[action_idx] = 1.0
    return state_tensor, action_one_hot.unsqueeze(0)


def train(env: Agent, episodes, enable_log=True, record_dir="D:/A_code/simplecq/record"):
    """Run PPO training with a curiosity bonus.

    Relies on notebook globals: ``ppo_agent``, ``curiosity_model``,
    ``curiosity_optimizer``, ``mse_loss``, ``action_dim``, ``episode_length``,
    ``save_interval``, ``log_interval``, ``model_dir``, ``all_args``, ``run``
    and ``outputToTensor``. It also (re)sets the module-level ``count``.

    Each episode runs ``episode_length * time_interval`` environment steps.
    On decision steps the PPO policy picks a tactic; on all other steps the
    default tactic ``2`` is sent.

    Fixes relative to the previous version:
      * The curiosity model is fed the state *before* the step and scored
        against the state *after* it (it used to get the post-step state as
        both input and target).
      * The curiosity bonus is added to the reward, as the original comment
        intended (it used to be subtracted).
      * The curiosity model is trained on the recorded transitions (the loop
        used to reuse the last state/action for every sample).
      * Rewards and terminations seen on non-decision steps are credited to
        the preceding decision instead of being dropped, so the terminal
        +/-500 reward reaches PPO.
      * The tie count is printed as-is (it used to print ``Tie-1``, which
        could be ``-1`` and disagreed with the value logged to W&B).
      * The record directory is a parameter and is created once.

    Args:
        env: Environment wrapper exposing ``reset()`` and ``step(action)``,
            where ``step`` returns ``(obs, reward, done, info)``.
        episodes: Number of training episodes.
        enable_log: When true, print a progress line every 100 episodes.
        record_dir: Directory that receives ``action.txt`` (one line per
            decision: post-step state values followed by the tactic id).
    """
    Win = 0
    Lose = 0
    Tie = 0
    # Environment steps per unit of episode_length.
    time_interval = 20
    decision_interval = 100
    # Tactic ids the policy can choose from (indexed by the PPO action).
    TacticList = [1, 2, 14, 27, 30, 32]
    # Weight of the curiosity bonus in the total reward.
    curiosity_weight = 0.1

    init_val = env.reset()
    print("init_val:", init_val)

    global count
    count = 0

    os.makedirs(record_dir, exist_ok=True)
    record_path = f"{record_dir}/action.txt"

    for episode in range(episodes):
        beat = 0
        defeat = 0
        episode_rewards = 0

        # Start the rollout with an empty PPO buffer.
        ppo_agent.buffer.clear()
        current_state = _initial_observation()
        # (state_before, action_idx, state_after), one per decision step.
        transitions = []

        for step in range(episode_length*time_interval):
            # NOTE(review): the modulus is decision_interval-1 (= 99), so a
            # decision is taken every 99 steps, not every 100 -- kept as in
            # the original; confirm whether the "-1" is intended.
            if (step % (decision_interval-1)) == 0:
                previous_state = current_state
                action_idx = ppo_agent.select_action(current_state)

                # Execute the chosen tactic and observe the result.
                action = TacticList[action_idx]
                obs, rew, done, _ = env.step(action)
                current_state = outputToTensor(obs)

                # Append the post-step state and the tactic to the record file.
                with open(record_path, "a") as f:
                    state_str = ' '.join(map(str, current_state))
                    f.write(f"{state_str} {action}\n")

                # Curiosity bonus = forward-model error when predicting the
                # new state from the previous state and the action taken.
                state_tensor, action_tensor = _curiosity_inputs(previous_state, action_idx)
                actual_next_state = torch.tensor(current_state, dtype=torch.float32).unsqueeze(0)
                with torch.no_grad():  # scoring only; training happens after the rollout
                    predicted_next_state = curiosity_model(state_tensor, action_tensor)
                    curiosity_reward = mse_loss(predicted_next_state, actual_next_state).item()
                transitions.append((previous_state, action_idx, current_state))

                total_reward = rew + curiosity_weight * curiosity_reward

                # Store reward and terminal flag for this decision.
                ppo_agent.buffer.rewards.append(total_reward)
                ppo_agent.buffer.is_terminals.append(done)
                episode_rewards += total_reward
            else:
                # Default tactic between decisions.
                obs, rew, done, _ = env.step(action=2)

                # Credit whatever happens here to the most recent decision so
                # terminal rewards are not lost. Step 0 is always a decision
                # step, so the buffer is non-empty at this point.
                # (assumes buffer.rewards / is_terminals are plain lists --
                # they are only ever appended to in this notebook)
                ppo_agent.buffer.rewards[-1] += rew
                if done:
                    ppo_agent.buffer.is_terminals[-1] = True
                episode_rewards += rew

            # Restart the simulation on termination or after too many steps.
            if done or count > 10000:
                count = 0
                env.reset()
                current_state = _initial_observation()

            # Track whether either side was destroyed during this episode.
            if obs['PlaneInfo'][0]['entity_state'] == 5:
                defeat = 1
            if obs['TargetMessage']['entity_state'] == 5:
                beat = 1
            count += 1

        # Policy update at the end of every episode.
        ppo_agent.update()

        # Train the curiosity model on this episode's transitions.
        for prev_state, taken_idx, next_state in transitions:
            state_tensor, action_tensor = _curiosity_inputs(prev_state, taken_idx)
            predicted_next_state = curiosity_model(state_tensor, action_tensor)
            actual_next_state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)

            loss = mse_loss(predicted_next_state, actual_next_state)
            curiosity_optimizer.zero_grad()
            loss.backward()
            curiosity_optimizer.step()

        # Win / lose / tie bookkeeping (a win takes precedence).
        if beat == 1: Win += 1
        elif beat == 0 and defeat == 1: Lose += 1
        else: Tie += 1

        # Checkpoint periodically and after the final episode.
        if episode % save_interval == 0 or episode == episodes - 1:
            os.makedirs(str(model_dir), exist_ok=True)
            checkpoint_path = f"{str(model_dir)}/ppo_model_{episode}.pth"
            try:
                ppo_agent.save(checkpoint_path)
                print(f"模型已保存到: {checkpoint_path}")
            except Exception as e:
                print(f"保存模型时出错: {e}")
                # Fall back to the current working directory.
                alternative_path = f"./ppo_model_{episode}.pth"
                try:
                    ppo_agent.save(alternative_path)
                    print(f"模型已保存到备用路径: {alternative_path}")
                except Exception as e2:
                    print(f"保存到备用路径也失败: {e2}")

        # Periodic console / W&B logging.
        if episode % log_interval == 0:
            print(
                f"平均回合奖励：{episode_rewards:.3f} \t"
                f"总步数：{(episode + 1) * episode_length} \t"
                f"完成度：{(episode + 1) / episodes * 100:.3f}%"
            )
            print(f"训练进度：胜：{Win} 负：{Lose} 平：{Tie}")

            if all_args["use_wandb"]:
                wandb.log({
                    "episode_reward": episode_rewards,
                    "win_count": Win,
                    "lose_count": Lose,
                    "tie_count": Tie,
                    "win_rate": Win / (episode + 1)
                }, step=(episode + 1) * episode_length)

        if episode % 100 == 0 and enable_log:
            print(f"已完成：{episode / episodes * 100}%")

    print(f"训练结束。胜：{Win} 负：{Lose} 平：{Tie}")

    if all_args["use_wandb"]:
        run.finish()

*运行训练*

In [None]:
# Train for 500 episodes against the Agent built earlier in this notebook.
train(agent, 500)