In [1]:
import time
import win32gui
import win32con
import ctypes
from PIL import ImageGrab
import win32com.client
import os
import torch
import cv2
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image

import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

import win32api

In [2]:
# Value handed to AllowSetForegroundWindow by allow_set_foreground_window().
ASFW_ANY: int = -1
# Attribute id passed to DwmGetWindowAttribute to query the extended frame bounds.
DWMWA_EXTENDED_FRAME_BOUNDS: int = 9

def get_window_pos(name):
    """Locate a top-level window by title and return its screen rectangle.

    Args:
        name: Exact window title passed to ``win32gui.FindWindow``.

    Returns:
        ``((left, top, right, bottom), handle)`` using the DWM extended frame
        bounds, or ``None`` when no window with that title exists.
    """
    # `import ctypes` does not load the `wintypes` submodule by itself, so
    # `ctypes.wintypes` only worked if some other package had imported it first.
    from ctypes import wintypes

    handle = win32gui.FindWindow(0, name)
    if handle == 0:
        return None

    # Ask DWM for the extended frame bounds of the window.
    rect = wintypes.RECT()
    ctypes.windll.dwmapi.DwmGetWindowAttribute(
        wintypes.HWND(handle),
        wintypes.DWORD(DWMWA_EXTENDED_FRAME_BOUNDS),
        ctypes.byref(rect),
        ctypes.sizeof(rect),
    )
    return (rect.left, rect.top, rect.right, rect.bottom), handle

def allow_set_foreground_window():
    """Call user32.AllowSetForegroundWindow with ASFW_ANY."""
    ctypes.windll.user32.AllowSetForegroundWindow(ASFW_ANY)

def fetch_image_first(window_name):
    """Restore and focus the named window, then grab a screenshot of it.

    Args:
        window_name: Window title forwarded to ``get_window_pos``.

    Returns:
        ``((x1, y1, x2, y2), image)`` where ``image`` is the ``ImageGrab``
        capture of that rectangle, or ``None`` if the window cannot be found
        or is not visible.
    """
    located = get_window_pos(window_name)
    if located is None:
        print("未找到指定窗口")
        return None
    _, hwnd = located

    # Ask the window to restore itself (in case it is minimised) and wait for it.
    win32gui.SendMessage(hwnd, win32con.WM_SYSCOMMAND, win32con.SC_RESTORE, 0)
    time.sleep(1)

    allow_set_foreground_window()

    # Send an Alt keystroke through WScript.Shell before requesting foreground.
    wscript = win32com.client.Dispatch("WScript.Shell")
    wscript.SendKeys('%')
    win32gui.SetForegroundWindow(hwnd)
    time.sleep(1)

    if not win32gui.IsWindowVisible(hwnd):
        print("窗口不可见")
        return None

    # Query the rectangle again now that the window has been restored.
    located = get_window_pos(window_name)
    if located is None:
        print("未找到指定窗口")
        return None
    bounds, hwnd = located
    left, top, right, bottom = bounds

    screenshot = ImageGrab.grab(bbox=(left, top, right, bottom))
    return (left, top, right, bottom), screenshot

# Image similarity helper: returns the best template-matching score.
def template_match(template, image):
    """Return the maximum TM_CCOEFF_NORMED score of ``template`` inside ``image``.

    The template is cast to the image's dtype first when the two differ.
    """
    same_depth = template.dtype == image.dtype
    if not same_depth:
        template = template.astype(image.dtype)

    scores = cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)
    best_score = cv2.minMaxLoc(scores)[1]
    return best_score

def template_matching(target_image, template_image, ori_width, ori_height):
    """Find the centre of the best match of a template or its horizontal mirror.

    Args:
        target_image: 2-D array that is scaled by 255 and cast to uint8 here
            (the caller in this notebook passes grayscale values in [0, 1]).
        template_image: 2-D template array, scaled and cast the same way.
        ori_width: Unused; kept for interface compatibility.
        ori_height: Unused; kept for interface compatibility.

    Returns:
        ``(x, y)`` centre of the best-scoring match in target coordinates.
        The mirrored template wins ties, as before.
    """
    target_u8 = (target_image * 255).astype(np.uint8)
    template_u8 = (template_image * 255).astype(np.uint8)

    # Score the template as given.
    scores = cv2.matchTemplate(target_u8, template_u8, cv2.TM_CCOEFF_NORMED)
    _, score_original, _, loc_original = cv2.minMaxLoc(scores)

    # Score the horizontally mirrored template.
    mirrored = cv2.flip(template_u8, flipCode=1)
    scores = cv2.matchTemplate(target_u8, mirrored, cv2.TM_CCOEFF_NORMED)
    _, score_flipped, _, loc_flipped = cv2.minMaxLoc(scores)

    top_left = loc_original if score_original > score_flipped else loc_flipped

    # Mirroring keeps the template size, so one shape serves both cases.
    # The previous debug copy + rectangle drawing was never used and is gone.
    h, w = template_u8.shape
    return (top_left[0] + w // 2, top_left[1] + h // 2)

# Returns the position of the (partly transparent) template inside the target.
def transparent_template_matching(target_image, template_image, ori_width, ori_height):
    """Locate a template with an alpha channel, trying it as-is and mirrored.

    Args:
        target_image: Colour image convertible with ``np.array``. The caller in
            this notebook passes a PIL screenshot, whose channels are in RGB
            order, so it is converted with ``COLOR_RGB2GRAY``.
        template_image: Array with at least four channels; channels 0-2 are
            treated as BGR (it is loaded with ``cv2.imread`` in this notebook)
            and channel 3 is used as the matching mask.
        ori_width: Unused; kept for interface compatibility.
        ori_height: Unused; kept for interface compatibility.

    Returns:
        ``(x, y)`` centre of the best-scoring match in target coordinates.
        The mirrored template wins ties, as before.
    """
    # The screenshot is RGB while the template is BGR; using BGR2GRAY on both
    # swapped the red/blue weights for the target only.
    target_gray = cv2.cvtColor(np.array(target_image), cv2.COLOR_RGB2GRAY)

    mask = template_image[:, :, 3]  # alpha channel
    template_gray = cv2.cvtColor(template_image[:, :, :3], cv2.COLOR_BGR2GRAY)

    # Score the template as given, ignoring transparent pixels.
    scores = cv2.matchTemplate(target_gray, template_gray, cv2.TM_CCOEFF_NORMED, mask=mask)
    _, score_original, _, loc_original = cv2.minMaxLoc(scores)

    # Score the mirrored template. The mask must be mirrored and applied too;
    # previously this call had no mask, so the two scores were not comparable.
    mirrored_gray = cv2.flip(template_gray, flipCode=1)
    mirrored_mask = cv2.flip(mask, flipCode=1)
    scores = cv2.matchTemplate(target_gray, mirrored_gray, cv2.TM_CCOEFF_NORMED, mask=mirrored_mask)
    _, score_flipped, _, loc_flipped = cv2.minMaxLoc(scores)

    top_left = loc_original if score_original > score_flipped else loc_flipped

    # Mirroring keeps the template size, so one shape serves both cases.
    h, w = template_gray.shape
    return (top_left[0] + w // 2, top_left[1] + h // 2)


In [3]:
# Whether previously saved model weights should be loaded ('y' = yes).
load_model = 'y'

# First capture: locks in the window rectangle used by every later screenshot.
first_capture = fetch_image_first("test wanna Medium")
if first_capture is None:
    # fetch_image_first returns None when the window is missing or hidden;
    # fail here with a clear message instead of an unpacking TypeError.
    raise RuntimeError("Could not capture the 'test wanna Medium' window.")
(x1, y1, x2, y2), img = first_capture

# Save victory / death screenshots (kept for manual use).
#grab_image = ImageGrab.grab(bbox=(x1, y1, x2, y2))
#grab_image.save('victory_template.png')

# Width and height of the initial capture.
ori_width = x2 - x1
ori_height = y2 - y1

# Endpoint template: grayscale, scaled to [0, 1].
endpoint_image_path = 'endpoint.png'
endpoint_image = Image.open(endpoint_image_path)
endpoint_image = endpoint_image.convert('L')
endpoint_image = np.array(endpoint_image) / 255.0
endpoint_loc = (0, 0)


# Character template: read unchanged so any alpha channel is preserved.
character_image_path = 'character.png'
# character_image = Image.open(character_image_path)
# character_image = character_image.convert('L')
# character_image= np.array(character_image) / 255.0
character_image = cv2.imread(character_image_path, cv2.IMREAD_UNCHANGED)
character_loc = (0, 0)

# 检测探测位置函数是否好使
# tem_image_path = 'death_template.PNG'
# tem_image = Image.open(tem_image_path)
# tem_image = tem_image.convert('L')
# tem_image= np.array(tem_image) / 255.0

# center_point = template_matching(tem_image, endpoint_image,  ori_width, ori_height)
# print(f"Center point: {center_point}")



In [4]:
import tkinter as tk
from tkinter import filedialog

def select_file_and_return_filename():
    """Show a file-open dialog and return the chosen file's base name.

    Returns:
        The final path component of the selected file, or ``None`` if the
        user cancelled the dialog.
    """
    root = tk.Tk()
    root.withdraw()  # hide the empty main window behind the dialog
    try:
        file_path = filedialog.askopenfilename()
    finally:
        # The temporary root used to be leaked, leaving a stray Tk instance
        # alive next to the status window created later in the notebook.
        root.destroy()

    if not file_path:
        return None  # dialog was cancelled
    return os.path.basename(file_path)

In [5]:
# Use the same case-insensitive check as the checkpoint-loading cell; with the
# old `== 'y'` test an upper-case 'Y' skipped this cell and left both names
# undefined for the later cell.
if load_model.lower() == 'y':
    model_training_file = select_file_and_return_filename()
    target_training_file = select_file_and_return_filename()
else:
    model_training_file = None
    target_training_file = None

In [6]:
# # 定义 DQN 模型
# class DQN(nn.Module):
#     def __init__(self, input_shape, num_actions):
#         super(DQN, self).__init__()
#         self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4)
#         self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
#         self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
#         self.fc1 = nn.Linear(64 * 7 * 7, 512)
#         self.fc2 = nn.Linear(512, num_actions)

#     def forward(self, x):
#         x = torch.relu(self.conv1(x))
#         x = torch.relu(self.conv2(x))
#         x = torch.relu(self.conv3(x))
#         x = x.view(x.size(0), -1)
#         x = torch.relu(self.fc1(x))
#         return self.fc2(x)


import torch
import torch.nn as nn
import torch.nn.functional as F

class DQN(nn.Module):
    """Convolutional Q-network with batch normalisation.

    Three conv + BatchNorm2d + ReLU stages feed a 512-unit fully connected
    layer (with BatchNorm1d) and a linear head producing one value per action.

    Args:
        input_shape: ``(channels, height, width)`` of a single observation.
        num_actions: Number of outputs of the final linear layer.
    """

    def __init__(self, input_shape, num_actions):
        super().__init__()
        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(64)

        # Size of the flattened conv output, measured with a dummy forward pass.
        self.flatten_size = self._get_conv_output(input_shape)

        self.fc1 = nn.Linear(self.flatten_size, 512)
        self.bn_fc1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, num_actions)

    def _get_conv_output(self, shape):
        """Return the number of features the conv stack emits for one sample."""
        was_training = self.training
        # Probe in eval mode without gradients so the dummy input does not
        # update the BatchNorm running statistics of a freshly built network.
        self.eval()
        try:
            with torch.no_grad():
                features = self._forward_features(torch.zeros(1, *shape))
        finally:
            self.train(was_training)
        return features.view(1, -1).size(1)

    def _forward_features(self, x):
        """Apply the three conv / batch-norm / ReLU stages."""
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return x

    def forward(self, x):
        """Return Q-values of shape ``(batch, num_actions)``.

        A batch of one sample is evaluated in eval mode, as before. The
        previous mode is restored afterwards; the old code always switched
        back to train mode, silently undoing an earlier ``model.eval()``.
        """
        switch_to_eval = x.size(0) == 1 and self.training
        if switch_to_eval:
            self.eval()
        try:
            x = self._forward_features(x)
            x = x.view(x.size(0), -1)
            x = F.relu(self.bn_fc1(self.fc1(x)))
            return self.fc2(x)
        finally:
            if switch_to_eval:
                self.train()


# Experience replay buffer
class ReplayBuffer:
    """Bounded FIFO store of ``(state, action, reward, next_state, done)`` tuples."""

    def __init__(self, capacity):
        # deque drops the oldest transition once `capacity` is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions.

        Returns:
            ``(states, actions, rewards, next_states, dones)``; states and
            next_states are numpy arrays, the other three are tuples.
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return np.array(states), actions, rewards, np.array(next_states), dones

    def __len__(self):
        return len(self.buffer)

# Environment interaction
def get_state():
    """Capture the game window and derive the observation plus key positions.

    Reads the module-level capture rectangle ``(x1, y1, x2, y2)``, the
    templates ``endpoint_image`` / ``character_image`` and the original
    window size.

    Returns:
        ``(frame, character_loc, endpoint_loc)`` where ``frame`` is the
        screenshot resized to 84x84, converted to grayscale and scaled to
        [0, 1], and the two locations are the match centres found in the
        full-size screenshot.
    """
    img = ImageGrab.grab(bbox=(x1, y1, x2, y2))

    # Locate the endpoint and the character in the full-resolution capture.
    full_gray = np.array(img.convert('L')) / 255.0
    endpoint_loc = template_matching(full_gray, endpoint_image, ori_width, ori_height)
    character_loc = transparent_template_matching(img, character_image, ori_width, ori_height)

    # The old `if img is not None` guard ran after `img` had already been
    # used above, so its `None, None, None` fallback could never be reached.
    frame = np.array(img.resize((84, 84)).convert('L')) / 255.0
    return frame, character_loc, endpoint_loc


# Action selection
def select_action(state, epsilon, mask, num_actions):
    """Epsilon-greedy choice restricted to the actions enabled in ``mask``.

    Args:
        state: Observation fed to the module-level ``model`` when acting greedily.
        epsilon: Exploration threshold compared against ``random.random()``.
        mask: Tensor whose entries equal to 1 mark the allowed actions.
        num_actions: Total number of actions.

    Returns:
        ``(q_values, action)``. ``q_values`` is the masked network output for
        a greedy step and ``None`` for a random step.
    """
    mask = mask.view(1, -1)

    greedy = random.random() > epsilon
    if not greedy:
        # Exploration: pick uniformly among the indices with a non-zero mask.
        allowed_indices = mask.view(-1).nonzero(as_tuple=True)[0].tolist()
        return None, random.choice(allowed_indices)

    with torch.no_grad():
        batch = torch.tensor(state, dtype=torch.float32).unsqueeze(0).unsqueeze(0).cuda()
        q_values = model(batch) * mask

        # Consider only the Q-values whose mask entry is 1.
        allowed_q = q_values[mask == 1]
        allowed_actions = torch.arange(num_actions, device=mask.device)[mask.view(-1) == 1]

        best_position = allowed_q.argmax().item()
        return q_values, allowed_actions[best_position].item()
        
    


# # 测试 get_state 函数
# state, a, b= get_state()

# if state is not None:
#     print(a)
#     print(b)
#     print("State shape:", state.shape)
#     print("State dtype:", state.dtype)
#     img = Image.fromarray((state * 255).astype(np.uint8))  # 转换回图片以查看效果
#     img.show()


# Number of bins handed to quantize_state by compute_reward.
bins = 16
# Global record of visited state hashes; only the newest 100000 are kept.
visited_states = deque(maxlen=100000)


# State quantisation
def quantize_state(state, bins=16, max_value=1.0):
    """Bucket every value of ``state`` into one of ``bins`` intensity levels.

    Args:
        state: Array-like of intensities in ``[0, max_value]``.
        bins: Number of bin edges spread evenly over ``[0, max_value]``.
        max_value: Upper end of the intensity range. Defaults to 1.0 because
            the states produced by ``get_state`` are divided by 255. The old
            edges spanned 0..255, so every such value fell into bin 1 and all
            states quantised (and hashed) identically. Pass 255 for raw
            8-bit input.

    Returns:
        Integer array of bin indices as produced by ``np.digitize``.
    """
    edges = np.linspace(0, max_value, bins)
    return np.digitize(state, edges)

# State hashing
def hash_state(state):
    """Return the SHA-256 hex digest of ``state.tobytes()``."""
    return hashlib.sha256(state.tobytes()).hexdigest()


def compute_reward(state,character_loc,endpoint_loc, next_state, next_character_loc, next_endpoint_loc):
    """Score a transition and report character-to-endpoint distances.

    Starts from -10, adds 10 for a state hash not yet in ``visited_states``
    (and records it) or subtracts 100 for a repeated one, then adds 10000 on
    victory or subtracts 10000 on death.

    Returns:
        ``(current_distance, next_distance, reward)``; the distances are
        Euclidean norms between the character and endpoint locations before
        and after the step. ``state`` itself is not used.
    """
    fingerprint = hash_state(quantize_state(next_state, bins))

    if fingerprint in visited_states:
        # Seen before: base cost plus the repeat penalty.
        reward = -10 - 100
    else:
        # New state: the exploration bonus cancels the base cost.
        reward = -10 + 10
        visited_states.append(fingerprint)

    def _gap(a, b):
        return np.linalg.norm(np.array(a) - np.array(b))

    current_distance = _gap(character_loc, endpoint_loc)
    next_distance = _gap(next_character_loc, next_endpoint_loc)

    # Terminal outcomes dominate the reward; victory is checked first.
    if check_if_victory(next_state):
        reward += 10000
    elif check_if_death(next_state):
        reward -= 10000

    return current_distance, next_distance, reward

def compute_distance_reward(last_d1, d1):
    """Reward proportional to how much the distance shrank since the last step.

    Returns ``10 * (last_d1 - d1)`` when the distance decreased, the negative
    of ten times the increase when it grew, and ``0`` otherwise.
    """
    shrink = last_d1 - d1

    if shrink > 0:
        # Moved closer to the target.
        return 10 * shrink
    if shrink < 0:
        # Moved away from the target.
        return -10 * abs(shrink)
    return 0

In [7]:

# Action definitions
ACTIONS = [
    'left', 'right', 'shift',
    're_left', 're_right', 're_shift',
    'none_op',
]
KEY_CODES = [
    0x25,  # left arrow
    0x27,  # right arrow
    0x10,  # Shift
    0x52,  # 'R'
]

def press_key(hex_key_code):
    """Send a key-down event for ``hex_key_code`` via ``win32api.keybd_event``."""
    scan_code, flags, extra_info = 0, 0, 0
    win32api.keybd_event(hex_key_code, scan_code, flags, extra_info)

def release_key(hex_key_code):
    """Send a key-up event for ``hex_key_code`` via ``win32api.keybd_event``."""
    key_up = win32con.KEYEVENTF_KEYUP
    win32api.keybd_event(hex_key_code, 0, key_up, 0)


def release_all_keys():
    """Send a key-up event for every code listed in ``KEY_CODES``."""
    for vk in KEY_CODES:
        release_key(vk)
        
def perform_reset():
    """Tap 'R' for 0.1 s, then release every tracked key."""
    reset_key = 0x52  # 'R'
    press_key(reset_key)
    time.sleep(0.1)
    release_key(reset_key)
    release_all_keys()
    
def perform_jump():
    """Hold Shift for 0.5 s, then release every tracked key."""
    jump_key = 0x10  # Shift
    press_key(jump_key)
    time.sleep(0.5)
    release_key(jump_key)
    release_all_keys()
    
def perform_attack():
    """Tap the 'Z' key for 0.1 s."""
    # keybd_event takes a virtual-key code. 'Z' is 0x5A; the previous value
    # 0x2C is VK_SNAPSHOT (Print Screen), so the attack key was never sent.
    attack_key = 0x5A
    press_key(attack_key)
    time.sleep(0.1)
    release_key(attack_key)
    #release_all_keys()  # release every key
    
# #最开始的想法
# def perform_action(action):
#     # 执行动作
#     if action == 0:  # 左移
#         press_key(0x25)  # 左箭头键
#         time.sleep(0.1)
#         release_key(0x25)
#     elif action == 1:  # 右移
#         press_key(0x27)  # 右箭头键
#         time.sleep(0.1)
#         release_key(0x27)
#     elif action == 2:  # Shift
#         press_key(0x10)  # Shift 键
#         time.sleep(0.1)
#         release_key(0x10)
#     elif action == 3:  # 长按 Shift
#         press_key(0x10)  # Shift 键
#         time.sleep(0.6)
#         release_key(0x10)
   
    
def perform_action(state, character_loc, endpoint_loc,action):
    """Apply one discrete action to the game and observe the result.

    Actions 0-2 press left / right / Shift, 3-5 release them, 6 only waits.
    The module-level ``state_action`` list records which keys are held.

    Returns:
        ``(d1, d2, reward, next_state, next_character_loc, next_endpoint_loc, done)``
        where ``d1`` / ``d2`` and ``reward`` come from ``compute_reward`` and
        ``done`` from ``check_if_done``.
    """
    global state_action

    # (action id, virtual-key code, slot in state_action, 1 = press / 0 = release)
    key_table = (
        (0, 0x25, 0, 1),  # press left arrow
        (1, 0x27, 1, 1),  # press right arrow
        (2, 0x10, 2, 1),  # press Shift
        (3, 0x25, 0, 0),  # release left arrow
        (4, 0x27, 1, 0),  # release right arrow
        (5, 0x10, 2, 0),  # release Shift
    )

    for action_id, vk, slot, held in key_table:
        if action == action_id:
            if held:
                press_key(vk)
            else:
                release_key(vk)
            time.sleep(0.05)
            state_action[slot] = held
            break
    else:
        if action == 6:
            # No-op: just let the game advance.
            time.sleep(0.05)

    # Observe the state that follows the action.
    next_state, next_character_loc, next_endpoint_loc = get_state()

    # Reward and distances for this transition.
    d1, d2, reward = compute_reward(
        state, character_loc, endpoint_loc,
        next_state, next_character_loc, next_endpoint_loc,
    )

    # Episode-termination flag.
    done = check_if_done(next_state, reward)

    return d1, d2, reward, next_state, next_character_loc, next_endpoint_loc, done


def update_mask(state_action, mask, num_actions):
    """Rebuild the action mask in place from the currently held keys.

    Args:
        state_action: Sequence whose first three entries are 1 when left,
            right and Shift are held.
        mask: Tensor indexed as ``mask[0, action]``; it is zeroed and refilled.
        num_actions: Unused.

    Returns:
        The same ``mask`` tensor: a release action is enabled for each held
        key, press-left / press-right only when neither direction is held,
        press-Shift only when Shift is not held, and the no-op always.
    """
    left_held = state_action[0] == 1
    right_held = state_action[1] == 1
    shift_held = state_action[2] == 1

    mask.fill_(0)

    if left_held:
        mask[0, 3] = 1  # may release left
    if right_held:
        mask[0, 4] = 1  # may release right
    if not (left_held or right_held):
        # Left and right are never held together, so a new direction may
        # only be pressed while neither is down.
        mask[0, 0] = 1
        mask[0, 1] = 1

    # Shift toggles between "may release" (5) and "may press" (2).
    mask[0, 5 if shift_held else 2] = 1

    mask[0, 6] = 1  # the no-op is always allowed
    return mask

# # 测试 perform_action 函数
# action = 0  # 示例动作
# reward, next_state, done = perform_action(action)
# print("Reward:", reward)
# print("Next state shape:", next_state.shape)
# print("Done:", done)


In [8]:
# Decoded grayscale templates keyed by (path, width, height). The victory and
# death checks run several times per environment step, and each call used to
# re-read and re-decode its PNG from disk.
_TEMPLATE_CACHE = {}


def _load_template(path, width, height):
    """Return the uint8 grayscale template at ``path`` resized to ``(width, height)``."""
    key = (path, width, height)
    if key not in _TEMPLATE_CACHE:
        with Image.open(path) as raw:
            resized = raw.convert('L').resize((width, height))
        # Same /255 then *255 round trip as before, so scores are unchanged.
        normalized = np.array(resized) / 255.0
        _TEMPLATE_CACHE[key] = (normalized * 255).astype(np.uint8)
    return _TEMPLATE_CACHE[key]


def _template_score(state, path):
    """Best match score between ``state`` and the template stored at ``path``."""
    template = _load_template(path, state.shape[1], state.shape[0])
    return template_match(template, (state * 255).astype(np.uint8))


def check_if_victory(state):
    """Return True when ``state`` matches 'victory_template.png' with score > 0.9.

    ``state`` is a 2-D array that is scaled by 255 and cast to uint8 before
    matching; the template is resized to the state's shape.
    """
    return _template_score(state, 'victory_template.png') > 0.9


def check_if_death(state):
    """Return True when ``state`` matches 'death_template.png' with score > 0.95.

    ``state`` is a 2-D array that is scaled by 255 and cast to uint8 before
    matching; the template is resized to the state's shape.
    """
    return _template_score(state, 'death_template.png') > 0.95


def check_if_done(state,reward):
    """Classify ``state`` as running (0), death (1) or victory (2).

    Death is checked before victory, and the outcome is printed when the
    episode ends. ``reward`` is not used.
    """
    if check_if_death(state):
        print("death")
        return 1
    if check_if_victory(state):
        print("win")
        return 2
    return 0

In [9]:
import tkinter as tk
import random
import hashlib
# Small status window pinned to the top-left corner of the screen.
root = tk.Tk()
root.title("Action and Reward Display")
root.geometry("500x300+0+0")


def _place_label(text, y):
    """Create a label on ``root`` with the given text at x=5 and the given y."""
    label = tk.Label(root, text=text)
    label.place(x=5, y=y)
    return label


# Action and reward
action_label_text = _place_label("Action: None", 10)
reward_label_text = _place_label("Reward: None", 40)

# Mask and held-key state
mask_label_text = _place_label("Mask: None", 70)
state_action_label_text = _place_label("State Action: None", 100)

# Q-values
qvalues_label_text = _place_label("Q Values: None", 130)

# Distances before and after the step
distance_label_text = _place_label("Last Disatance: None", 160)
distance2_label_text = _place_label("Update Disatance: None", 190)

# Display text for each action index
action_labels = ["按←",  "按→", "按↑", "松←", "松→","松↑","停□"]


# 更新显示的函数

# def update_display(action, reward):
#     action_label_text.config(text=f"Action: {action_labels[action]}")
#     reward_label_text.config(text=f"Reward: {reward:.2f}")
#     root.update()


def update_display(action, reward, mask, state_action,qvalues,distance,d2):
    """Refresh every label of the status window and redraw it.

    Args:
        action: Index into ``action_labels``.
        reward: Number shown with two decimals.
        mask: Tensor shown via ``mask.cpu().numpy()``.
        state_action: Held-key list, shown as-is.
        qvalues: Tensor of Q-values, or ``None`` for a random action.
        distance: Value shown as the last distance.
        d2: Value shown as the updated distance.
    """
    action_label_text.config(text=f"Action: {action_labels[action]}")
    reward_label_text.config(text=f"Reward: {reward:.2f}")
    mask_label_text.config(text=f"Mask: {mask.cpu().numpy()}")
    state_action_label_text.config(text=f"State Action: {state_action}")

    # Identity test: `== None` on a tensor goes through the tensor's own
    # comparison operator instead of simply checking for a missing value.
    if qvalues is None:
        qvalues_label_text.config(text=f"Q Values: None")
    else:
        formatted = np.array2string(qvalues.cpu().numpy(), precision=2, separator=',')
        qvalues_label_text.config(text=f"Q Values: {formatted}")

    distance_label_text.config(text=f"Last Disatance: {distance}")
    distance2_label_text.config(text=f"Update Disatance: {d2}")

    root.update()

In [None]:

# Hyperparameters
num_actions = 7
capacity = 10000
batch_size = 32
gamma = 0.99
epsilon_start, epsilon_end, epsilon_decay = 1.0, 0.01, 500
learning_rate = 0.01

# Online and target networks; the target starts as a copy of the online net.
input_shape = (1, 84, 84)
model = DQN(input_shape, num_actions).cuda()
target_model = DQN(input_shape, num_actions).cuda()
target_model.load_state_dict(model.state_dict())

# Adam optimiser with a StepLR schedule that adjusts the learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.95)

replay_buffer = ReplayBuffer(capacity)


if load_model.lower() == 'y':
    # Load saved parameters into the online and the target network.
    for _net, _path in ((model, model_training_file), (target_model, target_training_file)):
        # `_path` is None when the file dialog was cancelled, and
        # os.path.exists(None) raises TypeError, so test for that first.
        if _path and os.path.exists(_path):
            _net.load_state_dict(torch.load(_path))
            _net.eval()
        else:
            print(f"Model parameters file '{_path}' not found.")
else:
    print("Model parameters not loaded.")

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

import re
import os    
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'


# Range and resolution of the perturbation sweep.
delta_range = 3.0
num_points = 50
deltas = np.linspace(start=-delta_range, stop=delta_range, num=num_points)

# Reward grid, one cell per (delta1, delta2) pair.
rewards = np.zeros(shape=(num_points, num_points))

In [None]:
# Exploration rate used by select_action during the sweep.
epsilon = 0.01

In [None]:

# Run the sweep and collect the mean reward at every grid point.

# Snapshot of the unperturbed weights. model.state_dict() returns tensors that
# share storage with the live parameters, so the old in-place copy_ stacked
# each perturbation on top of all previous ones.
base_state = {key: value.detach().clone() for key, value in model.state_dict().items()}

# One fixed pair of unit-norm random directions per parameter. They used to be
# re-drawn at every grid point, so the grid was not a 2-D slice of the
# parameter space. randn_like already allocates on the parameter's device.
directions = {}
for name, param in model.named_parameters():
    direction1 = torch.randn_like(param)
    direction2 = torch.randn_like(param)
    directions[name] = (
        direction1 / torch.norm(direction1),
        direction2 / torch.norm(direction2),
    )

num_episodes = 5

for i, delta1 in enumerate(deltas):
    for j, delta2 in enumerate(deltas):
        # Weights for this grid point, always measured from the base weights.
        perturbed_state = dict(base_state)
        for name, (direction1, direction2) in directions.items():
            perturbed_state[name] = (
                base_state[name]
                + float(delta1) * direction1
                + float(delta2) * direction2
            )
        model.load_state_dict(perturbed_state)

        # Sum over all episodes. The old code reset this inside the episode
        # loop, so only the last episode was counted before dividing.
        total_reward = 0
        for _ in range(num_episodes):
            # Fresh episode: no keys held, every action allowed.
            state_action = [0, 0, 0, 0]
            mask = torch.ones(1, num_actions).cuda()
            time.sleep(1)
            perform_reset()
            state, character_loc, endpoint_loc = get_state()

            done = False
            while not done:
                # Restrict the choice to actions valid for the held keys.
                mask = update_mask(state_action, mask, num_actions)

                qvalues, action = select_action(state, epsilon, mask, num_actions)

                # Step the game; carry the new locations into the next step
                # instead of reusing the ones from the start of the episode.
                d1, d2, reward, state, character_loc, endpoint_loc, done = perform_action(
                    state, character_loc, endpoint_loc, action
                )

                total_reward += reward

        rewards[i, j] = total_reward / num_episodes

# Leave the model with its original weights once the sweep is finished.
model.load_state_dict(base_state)

# Plot the reward surface. indexing='ij' makes X[i, j] = deltas[i] and
# Y[i, j] = deltas[j], matching rewards[i, j]; the default layout swapped
# the two axes relative to their labels.
X, Y = np.meshgrid(deltas, deltas, indexing='ij')
Z = rewards

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, Z, cmap='viridis')
ax.set_xlabel('Delta 1')
ax.set_ylabel('Delta 2')
ax.set_zlabel('Mean Reward')
plt.show()
