In [1]:
import jnius_config
jnius_config.set_classpath('.', 'hyflex/*')
from jnius import autoclass

In [2]:
import random
import re
import json
import pickle
import configparser
import time
from itertools import count

In [3]:
# 定义 Solution 类
class Solution:
    def __init__(self, id, solution, fitness):
        self.id = id
        self.solution = solution
        self.fitness = fitness

    def __len__(self):
        return len(self.solution)

    def __eq__(self, other):
        return self.id == other.id

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        return (self.fitness, self.id) < (other.fitness, other.id)

    def __le__(self, other):
        return self.fitness <= other.fitness

    def __gt__(self, other):
        return (self.fitness, self.id) > (other.fitness, other.id)

    def __ge__(self, other):
        return self.fitness >= other.fitness

    def copy(self):
        return copy.deepcopy(self)

    def compare(self, other):
        return self.solution == other.solution

    def distance(self, other):
        return 0

# 定义 ListSolution 类
class ListSolution(Solution):
    def __init__(self, id=0, solution=[], fitness=float('inf')):
        super().__init__(id, solution, fitness)

    def __str__(self):
        return str(self.solution)

    def __len__(self):
        return len(self.solution)

    def distance(self, other):
        diff = [1 if a != b else 0 for a, b in zip(self.solution, other.solution)]
        diff.extend([1] * abs(len(self) - len(other)))
        return np.mean(diff)

    def generate_random(self, n=10):
        self.solution = tuple(np.random.permutation(n))

# 定义 StatsInfo 类
class StatsInfo:
    def __init__(self, initial_fitness):
        self.fitness_hist = []  # 记录每次迭代的当前适应度值
        self.best_fitness_hist = []  # 记录每次迭代的最佳适应度值
        self.heuristic_hist = []  # 记录应用的启发式方法
        self.reward_hist = []  # 记录每次迭代的奖励值
        self.best_solution = None  # 最佳解决方案
        self.run_id = 0  # 运行标识符
        self.run_time = 0.0  # 运行时间
        self.initial_fitness = initial_fitness  # 初始适应度值
        self.best_fitness = None  # 最佳适应度值
        self.iterations = 0  # 迭代次数
        self.state_hist = []  # 记录每次迭代的状态

    def __str__(self):
        return str(self.best_fitness)

    def push_heuristic(self, heuristic, reward, state=None):
        self.heuristic_hist.append(heuristic)
        self.reward_hist.append(reward)
        if state:
            self.state_hist.append(state)

    def push_fitness(self, current, best):
        self.fitness_hist.append(current)
        self.best_fitness_hist.append(best)

    def save(self, outdir='.', save_csv=False):
        filepath = f'{outdir}/{self.run_id}.dat'
        pickle.dump(self, open(filepath, 'wb'))
        if save_csv:
            self.save_csv(outdir)

    def save_csv(self, outdir='.'):
        filename = 'fitness_history'
        history = self.best_fitness_hist
        initial = self.initial_fitness
        open_flag = 'w'
        if os.path.isfile(f'{outdir}/{filename}.csv'):
            open_flag = 'a'
        with open(f'{outdir}/{filename}.csv', open_flag, newline='') as evol_file:
            w = csv.writer(evol_file, delimiter=';')
            if open_flag == 'w':
                w.writerow(('run', 'iter', 'fitness'))
                w.writerow((self.run_id, 0, initial))
            for it, fitness in enumerate(history):
                line = (self.run_id, it+1, fitness)
                w.writerow(line)

                
# 定义 HyperHeuristic 类
class HyperHeuristic:
    def __init__(self, problem, agent, credit_assignment, acceptance):
        self.problem = problem
        self.agent = agent
        self.credit_assignment = credit_assignment
        self.acceptance = acceptance

    def __elapsed_time(self):
        self.elapsed = time.process_time() - self.start_time
        return self.elapsed

    def run(self, time_limit=3):
        self.problem.initialise_solution()
        current_fitness = self.problem.get_fitness()
        iterations = 0
        stats = StatsInfo(current_fitness)
        stats.push_fitness(current_fitness, current_fitness)
        self.start_time = time.process_time()
        while self.__elapsed_time() < time_limit:
            llh = self.agent.select()
            fitness = self.problem.apply_heuristic(llh)
            delta = current_fitness - fitness
            reward = self.credit_assignment.get_reward(llh, fitness, current_fitness)
            if self.acceptance.is_solution_accepted(delta):
                self.problem.accept_solution()
                current_fitness = fitness
            self.agent.update(action=llh, reward=reward, solution=self.problem.get_solution(), elapsed=self.elapsed)
            stats.push_fitness(current_fitness, self.problem.get_best_fitness())
            stats.push_heuristic(llh, reward, self.agent.get_env_state())
            iterations += 1
        stats.best_fitness = self.problem.get_best_fitness()
        stats.run_time = self.elapsed
        stats.iterations = iterations
        return stats

# 定义 AcceptAll 类
class AcceptAll:
    def is_solution_accepted(self, *args):
        return True

# 定义 RawImprovementPenalty 类
class RawImprovementPenalty:
    def __init__(self, config, actions, *args):
        pass

    def get_reward(self, action, new_fitness, past_fitness, *args):
        fir = (past_fitness - new_fitness) / past_fitness
        return fir

    def reset(self):
        pass

# 定义 StateBuilder 类
class StateBuilder:
    def __init__(self, state_classes, config, **kwargs):
        self.states = [state_cls(config, **kwargs) for state_cls in state_classes]

    def reset(self):
        for state_obj in self.states:
            state_obj.reset()

    def get_state(self):
        state = []
        for state_obj in self.states:
            state.extend(state_obj.get_state())
        return state

    def update(self, **kwargs):
        for state_obj in self.states:
            state_obj.update(**kwargs)

# 定义 FitnessImprovementRate 类
class FitnessImprovementRate:
    def __init__(self, config, **kwargs):
        self.discrete = config['FIR'].getboolean('discrete', False)
        self.fir = 0
        self.last_fitness = None

    def reset(self):
        self.fir = 0
        self.last_fitness = None

    def _get_discrete_state(self):
        if self.fir > 0:
            return 1
        elif self.fir == 0:
            return 0
        else:
            return -1

    def get_state(self):
        if self.discrete:
            return [self._get_discrete_state()]
        return [self.fir]

    def update(self, solution, **kwargs):
        if self.last_fitness is not None:
            self.fir = (self.last_fitness - solution.fitness) / self.last_fitness
        self.last_fitness = solution.fitness

# 定义 ElapsedTime 类
class ElapsedTime:
    def __init__(self, config, time_limit, **kwargs):
        self.time_limit = time_limit
        self.elapsed = 0

    def reset(self):
        pass

    def get_state(self):
        return [self.elapsed / self.time_limit]

    def update(self, elapsed, **kwargs):
        self.elapsed = elapsed

# 定义 Agent 类
class Agent:
    def __init__(self, actions, policy):
        self.actions = actions
        self.policy = policy

    def reset(self):
        raise NotImplementedError

    def select(self):
        action_idx = self.policy.select(self)
        return self.actions[action_idx]

    def get_env_state(self):
        return None

    def update(self, **kwargs):
        raise NotImplementedError

# 定义 RandomAgent 类
class RandomAgent(Agent):
    def __init__(self, config, actions, state_env, prior=[], **kwargs):
        super().__init__(actions, RoulettePolicy(config))
        self.prior = prior
        n_actions = len(actions)
        if len(prior) != n_actions:
            self.prior = [float(1/n_actions)] * n_actions
        self.value_estimates = self.prior
        self.state_env = state_env
        self.state = self.state_env.get_state()

    def __str__(self):
        return f'Random Selection'

    def reset(self):
        self.value_estimates = self.prior

    def get_env_state(self):
        return self.state

    def update(self, action, reward, solution, elapsed):
        self.state_env.update(action=action, 
                              reward=reward, 
                              solution=solution, 
                              elapsed=elapsed)
        self.state = self.state_env.get_state()

# 定义 RoulettePolicy 类
class RoulettePolicy:
    def __init__(self, config):
        pass

    def __str__(self):
        return f'Roulette Wheel'

    def select(self, agent):
        sample = range(len(agent.actions))
        return random.choices(sample, weights=agent.value_estimates)[0]

# 定义 HyFlexDomain 类
class HyFlexDomain:
    solution_indexer = count(1)

    def __init__(self, problem_str, instance_id, seed):
        with open(f'hyflex/problems_json/{problem_str}.json', 'r') as json_file:
            self.problem_dict = json.load(json_file)
        ProblemClass = autoclass(self.problem_dict['class'])
        self.problem = ProblemClass(seed)
        self.problem.loadInstance(instance_id)
        try:
            self.instance_name = self.problem_dict['instances'][str(instance_id)]
        except KeyError:
            self.instance_name = f'id_{instance_id}'
        self.actions = self.problem_dict['actions']

    def initialise_solution(self, idx=0):
        self.problem.initialiseSolution(idx)

    def get_fitness(self, idx=0):
        return self.problem.getFunctionValue(idx)

    def apply_heuristic(self, llh, src_idx=0, dest_idx=1):
        return self.problem.applyHeuristic(int(llh), int(src_idx), int(dest_idx))

    def accept_solution(self, src_idx=1, dest_idx=0):
        self.problem.copySolution(src_idx, dest_idx)

    def get_best_fitness(self):
        return self.problem.getBestSolutionValue()

    def get_solution(self, idx=0):
        solution_str = self.problem.solutionToString(idx)
        id = next(self.solution_indexer)
        return Solution(id, solution_str, self.get_fitness(idx))

# 定义 BinPacking 类
class BinPacking(HyFlexDomain):
    re_bin_items = re.compile(r'(\d+\.0, )')

    def __init__(self, instance_id, seed):
        super().__init__('BP', instance_id, seed)

    def get_solution(self, idx=0):
        solution_str = self.problem.solutionToString(idx)
        sorted_bins = []
        for bin in solution_str.split('\n')[:-2]:
            items = [float(it.strip('[, ]')) for it in re.findall(self.re_bin_items, bin)]
            sorted_bins.append(sorted(items))
        sorted_bins.sort()
        fitness = self.get_fitness(idx)
        id = next(self.solution_indexer)
        return ListSolution(id, sorted_bins, fitness)


# 定义 TSP 类
class TravelingSalesman(HyFlexDomain):
    def __init__(self, instance_id, seed):
        HyFlexDomain.__init__(self, 'TSP', instance_id, seed)

    def get_solution(self, idx=0):
        solution_str = self.problem.solutionToString(idx)
        solution_str = solution_str.split('\n')[1].strip()
        permutation = tuple((int(x) for x in solution_str.split(' ')))
        fitness = self.get_fitness(idx)
        id = next(self.solution_indexer)
        return ListSolution(id, permutation, fitness)
    

_output_path_dir = "/home/Chaofan_Tu/Documents/hyflex3/hhrl/results/"

# 定义代理对象的字典，包含了各种代理的名称和对应的类
agent_dict = {
        'RAND': RandomAgent,
        }
# 定义奖励对象的字典，包含了各种奖励的名称和对应的类
reward_dict = {
        'RIP': RawImprovementPenalty,
        }
# 定义状态对象的字典，包含了各种状态的名称和对应的类
state_dict = {
        'S7': [FitnessImprovementRate, ElapsedTime],
        }
# 定义接受对象的字典，包含了各种接受对象的名称和对应的类
acceptance_dict = {
    'ALL': AcceptAll,
        }
# 定义问题对象的字典，包含了各种问题的名称和对应的类
domain_dict = {
        'TSP': TravelingSalesman,
        'BP': BinPacking,
        }


In [4]:
seed = random.randint(0, 10000)
seed = 1501
print(seed)
instance_id = 0

if __name__ == "__main__":
    with open(f'hyflex/problems_json/BP.json', 'r') as json_file:
        self_problem_dict = json.load(json_file)
    print(f'self_problem_dict: {self_problem_dict}')


1501
self_problem_dict: {'class': 'BinPacking.BinPacking', 'actions': [0, 1, 2, 3, 4, 5, 6], 'instances': {'0': 'falkenauer/falk1000-1', '1': 'falkenauer/falk1000-2', '2': 'schoenfield/schoenfieldhard1', '3': 'schoenfield/schoenfieldhard2', '4': '2000/10-30/instance1', '5': '2000/10-30/instance2', '6': 'trip1002/instance1', '7': 'trip2004/instance1', '8': 'testdual4/binpack0', '9': 'testdual7/binpack0'}}


In [11]:
import gym
#import gymnasium as gym
import numpy as np

problem_str = 'BP'
instance_id = 0
seed = random.randint(0, 10000)
problemjson_path = 'hyflex/problems_json'


class HyflexEnv(gym.Env):
    def __init__(self, 
                 problem_str, 
                 instance_id, 
                 seed=seed, 
                 problemjson_path=problemjson_path):
        super(HyflexEnv, self).__init__()
        
        self.problem_str = problem_str
        self.instance_id = instance_id
        self.seed = seed
        self.problemjson_path = problemjson_path
        self.steps = 0
        
        # 加载问题字典
        with open(f'{self.problemjson_path}/{self.problem_str}.json', 'r') as json_file:
            self.problem_dict = json.load(json_file)
        
        self.n_actions = self.problem_dict['actions']
        
        # 定义动作空间和观察空间
        self.action_space = gym.spaces.Discrete(len(self.n_actions))
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(1,))

        
    def reset(self):
        # 重置环境，重新初始化问题实例
        self.problem_instance = domain_dict[self.problem_str](self.instance_id, self.seed)
        self.problem_instance.initialise_solution()
        
        # 更新当前适应度和最佳适应度
        self.current_fitness = self.problem_instance.get_fitness()
        self.best_fitness = self.current_fitness
        
        # 重置步数
        self.steps = 0
        self.done = False
        self._seed = 0
        
        # 返回初始观察值
        return np.array([self.current_fitness])
        
    def step(self, llh):
        # 应用动作并计算新的适应度
        new_fitness = self.problem_instance.apply_heuristic(self.n_actions[llh])
        
        # 计算适应度变化量
        delta = new_fitness - self.current_fitness
        
        #print(delta)
        
        # 更新当前适应度
        self.current_fitness = new_fitness
        
        # 如果新适应度优于最佳适应度，则接受新解
        if new_fitness > self.best_fitness:
            self.best_fitness = new_fitness
            self.problem_instance.accept_solution()
        
        # 判断是否达到终止条件
        if self.steps > 10000:
            self.done = True
        
        self.steps += 1
        if self.steps % 1000 == 0:
            print(f"self.steps:{self.steps}")
        
        # 返回新的观察值、奖励、是否结束和其他信息
        return np.array([self.current_fitness]), delta, self.done, {}
    
    def render(self, mode='human'):
        # 可视化环境
        pass
    
    def close(self):
        # 清理环境
        pass
    



env = HyflexEnv('BP', 0)


In [12]:
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv

if __name__ == "__main__":
    
    # 创建环境
    env = HyflexEnv('BP', 0)
    env = DummyVecEnv([lambda: env])
    
    # 配置DQN模型
    model = DQN("MlpPolicy", env, verbose=1)
    
    # 训练模型
    model.learn(total_timesteps=50000)
    
    # 保存模型
    model.save("hyflex_dqn.zip")
    
    # 加载模型
    model = DQN.load("hyflex_dqn.zip", env=env)
    
    # 评估模型
    total_reward = 0
    obs = env.reset()
    done = False
    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        total_reward += reward
    
    print(f"Step reward: {reward}, Total reward: {total_reward}")
    print(f"Current Fitness: {obs}")
    print("END")
    
    # 关闭环境
    env.close()



Using cuda device
self.steps:1000
self.steps:2000
self.steps:3000
self.steps:4000
self.steps:5000
self.steps:6000
self.steps:7000
self.steps:8000
self.steps:9000
self.steps:10000
self.steps:1000
self.steps:2000
self.steps:3000
self.steps:4000
self.steps:5000
self.steps:6000
self.steps:7000
self.steps:8000
self.steps:9000
self.steps:10000
self.steps:1000
self.steps:2000
self.steps:3000
self.steps:4000
self.steps:5000
self.steps:6000
self.steps:7000
self.steps:8000
self.steps:9000
self.steps:10000
self.steps:1000
self.steps:2000
self.steps:3000
self.steps:4000
self.steps:5000
self.steps:6000
self.steps:7000
self.steps:8000
self.steps:9000
self.steps:10000
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 357      |
|    time_elapsed     | 111      |
|    total_timesteps  | 40008    |
| train/              |          |
|    learning_rate    |