In [5]:
import sys
import os
sys.path.append(os.getcwd())
from gym_env_wrapper import create_env
#import gym
import gymnasium as gym
from stable_baselines3.common.vec_env import SubprocVecEnv
import stable_baselines3 as sb3
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3.common.callbacks import BaseCallback
import time


save_path = "/home/chaofan/Documents/pyhyflex/hhrl/results/"
config_path = '/home/chaofan/Documents/pyhyflex/hhrl/configs/fir_discrete.ini'

class ProgressCallback(BaseCallback):
    def __init__(self, verbose=0):
        super(ProgressCallback, self).__init__(verbose)
        self.start_time = None
        self.episode_rewards = []

    def _on_training_start(self):
        self.start_time = time.time()
        print("Training started!")

    def _on_step(self):
        if len(self.locals['infos']) > 0:
            for info in self.locals['infos']:
                if 'episode' in info.keys():
                    self.episode_rewards.append(info['episode']['r'])
                    
        if self.num_timesteps % 200 == 0:
            elapsed_time = time.time() - self.start_time
            print(f"Step: {self.num_timesteps}, Elapsed Time: {elapsed_time:.2f}s")
        return True

    def _on_training_end(self):
        elapsed_time = time.time() - self.start_time
        print(f"Training ended! Total time: {elapsed_time:.2f}s")
        
        # Plot rewards only if there are any
        if self.episode_rewards:
            plt.plot(np.arange(len(self.episode_rewards)), self.episode_rewards)
            plt.xlabel('Episode')
            plt.ylabel('Reward')
            plt.title('Training Rewards')
            plt.show()
        else:
            print("No episode rewards recorded.")


###make_env###

def make_env(problem, instance_id, run_id, iteration_limit, overwrite):
    seed = 7  # 固定种子为 7
    def _init():
        env = create_env(problem, instance_id, seed, run_id, iteration_limit, config_path, save_path, overwrite)
        return env

    return _init


if __name__ == "__main__":
    num_cpu = 6  # 使用的CPU数量

    # 创建并行环境
    env = SubprocVecEnv([make_env('BP', 0, i, 10000, True) for i in range(num_cpu)])
    
    # 创建并训练模型
    model = sb3.DQN('MlpPolicy', env, verbose=1)
    
    # 创建进度回调
    progress_callback = ProgressCallback(verbose=1)
    
    # 传递回调函数给learn方法
    model.learn(total_timesteps=10000, callback=progress_callback)
    
    # 保存模型
    model.save("dqn_hyflex")
    
    # 关闭环境
    env.close()


Using cpu device
Training started!
Step: 600, Elapsed Time: 6.55s
Step: 1200, Elapsed Time: 13.42s
Step: 1800, Elapsed Time: 21.73s
Step: 2400, Elapsed Time: 30.89s
Step: 3000, Elapsed Time: 41.29s
Step: 3600, Elapsed Time: 53.07s
Step: 4200, Elapsed Time: 65.96s
Step: 4800, Elapsed Time: 81.19s
Step: 5400, Elapsed Time: 97.14s
Step: 6000, Elapsed Time: 112.77s
Step: 6600, Elapsed Time: 128.47s
Step: 7200, Elapsed Time: 144.13s
Step: 7800, Elapsed Time: 159.71s
Step: 8400, Elapsed Time: 175.58s
Step: 9000, Elapsed Time: 191.27s
Step: 9600, Elapsed Time: 207.05s
Training ended! Total time: 217.73s
No episode rewards recorded.
