In [None]:
# Parallel (subprocess-based) vectorized environment wrapper from Stable-Baselines3
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3 import DQN
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback
import time


import sys
import os
sys.path.append(os.getcwd())
from gym_env_wrapper import create_env #包括 HyFlexEnv以及Reward奖励设计等


# Directory where the gym environment saves its result files
gym_env_save_path = "/home/chaofan/Documents/pyhyflex/hhrl/results/"
# Path of the configuration file to load
gym_env_config_path = "/home/chaofan/Documents/pyhyflex/hhrl/configs/fir_discrete.ini"
# Fixed random seed
gym_env_seed = 7  # the seed is pinned to 7
# Resolution (dots per inch) used when saving figures
savefig_dpi = 200


# Number of CPUs to use
Num_CPU = 8
# Type of problem to solve
Problem_Type = "BP"
# Instance index within the chosen problem type
Instance_ID = 0
# Step limit for each episode
Iteration_Limit = 2_000
# Total number of training timesteps
TotalTimestep = 1_000_000



# Print elapsed time as timesteps accumulate and plot the rewards at the end
class ProgressCallback(BaseCallback):
    """Report training progress and plot the episode-reward trend.

    While training runs, a progress line with the elapsed wall-clock time is
    printed roughly every ``log_interval`` timesteps. When training ends, the
    rewards of all recorded episodes are plotted, saved to ``plot_path`` and
    shown.

    Args:
        verbose: Verbosity level forwarded to ``BaseCallback``.
        log_interval: Number of timesteps between two progress lines.
        plot_path: File the reward plot is written to.

    Raises:
        ValueError: If ``log_interval`` is not positive.
    """

    def __init__(self, verbose=0, log_interval=5000, plot_path='reward_trend_50k.png'):
        super().__init__(verbose)
        if log_interval <= 0:
            raise ValueError("log_interval must be a positive integer")
        self.log_interval = log_interval
        self.plot_path = plot_path
        self.start_time = None
        self.episode_rewards = []
        # Timestep count at which the next progress line is due.
        self._next_log_at = log_interval

    def _on_training_start(self):
        """Start the wall-clock timer and schedule the first progress line."""
        self.start_time = time.time()
        # Schedule relative to the current counter so a resumed run does not
        # immediately print a progress line.
        self._next_log_at = (
            self.num_timesteps // self.log_interval + 1
        ) * self.log_interval
        print("Training started!")

    def _on_step(self):
        """Record finished-episode rewards and print progress when due.

        Returns:
            Always ``True`` so that training continues.
        """
        # An 'episode' entry in an info dict carries the episode reward under
        # 'r' (presumably written by the Monitor wrapper when an episode ends).
        for info in self.locals.get('infos') or ():
            episode = info.get('episode')
            if episode is not None:
                self.episode_rewards.append(episode['r'])

        # num_timesteps may advance by more than one per call (e.g. with
        # several parallel environments), so an exact "% interval == 0" test
        # can skip boundaries. Compare against a threshold instead.
        if self.num_timesteps >= self._next_log_at:
            elapsed_time = time.time() - self.start_time
            print(f"Step: {self.num_timesteps}, Elapsed Time: {elapsed_time:.2f}s")
            self._next_log_at = (
                self.num_timesteps // self.log_interval + 1
            ) * self.log_interval
        return True

    def _on_training_end(self):
        """Print the total training time and plot the recorded rewards."""
        elapsed_time = time.time() - self.start_time
        print(f"Training ended! Total time: {elapsed_time:.2f}s")

        # Plot rewards only if there are any
        if not self.episode_rewards:
            print("No episode rewards recorded.")
            return

        # Use a dedicated figure so the curve is never drawn on top of a
        # figure left over from earlier plotting, and release it afterwards.
        fig, ax = plt.subplots()
        ax.plot(np.arange(len(self.episode_rewards)), self.episode_rewards)
        ax.set_xlabel('Episode')
        ax.set_ylabel('Reward')
        ax.set_title('Reward Trend')
        # Save the chart (relative paths resolve against the current directory)
        fig.savefig(self.plot_path, dpi=savefig_dpi)
        plt.show()
        plt.close(fig)



# Wrap the environment with Monitor and expose it as a factory so it can be run
# in parallel (the settings must be captured inside the function)
def make_env(problem, instance_id, run_id, iteration_limit, overwrite):
    """Return a zero-argument factory that builds one Monitor-wrapped environment.

    Args:
        problem: Problem type passed to ``create_env``.
        instance_id: Instance index passed to ``create_env``.
        run_id: Run identifier passed to ``create_env``.
        iteration_limit: Per-episode step limit passed to ``create_env``.
        overwrite: Overwrite flag passed to ``create_env``.

    Returns:
        A callable taking no arguments that creates the environment and wraps
        it in ``Monitor``.
    """
    # Capture the module-level settings when the factory is created.
    seed = gym_env_seed
    config_file = gym_env_config_path
    output_dir = gym_env_save_path

    def _init():
        base_env = create_env(
            problem,
            instance_id,
            seed,
            run_id,
            iteration_limit,
            config_file,
            output_dir,
            overwrite,
        )
        # Wrap the environment with Monitor before handing it out
        return Monitor(base_env)

    return _init



if __name__ == "__main__":

    # Create the parallel environments: one per CPU, each with its own run id
    env = SubprocVecEnv(
        [
            make_env(Problem_Type, Instance_ID, run_id, Iteration_Limit, True)
            for run_id in range(Num_CPU)
        ]
    )

    # try/finally ensures the vectorized environment is closed even when
    # model creation, training or saving fails or is interrupted.
    try:
        # Create the model
        model = DQN('MlpPolicy', env, verbose=1)

        # Create the progress callback
        progress_callback = ProgressCallback(verbose=1)

        # Train, passing the callback to the learn method
        model.learn(total_timesteps=TotalTimestep, callback=progress_callback)

        # Save the model
        model.save("dqn_hyflex")
    finally:
        # Close the environments
        env.close()


Using cpu device
Training started!
Step: 5000, Elapsed Time: 17.63s
Step: 10000, Elapsed Time: 34.40s
Step: 15000, Elapsed Time: 50.92s
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2e+03    |
|    ep_rew_mean      | -0.66    |
|    exploration_rate | 0.848    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 294      |
|    time_elapsed     | 54       |
|    total_timesteps  | 16000    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000252 |
|    n_updates        | 496      |
----------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 2e+03    |
|    ep_rew_mean     | -0.66    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 294      |
|    time_elapsed    | 54       |
|    total_timesteps | 16000    |
---------------------------------