In [None]:
from ray.tune.registry import register_env
from glucose_env import CustomGlucoseDynamicsEnv
import ray
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.algorithms.dqn import DQNConfig
from ray.rllib.algorithms.impala import ImpalaConfig

def env_creator(env_config):
    """Build a fresh ``CustomGlucoseDynamicsEnv`` for the environment registry.

    Args:
        env_config: Accepted so the function matches the creator-callback
            signature; it is not forwarded, and the environment is always
            constructed with no arguments.

    Returns:
        A newly constructed ``CustomGlucoseDynamicsEnv`` instance.
    """
    glucose_env = CustomGlucoseDynamicsEnv()
    return glucose_env

# Register the environment under a string id so the algorithm configs can refer
# to it by name through `.environment(env_name)`.
env_name = "CustomGlucoseDynamicsEnv-v0"
register_env(env_name, env_creator)

# `ignore_reinit_error=True` makes re-running this cell in a live kernel a no-op
# instead of raising because Ray has already been initialised.
ray.init(ignore_reinit_error=True)

# Number of `train()` calls made per algorithm. The algorithm configs also reuse
# this value as `evaluation_duration`.
num_of_iter = 8

print("--- Starting Combined RL Training ---")

# PPO Training
print("\n--- Training PPO (Default Hyperparameters) ---")
# NOTE(review): `train_batch_size_per_learner` / `num_epochs` are new-API-stack
# settings, while `resources(num_gpus=...)` is the older way to request a GPU.
# On recent RLlib the learner GPU is requested through
# `.learners(num_gpus_per_learner=...)` — confirm against the installed version.
ppo_config = (
    PPOConfig()
    .environment(env_name)
    .resources(num_gpus=1)
    .framework("torch")
    .env_runners(num_env_runners=2)
    .training(
        lr=0.00005,
        train_batch_size_per_learner=4000,
        num_epochs=30,
        entropy_coeff=0.0,
        gamma=0.99,
    )
    .evaluation(
        # Evaluate after every training iteration.
        evaluation_interval=1,
        # Reuses the training-iteration count as the evaluation length
        # (presumably counted in episodes — confirm the configured unit).
        evaluation_duration=num_of_iter,
    )
)
ppo_algo = ppo_config.build()
try:
    for i in range(num_of_iter):
        result = ppo_algo.train()
        print(f"PPO Iteration {i} done")
finally:
    # Always release the algorithm's workers, even when training fails or the
    # cell is interrupted (e.g. KeyboardInterrupt); otherwise they keep holding
    # their resources for the following cells.
    ppo_algo.stop()


KeyboardInterrupt: 

In [None]:
# DQN (Double Q-learning) Training
# TODO: the author flagged this cell's settings as needing to be changed.
print("\n--- Training DQN (Default Hyperparameters) ---")
dqn_config = (
    DQNConfig()
    .environment(env_name)
    .resources(num_gpus=1)
    .framework("torch")
    .env_runners(num_env_runners=2)
    .training(
        lr=0.00005,
        train_batch_size=30,
        double_q=True,
    )
    .evaluation(
        # Evaluate after every training iteration.
        evaluation_interval=1,
        # Reuses the training-iteration count as the evaluation length
        # (presumably counted in episodes — confirm the configured unit).
        evaluation_duration=num_of_iter,
    )
)
dqn_algo = dqn_config.build()
try:
    for i in range(num_of_iter):
        result = dqn_algo.train()
        print(f"DQN Iteration {i} done")
finally:
    # Always release the algorithm's workers, even when training fails or the
    # cell is interrupted; otherwise they keep holding their resources.
    dqn_algo.stop()

In [None]:
# IMPALA Training
print("\n--- Training IMPALA (Default Hyperparameters) ---")
impala_config = (
    ImpalaConfig()
    .environment(env_name)
    .resources(num_gpus=1)
    # The learner count is configured through `.learners(num_learners=...)`;
    # passing `num_learner_workers` to `.resources()` is the deprecated spelling
    # of the same setting.
    .learners(num_learners=1)
    .framework("torch")
    .env_runners(num_env_runners=2)
    .evaluation(
        # Evaluate after every training iteration.
        evaluation_interval=1,
        # Reuses the training-iteration count as the evaluation length
        # (presumably counted in episodes — confirm the configured unit).
        evaluation_duration=num_of_iter,
    )
)
impala_algo = impala_config.build()
try:
    for i in range(num_of_iter):
        result = impala_algo.train()
        print(f"IMPALA Iteration {i} done")
finally:
    # Always release the algorithm's workers, even when training fails or the
    # cell is interrupted; otherwise they keep holding their resources.
    impala_algo.stop()

In [None]:
# Recurrent PPO Training
print("\n--- Training Recurrent PPO (Default Hyperparameters) ---")
# NOTE(review): the LSTM is requested through the `model={...}` dict while the
# other training arguments (`train_batch_size_per_learner`, `num_epochs`) are
# new-API-stack settings. On recent RLlib the model options are expected via
# `.rl_module(model_config=...)`, so verify that the LSTM is actually in use
# with the installed version.
recurrent_ppo_config = (
    PPOConfig()
    .environment(env_name)
    .resources(num_gpus=1)
    .framework("torch")
    .env_runners(num_env_runners=2)
    .training(
        lr=0.00005,
        train_batch_size_per_learner=4000,
        num_epochs=30,
        entropy_coeff=0.0,
        gamma=0.99,
        model={
            "use_lstm": True,
            "lstm_cell_size": 256,
            "max_seq_len": 20,
        },
    )
    .evaluation(
        # Evaluate after every training iteration.
        evaluation_interval=1,
        # Reuses the training-iteration count as the evaluation length
        # (presumably counted in episodes — confirm the configured unit).
        evaluation_duration=num_of_iter,
    )
)
recurrent_ppo_algo = recurrent_ppo_config.build()
try:
    for i in range(num_of_iter):
        result = recurrent_ppo_algo.train()
        print(f"Recurrent PPO Iteration {i} done")
finally:
    # Always release the algorithm's workers, even when training fails or the
    # cell is interrupted; otherwise they keep holding their resources.
    recurrent_ppo_algo.stop()

# Reached only when training finished without error, so an interrupted run
# leaves Ray up and this cell can simply be re-executed.
ray.shutdown()
print("\n--- All training complete ---")