In [1]:
import json
import gymnasium as gym
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
import time

In [2]:
# Load the scenario definitions from args.json.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform's default text encoding.
with open('args.json', 'r', encoding='utf-8') as f:
    args = json.load(f)
# Each scenario entry is read through its 'id', 'name', 'config' and 'env' keys
# by the training cell.
scenarios = args['scenarios']

In [3]:
# Training budget (environment steps) handed to each model's learn() call.
total_timesteps = 5_000

# IDs of the scenarios to train in this run. Scenarios 2, 3 and 4 are
# currently switched off; add their IDs to the list to include them.
scenarios_to_run = [1]

In [4]:
# Function to replay a trained model in its scenario environment
def create_video(scenario_id: int, model: str, max_steps: int = 1000, fps: float = 20) -> None:
    """Roll out a saved PPO or DQN policy in the scenario's environment.

    The environment is created with ``render_mode='rgb_array'`` and rendered
    once per step. NOTE(review): the value returned by ``render()`` is
    discarded, so this function does not write a video file itself; wrap the
    environment in a recorder if an actual file is needed.

    Args:
        scenario_id: ID used to build both the environment ID
            (``scenario_<id>_env``) and the saved-model path
            (``./trained_models/scenario_<id>_<model>``). The environment must
            already be registered under that ID.
        model: Which saved model to load, ``'ppo'`` or ``'dqn'``.
        max_steps: Total number of environment steps to play, summed over all
            episodes.
        fps: Playback rate; the loop sleeps ``1 / fps`` seconds after each step.

    Raises:
        ValueError: If ``model`` is neither ``'ppo'`` nor ``'dqn'``, or if
            ``fps`` is not positive.
    """
    if model not in ['ppo', 'dqn']:
        raise ValueError("Model must be either 'ppo' or 'dqn'")
    if fps <= 0:
        raise ValueError("fps must be positive")

    # Load the environment
    video_env = gym.make(f"scenario_{scenario_id}_env", render_mode='rgb_array')
    try:
        model_cls = PPO if model == 'ppo' else DQN
        video_model = model_cls.load(f"./trained_models/scenario_{scenario_id}_{model}")

        obs, _ = video_env.reset()
        for _ in range(max_steps):
            action, _ = video_model.predict(obs, deterministic=True)
            obs, _reward, terminated, truncated, _info = video_env.step(action)
            video_env.render()
            time.sleep(1 / fps)  # pace the rollout at roughly `fps` steps per second
            # An episode ends on termination OR truncation (e.g. a time limit);
            # either way the environment must be reset before stepping again.
            if terminated or truncated:
                obs, _ = video_env.reset()
    finally:
        # Always release the environment, even if loading or stepping fails.
        video_env.close()
    

In [5]:
# Train PPO and DQN models for each scenario
def _clear_action_log(path: str = 'action.txt') -> None:
    """Truncate the action log so the next training run starts from an empty file."""
    # NOTE(review): presumably the environment appends to this file — confirm.
    with open(path, 'w'):
        pass


for scenario in scenarios:
    # Only train the scenarios selected in the configuration cell.
    if scenario['id'] not in scenarios_to_run:
        continue

    _clear_action_log()
    scenario_id = scenario['id']
    scenario_name = scenario['name']
    env_config = scenario['config']
    env_id = f"scenario_{scenario_id}_env"

    # Register the environment
    gym.envs.register(
        id=env_id,
        entry_point=scenario['env']['entry_point'],
        kwargs={'config': env_config}
    )

    # render_mode='human' renders while training; switch to 'rgb_array' for
    # headless runs. No manual reset is needed before wrapping: the
    # algorithms reset the environment themselves when learning starts.
    env = Monitor(gym.make(env_id, render_mode='human'))
    try:
        # Create and train PPO model
        ppo_model = PPO("MlpPolicy",
                        env,
                        # Separate actor/critic networks are passed as a dict;
                        # the [dict(...)] list form is deprecated since SB3 1.8.
                        policy_kwargs=dict(net_arch=dict(pi=[256, 256], vf=[256, 256])),
                        n_steps=32 * 12 // 6,
                        batch_size=32,
                        n_epochs=10,
                        learning_rate=5e-4,
                        gamma=0.8,
                        verbose=2,
                        tensorboard_log=f"./logs/scenario_{scenario_id}_ppo/",
                        # 'auto' uses CUDA when available and falls back to CPU
                        # instead of failing on machines without a GPU.
                        device='auto')

        ppo_model.learn(total_timesteps)
        ppo_model.save(f"./trained_models/scenario_{scenario_id}_ppo")

        # Start the DQN run with an empty action log as well.
        _clear_action_log()

        # Create and train DQN model
        dqn_model = DQN("MlpPolicy",
                        env,
                        policy_kwargs=dict(net_arch=[256, 256]),
                        learning_rate=5e-4,
                        buffer_size=15000,
                        learning_starts=200,
                        batch_size=32,
                        gamma=0.8,
                        train_freq=1,
                        gradient_steps=1,
                        target_update_interval=50,
                        verbose=1,
                        tensorboard_log=f"./logs/scenario_{scenario_id}_dqn/",
                        device='auto')

        dqn_model.learn(total_timesteps)
        dqn_model.save(f"./trained_models/scenario_{scenario_id}_dqn")

        # Evaluate the models
        ppo_mean_reward, ppo_std_reward = evaluate_policy(ppo_model, env, n_eval_episodes=10)
        dqn_mean_reward, dqn_std_reward = evaluate_policy(dqn_model, env, n_eval_episodes=10)

        print(f"Scenario {scenario_id} - PPO Average reward: {ppo_mean_reward} ± {ppo_std_reward}")
        print(f"Scenario {scenario_id} - DQN Average reward: {dqn_mean_reward} ± {dqn_std_reward}")
    finally:
        # Close the environment even if training or evaluation fails.
        env.close()

speed ->  25.0
idle % ->  0
speed ->  25.0
idle % ->  0
Using cuda device
Wrapping the env in a DummyVecEnv.
speed ->  25.0
idle % ->  0
Logging to ./logs/scenario_1_dqn/DQN_7
speed ->  25.0
idle % ->  0
reward ->  0.05
speed ->  25.0
idle % ->  0
speed ->  25.0
idle % ->  0
reward ->  0.05
speed ->  25.0
idle % ->  0
speed ->  25.0
idle % ->  0
reward ->  0.05
speed ->  25.0
idle % ->  0
speed ->  25.0
idle % ->  1.0
reward ->  0.05
speed ->  25.0
idle % ->  1.0
speed ->  25.0
idle % ->  1.0
reward ->  0.05
speed ->  25.0
idle % ->  1.0
speed ->  25.0
idle % ->  1.0
reward ->  0.05
speed ->  25.0
idle % ->  1.0
speed ->  25.0
idle % ->  1.0
reward ->  0.05
speed ->  25.0
idle % ->  1.0
speed ->  25.0
idle % ->  1.0
reward ->  0.05
speed ->  25.0
idle % ->  1.0
speed ->  25.0
idle % ->  1.0
reward ->  0.05
speed ->  25.0
idle % ->  1.0
speed ->  25.0
idle % ->  1.0
reward ->  0.05
speed ->  25.0
idle % ->  1.0
speed ->  29.1455588268693
idle % ->  0.75
reward ->  0.15395558826869302
sp

In [6]:
# Replay the PPO model saved for scenario 1.
create_video(scenario_id=1, model='ppo')

"create_video(1, 'ppo')"

In [7]:
# Replay the DQN model saved for scenario 1.
create_video(scenario_id=1, model='dqn')

speed ->  25.0
idle % ->  0
speed ->  25.0
idle % ->  0
speed ->  25.0
idle % ->  0
speed ->  25.0
idle % ->  1.0
reward ->  0.05
speed ->  25.0
idle % ->  1.0
speed ->  25.0
idle % ->  1.0
reward ->  0.05
speed ->  25.0
idle % ->  1.0
speed ->  25.0
idle % ->  1.0
reward ->  0.05
speed ->  25.0
idle % ->  1.0
speed ->  25.0
idle % ->  1.0
reward ->  0.05
speed ->  25.0
idle % ->  1.0
speed ->  25.0
idle % ->  1.0
reward ->  0.05
speed ->  25.0
idle % ->  1.0
speed ->  20.8544411731307
idle % ->  0.8333333333333334
reward ->  0.09187774506464033
speed ->  20.8544411731307
idle % ->  0.8333333333333334
speed ->  20.146013943668194
idle % ->  0.8571428571428571
reward ->  0.09074585372239621
speed ->  20.146013943668194
idle % ->  0.8571428571428571
speed ->  20.024952065064255
idle % ->  0.875
reward ->  0.09399952065064256
speed ->  20.024952065064255
idle % ->  0.875
speed ->  20.004264014349108
idle % ->  0.8888888888888888
reward ->  0.09726486236571329
speed ->  20.004264014349108
