In [1]:
import jsbsim
import sys
import gymnasium as gym
sys.modules["gym"] = gym
import jsbgym
import os
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

In [2]:
# Environment id spelled out once so it can be reused or swapped in one place.
# Per the id string: turn-heading control task, Cessna 172P, STANDARD reward
# shaping, "NoFG" variant (presumably no FlightGear visualisation — TODO confirm).
ENV_ID = "JSBSim-TurnHeadingControlTask-Cessna172P-Shaping.STANDARD-NoFG-v0"
env = gym.make(ENV_ID)

In [3]:
# Reset the environment; the returned (observation, info) tuple is shown as
# this cell's output because the call is the cell's last expression.
env.reset()

(array([ 5.00000000e+03,  1.07084362e-16, -1.06330882e-16,  2.02536000e+02,
         3.55271368e-15,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00, -3.72529030e-09,  9.43703968e-16,  2.25220484e+01,
         2.99000000e+02]),
 {})

In [4]:
# Baseline: run five episodes with uniformly sampled random actions and print
# the summed (undiscounted) reward of each episode.
for episode in range(1, 6):
    obs, info = env.reset()
    done = False
    total_reward = 0
    while not done:
        # Gymnasium's step() reports `terminated` and `truncated` separately;
        # the episode is over as soon as either flag is set. Ignoring
        # `truncated` would keep stepping an environment that already ended.
        obs, reward, terminated, truncated, info = env.step(
            env.action_space.sample()
        )
        done = terminated or truncated
        total_reward += reward
    print("Total Reward for episode {} is {}".format(episode, total_reward))

Total Reward for episode 1 is 19.75422110790321
Total Reward for episode 2 is 39.387855508833574
Total Reward for episode 3 is 37.840249550576715
Total Reward for episode 4 is 25.846688918784846
Total Reward for episode 5 is 45.14785127328967


In [5]:
class TrainAndLoggingCallback(BaseCallback):
    """Periodically checkpoint the model that is being trained.

    Every ``check_freq`` callback invocations the current model is written to
    ``<save_path>/best_model_<n_calls>``. Despite the ``best_model`` prefix no
    evaluation is performed: each checkpoint is saved unconditionally.

    Args:
        check_freq: Number of ``_on_step`` calls between two checkpoints.
        save_path: Directory that receives the checkpoints. ``None`` disables
            checkpointing entirely.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if a save path was configured."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint on every ``check_freq``-th call.

        Returns:
            Always ``True``. Under the Stable-Baselines3 callback contract a
            ``False`` return value would abort training.
        """
        # Mirror _init_callback: with no save path there is nowhere to write,
        # so skip saving instead of failing inside os.path.join(None, ...).
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(
                self.save_path, "best_model_{}".format(self.n_calls)
            )
            self.model.save(model_path)

        return True

In [6]:
# Output locations, both relative to the notebook's working directory:
#   CHECKPOINT_DIR - handed to the checkpoint callback as its save path
#   LOG_DIR        - handed to PPO as its TensorBoard log directory
CHECKPOINT_DIR, LOG_DIR = "./train/", "./logs/"

In [7]:
# Checkpoint cadence, counted in callback invocations.
SAVE_EVERY_N_CALLS = 1000

callback = TrainAndLoggingCallback(
    check_freq=SAVE_EVERY_N_CALLS,
    save_path=CHECKPOINT_DIR,
)

In [8]:
# PPO agent with the built-in MLP policy, bound to the environment created
# above; LOG_DIR is passed as the TensorBoard log location.
model = PPO(
    policy="MlpPolicy",
    env=env,
    tensorboard_log=LOG_DIR,
)

In [9]:
# Training budget in environment steps; the callback writes periodic
# checkpoints while learn() runs, and the final model is saved afterwards.
TOTAL_TIMESTEPS = 10_000_000

model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=callback)
model.save("JSBSim_10000000_steps")

In [10]:
# Callback step count of the checkpoint to evaluate. The file name follows the
# "best_model_<n_calls>" pattern written by TrainAndLoggingCallback, and the
# directory is the same CHECKPOINT_DIR the callback saved into, so the two
# cannot drift apart.
# NOTE: this rebinds `model`, replacing the in-memory agent trained above.
CHECKPOINT_STEP = 2924000
model = PPO.load(
    os.path.join(CHECKPOINT_DIR, "best_model_{}".format(CHECKPOINT_STEP))
)

In [11]:
# Evaluation: run five episodes with the loaded policy choosing the actions
# and print the summed (undiscounted) reward of each episode.
for episode in range(1, 6):
    obs, _ = env.reset()
    done = False
    total_reward = 0
    while not done:
        # predict() returns (action, state); the second element is not needed here.
        action, _ = model.predict(obs)
        # Gymnasium's step() reports `terminated` and `truncated` separately;
        # the episode is over as soon as either flag is set. Ignoring
        # `truncated` would keep stepping an environment that already ended.
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        total_reward += reward
    print("Total Reward for episode {} is {}".format(episode, total_reward))

Total Reward for episode 1 is 122.69739363645263
Total Reward for episode 2 is 94.89967540935955
Total Reward for episode 3 is 116.5924219555808
Total Reward for episode 4 is 120.19779394401789
Total Reward for episode 5 is 121.30385169788168


In [12]:
# Close the environment now that the evaluation rollouts are finished.
env.close()