In [1]:
import os
import numpy as np
import gym
import random
import stable_baselines3
from stable_baselines3 import DQN
import matplotlib.pyplot as plt
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common import results_plotter
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback

In [2]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    :param check_freq:
    :param log_dir: Path to the folder where the model will be saved.
      It must contains the file created by the ``Monitor`` wrapper.
    :param verbose: Verbosity level.
    """
    def __init__(self, check_freq: int, log_dir: str, verbose: int = 1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Create folder if needed
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:

          # Retrieve training reward
          x, y = ts2xy(load_results(self.log_dir), 'timesteps')
          if len(x) > 0:
              # Mean training reward over the last 100 episodes
              mean_reward = np.mean(y[-100:])
              if self.verbose > 0:
                print(f"Num timesteps: {self.num_timesteps}")
                print(f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}")

              # New best model, you could save the agent here
              if mean_reward > self.best_mean_reward:
                  self.best_mean_reward = mean_reward
                  # Example for saving best model
                  if self.verbose > 0:
                    print(f"Saving new best model to {self.save_path}")
                  self.model.save(self.save_path)

        return True

In [3]:
#check_env(env,warn= True, skip_render_check=True)

In [4]:
log_dir = "/tmp/gym/"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
env = gym.make("gym_basic:basic-v0")
env = Monitor(env, log_dir)

In [5]:
# generate the model by DQN
model = DQN("MlpPolicy", env, verbose=1)

Using cpu device
Wrapping the env in a DummyVecEnv.


In [6]:
#callback
callback = SaveOnBestTrainingRewardCallback(check_freq=100, log_dir=log_dir)

In [7]:
model.learn(total_timesteps=1000, callback = callback)
# save the model
model.save("dqn_facts")

[3 1 2 2 7 1 7]
[3 2 3 1 2 2 7]
[1 3 3 3 4 2 4]
[ 3  1  1  2  2 10  1]
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1        |
|    ep_rew_mean      | -0.362   |
|    exploration_rate | 0.962    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4        |
|    time_elapsed     | 0        |
|    total_timesteps  | 4        |
----------------------------------
[3 2 1 2 2 3 4]
[ 2  3  3  2  1 10 10]
[2 1 2 1 3 8 4]
[ 3  1  1  2 10  8  1]
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1        |
|    ep_rew_mean      | -0.199   |
|    exploration_rate | 0.924    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3        |
|    time_elapsed     | 2        |
|    total_timesteps  | 8        |
----------------------------------
[3 2 3 3 9 3 5]
[1 1 3 3 1 4 5]
[ 2  2  1  1  2  8 10]
[3 1 2 3 1 4 8]
----------

In [8]:
'''
timesteps = 1000
plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, "gym_basic:basic-v0")
plt.show()
'''

'\ntimesteps = 1000\nplot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, "gym_basic:basic-v0")\nplt.show()\n'

In [9]:

obs = env.reset()

#check how the model runs
for i in range(100):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done :
      obs = env.reset()


[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3

In [10]:

mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward_after:.2f} +/- std_reward:{std_reward_after:.2f}")


[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3 6 7 4]
[1 3 1 3

In [11]:
print(mean_reward_after)

-0.15248


In [12]:
#results_plotter.plot_results([log_dir], 100, results_plotter.X_TIMESTEPS, "facts_gym")
#Monitor.get_episode_rewards(env)

In [13]:
#Monitor.get_episode_times(env)

In [15]:
#simple check
'''
episodes = 1000
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        #env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
'''

"\nepisodes = 1000\nfor episode in range(1, episodes+1):\n    state = env.reset()\n    done = False\n    score = 0 \n    \n    while not done:\n        #env.render()\n        action = env.action_space.sample()\n        n_state, reward, done, info = env.step(action)\n        score+=reward\n    print('Episode:{} Score:{}'.format(episode, score))\n"