### A Toy Example

In [4]:
import sys
import os
import base64
import pickle
import zlib
import gym
import numpy as np
import pandas as pd
import torch as th
from torch import nn, tensor
from collections import deque
from gym.spaces import Box, Discrete
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy
from stable_baselines3.common import results_plotter
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env.base_vec_env import VecEnv
# from stable_baselines3.common.policies import BasePolicy, register_policy
import time
from datetime import date
from matplotlib import pyplot as plt
from stable_baselines3 import DQN
import torch
%matplotlib inline

In [5]:
import gymnasium as gym
from gymnasium import spaces

# import gym
# from gym import spaces
import numpy as np

class CustomEnv(gym.Env):
    def __init__(self):
        super(CustomEnv, self).__init__()
        # Define action and observation space
        self.action_space = spaces.Discrete(2)  # Two discrete actions (0 and 1)
        self.observation_space = spaces.Box(low=0, high=1, shape=(3,))  # State space with three continuous variables

        # Initialize state
        self.state = np.array([0.5, 0.5, 0.5])

    def step(self, action):
        # Execute action and update state
        reward = self._calculate_reward(action)
        self.state = self._update_state()

        # Check if episode is done
        done = False  # Define your termination conditions

        return self.state, reward, done, None,  {}

    def reset(self, seed=None):
        # Reset the environment to initial state
        self.state = np.array([0.5, 0.5, 0.5])
        return self.state, None

    def render(self, mode='human'):
        # Define how to visualize the environment
        pass

    def _calculate_reward(self, action):
        # Define your reward function based on the action and current state
        # For example:
        reward = np.sum(self.state) * action
        return reward

    def _update_state(self):
        # Update the state based on the current state and action
        new_state = self.state + np.random.uniform(-0.1, 0.1, size=self.state.shape)
        return np.clip(new_state, 0, 1)  # Clip state within the valid range


In [6]:
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
import numpy as np

# Create a function to instantiate your custom environment
def create_custom_env():
    return CustomEnv()  # Instantiate your custom environment

# Create a vectorized environment
# env = make_vec_env(create_custom_env, n_envs=4)

In [7]:
from typing import Callable
from stable_baselines3.common.utils import set_random_seed
def make_env(seed: int = 42) -> Callable:
    """
    Utility function for multiprocessed env.
    
    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environment you wish to have in subprocesses
    :param seed: (int) the inital seed for RNG
    :param rank: (int) index of the subprocess
    :return: (Callable)
    """
    def _init() -> gym.Env:
        env = create_custom_env()
        env = Monitor(env, allow_early_resets=True)
        return env
    set_random_seed(seed)
    return _init

In [8]:
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
train_env = SubprocVecEnv([make_env() for i in range(4)])

In [9]:
# Initialize and train the DQN model
model = DQN("MlpPolicy", train_env, verbose=1)

Using cuda device


In [10]:
model.learn(total_timesteps=1000, log_interval=8)

<stable_baselines3.dqn.dqn.DQN at 0x7f51d4295700>

In [11]:
test_env = create_custom_env()
test_env.reset()

(array([0.5, 0.5, 0.5]), None)

In [12]:
# # Evaluate the trained model
# mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=1)

# Use the trained model to interact with the environment
# test_env = make_vec_env(create_custom_env, num_envs=1)
test_env = create_custom_env()
obs = test_env.reset()[0]
# print(f"obs: {obs}; shape: {obs.shape}")
for _ in range(1000):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, _, info = test_env.step(action)
    print(f"obs: {obs}, reward: {reward}, done: {done}")
    if done:
        obs = test_env.reset()

obs: [0.47490802 0.59014286 0.54639879], reward: 0.0, done: False
obs: [0.49463972 0.52134659 0.47759769], reward: 0.0, done: False
obs: [0.40625644 0.59458182 0.49782069], reward: 0.0, done: False
obs: [0.44787096 0.49869872 0.59180267], reward: 0.0, done: False
obs: [0.51435949 0.44116654 0.52816766], reward: 0.0, done: False
obs: [0.45104039 0.40201499 0.53311894], reward: 0.0, done: False
obs: [0.43742939 0.36026082 0.55548952], reward: 0.0, done: False
obs: [0.36532816 0.31868975 0.52876189], reward: 1.3531797325356179, done: False
obs: [0.35654216 0.37572494 0.46869665], reward: 1.2127798030318084, done: False
obs: [0.35938905 0.39420785 0.37798673], reward: 0.0, done: False
obs: [0.38089802 0.32831268 0.29099705], reward: 0.0, done: False
obs: [0.47067513 0.42143908 0.35267652], reward: 0.0, done: False
obs: [0.43159788 0.34097351 0.38952313], reward: 0.0, done: False
obs: [0.41962838 0.26538115 0.38855851], reward: 1.1620945119266755, done: False
obs: [0.32650608 0.34724523 0.3

### DQN on a gym env

In [3]:
import gymnasium as gym

from stable_baselines3 import DQN

env = gym.make("CartPole-v1")

model = DQN("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=10000, log_interval=4)
# model.save("dqn_cartpole")

# del model # remove to demonstrate saving and loading

# model = DQN.load("dqn_cartpole")

# obs, info = env.reset()
# while True:
#     action, _states = model.predict(obs, deterministic=True)
#     obs, reward, terminated, truncated, info = env.step(action)
#     if terminated or truncated:
#         obs, info = env.reset()

self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env_idx: 0
self.num_envs: 1
env

<stable_baselines3.dqn.dqn.DQN at 0x7f51dc0385b0>