### Gymnasium
- API for single agent RL environments
- contains common environments (MDPs)
- 5 functions: 
    - make: creates the environment, inbuilt or custom
    - reset: resets the game to the initial state. (e.g. Chess at the start)
    - step: given the state, playing an action, observing the reward and transitioning to new state
        - begins from initial state, until termination / truncation i.e game completion
        - then reset() moves back to initial state
    - render: how env should be visualized
    - close: delete memory
- Environment objects: expected input and output
    - action spaces: 
        - types: Box (n-d continous, bounded), Discrete (0,1...N), Dict, Tuple, MultiBinary, Multidiscrete 
        - .sample() gets you a random action
    - observation space (observed state)
        - types: Box (n-d continous, bounded), Discrete (0,1...N), Dict, Tuple, MultiBinary, Multidiscrete 
    - reward_range
- wrappers: modifies state. 
    - FlattenObservation, TimeLimit (to steps), ClipAction, RescaleAction, TimeAwareObs (ensure Markov)

### Vectorized

In [None]:
import gymnasium as gym

from stable_baselines3 import PPO
env = gym.make("CartPole-v1")
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)
vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    vec_env.render()
    # VecEnv resets automatically
    # if done:
    #   obs = env.reset()

env.close()

#### Lunar Lander

In [None]:
import gymnasium as gym
env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset(seed=42)
for _ in range(1000):
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        observation, info = env.reset()
env.close()

#### Cartpole

In [None]:
import gymnasium as gym
from stable_baselines3 import A2C
env = gym.make("CartPole-v1", render_mode="human")
model = A2C("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10_000)
vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    vec_env.render("human")
    # VecEnv resets automatically
    # if done:
    #   obs = vec_env.reset()

#### Mountain Car

In [5]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env
import os
import time

In [6]:
# Saving logs to visulise in Tensorboard, saving models
models_dir = f"models/Mountain-{time.time()}"
logdir = f"logs/Mountain-{time.time()}"

if not os.path.exists(models_dir):
    os.makedirs(models_dir)

if not os.path.exists(logdir):
    os.makedirs(logdir)

In [10]:
# Parallel environments
env = make_vec_env("MountainCarContinuous-v0", n_envs=1)

# The learning agent and hyperparameters
model = PPO(
    policy=MlpPolicy,
    env=env,
    seed=0,
    batch_size=256,
    ent_coef=0.00429,
    learning_rate=7.77e-05,
    n_epochs=10,
    n_steps=8,
    gae_lambda=0.9,
    gamma=0.9999,
    clip_range=0.1,
    max_grad_norm =5,
    vf_coef=0.19,
    use_sde=True,
    policy_kwargs=dict(log_std_init=-3.29, ortho_init=False),
    verbose=1,
    tensorboard_log=logdir
    )


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8 and n_envs=1)


In [11]:
#Training and saving models along the way
TIMESTEPS = 20000
for i in range(10): 
    model.learn(total_timesteps=TIMESTEPS,reset_num_timesteps=False, tb_log_name="PPO")
    model.save(f"{models_dir}/{TIMESTEPS*i}")

Logging to logs/Mountain-1694368241.423321/PPO_0
----------------------------
| time/              |     |
|    fps             | 385 |
|    iterations      | 1   |
|    time_elapsed    | 0   |
|    total_timesteps | 8   |
----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 145          |
|    iterations           | 2            |
|    time_elapsed         | 0            |
|    total_timesteps      | 16           |
| train/                  |              |
|    approx_kl            | 7.599592e-07 |
|    clip_fraction        | 0.2          |
|    clip_range           | 0.1          |
|    entropy_loss         | 1.38         |
|    explained_variance   | -0.158       |
|    learning_rate        | 7.77e-05     |
|    loss                 | 0.00516      |
|    n_updates            | 10           |
|    policy_gradient_loss | 5.52e-05     |
|    std                  | 0.0373       |
|    value_loss  

-----------------------------------------
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 12          |
|    time_elapsed         | 0           |
|    total_timesteps      | 96          |
| train/                  |             |
|    approx_kl            | 0.005145505 |
|    clip_fraction        | 0.1         |
|    clip_range           | 0.1         |
|    entropy_loss         | 1.4         |
|    explained_variance   | -11.8       |
|    learning_rate        | 7.77e-05    |
|    loss                 | 0.00751     |
|    n_updates            | 110         |
|    policy_gradient_loss | 0.000172    |
|    std                  | 0.0373      |
|    value_loss           | 2.34e-06    |
-----------------------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 123           |
|    iterations           | 13            |
|    time_elapsed         

-----------------------------------------
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 22          |
|    time_elapsed         | 1           |
|    total_timesteps      | 176         |
| train/                  |             |
|    approx_kl            | 0.003798671 |
|    clip_fraction        | 0.2         |
|    clip_range           | 0.1         |
|    entropy_loss         | 1.42        |
|    explained_variance   | -0.0832     |
|    learning_rate        | 7.77e-05    |
|    loss                 | 0.00569     |
|    n_updates            | 210         |
|    policy_gradient_loss | 0.00109     |
|    std                  | 0.0374      |
|    value_loss           | 3.11e-05    |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 111         |
|    iterations           | 23          |
|    time_elapsed         | 1     

------------------------------------------
| time/                   |              |
|    fps                  | 107          |
|    iterations           | 33           |
|    time_elapsed         | 2            |
|    total_timesteps      | 264          |
| train/                  |              |
|    approx_kl            | 0.0018309355 |
|    clip_fraction        | 0            |
|    clip_range           | 0.1          |
|    entropy_loss         | 1.39         |
|    explained_variance   | 0.0998       |
|    learning_rate        | 7.77e-05     |
|    loss                 | -0.00139     |
|    n_updates            | 320          |
|    policy_gradient_loss | -0.00287     |
|    std                  | 0.0374       |
|    value_loss           | 7.42e-05     |
------------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 106          |
|    iterations           | 34           |
|    time_e

------------------------------------------
| time/                   |              |
|    fps                  | 98           |
|    iterations           | 44           |
|    time_elapsed         | 3            |
|    total_timesteps      | 352          |
| train/                  |              |
|    approx_kl            | 0.0035120696 |
|    clip_fraction        | 0            |
|    clip_range           | 0.1          |
|    entropy_loss         | 1.36         |
|    explained_variance   | 0.0176       |
|    learning_rate        | 7.77e-05     |
|    loss                 | 0.00553      |
|    n_updates            | 430          |
|    policy_gradient_loss | -0.000123    |
|    std                  | 0.0375       |
|    value_loss           | 0.000194     |
------------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 98           |
|    iterations           | 45           |
|    time_e

------------------------------------------
| time/                   |              |
|    fps                  | 100          |
|    iterations           | 55           |
|    time_elapsed         | 4            |
|    total_timesteps      | 440          |
| train/                  |              |
|    approx_kl            | 0.0015937537 |
|    clip_fraction        | 0            |
|    clip_range           | 0.1          |
|    entropy_loss         | 1.4          |
|    explained_variance   | 0.87         |
|    learning_rate        | 7.77e-05     |
|    loss                 | 0.0047       |
|    n_updates            | 540          |
|    policy_gradient_loss | 0.000167     |
|    std                  | 0.0376       |
|    value_loss           | 1.73e-06     |
------------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 100          |
|    iterations           | 56           |
|    time_e

-------------------------------------------
| time/                   |               |
|    fps                  | 100           |
|    iterations           | 66            |
|    time_elapsed         | 5             |
|    total_timesteps      | 528           |
| train/                  |               |
|    approx_kl            | 0.00021889806 |
|    clip_fraction        | 0             |
|    clip_range           | 0.1           |
|    entropy_loss         | 1.43          |
|    explained_variance   | -7.38         |
|    learning_rate        | 7.77e-05      |
|    loss                 | 0.00554       |
|    n_updates            | 650           |
|    policy_gradient_loss | -0.000272     |
|    std                  | 0.0376        |
|    value_loss           | 2.44e-05      |
-------------------------------------------


KeyboardInterrupt: 

In [7]:
import gym
import random
from keras import Sequential
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
import matplotlib.pyplot as plt
from keras.activations import relu, linear

import numpy as np
env = gym.make('MountainCar-v0')
#env.seed(110)
np.random.seed(10)


class DQN:

    """ Implementation of deep q learning algorithm """

    def __init__(self, action_space, state_space):
        self.action_space = action_space
        self.state_space = state_space
        self.epsilon = 1.0
        self.gamma = .95
        self.batch_size = 64
        self.epsilon_min = .01
        self.lr = 0.001
        self.epsilon_decay = .995
        self.memory = deque(maxlen=100000)
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(20, input_dim=self.state_space, activation=relu))
        model.add(Dense(25, activation=relu))
        model.add(Dense(self.action_space, activation=linear))
        model.compile(loss='mse', optimizer=Adam(lr=self.lr))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self):
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)
        states = np.array([i[0] for i in minibatch])
        actions = np.array([i[1] for i in minibatch])
        rewards = np.array([i[2] for i in minibatch])
        next_states = np.array([i[3] for i in minibatch])
        dones = np.array([i[4] for i in minibatch])
        states = np.squeeze(states)
        next_states = np.squeeze(next_states)
        targets = rewards + self.gamma*(np.amax(self.model.predict_on_batch(next_states), axis=1))*(1-dones)
        targets_full = self.model.predict_on_batch(states)
        ind = np.array([i for i in range(self.batch_size)])
        targets_full[[ind], [actions]] = targets
        self.model.fit(states, targets_full, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


def get_reward(state):
    if state[0] >= 0.5:
        print("Car has reached the goal")
        return 10
    if state[0] > -0.4:
        return (1+state[0])**2
    return 0


def train_dqn(episode):
    loss = []
    agent = DQN(env.action_space.n, env.observation_space.shape[0])
    for e in range(episode):
        state = env.reset()
        state = np.reshape(state, (1, 2))
        score = 0
        max_steps = 1000
        for i in range(max_steps):
            action = agent.act(state)
            env.render()
            #print(env.step(action))
            next_state, reward, done, _, _ = env.step(action)
            reward = get_reward(next_state)
            score += reward
            next_state = np.reshape(next_state, (1, 2))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            agent.replay()
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break
        loss.append(score)
    return loss


def random_policy(episode, step):
    for i_episode in range(episode):
        env.reset()
        for t in range(step):
            env.render()
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
            print("Starting next episode")


if __name__ == '__main__':
    print(env.observation_space)
    print(env.action_space)
    episodes = 60
    loss = train_dqn(episodes)
    plt.plot([i+1 for i in range(episodes)], loss)
    plt.show()

Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
Discrete(3)


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [5]:
import os
import gym
import torch
from tensorboardX import SummaryWriter
from easydict import EasyDict

from ding.config import compile_config
from ding.worker import BaseLearner, SampleSerialCollector, InteractionSerialEvaluator, AdvancedReplayBuffer
from ding.envs import BaseEnvManager, DingEnvWrapper
from ding.policy import DQNPolicy
from ding.model import DQN
from ding.utils import set_pkg_seed
from ding.rl_utils import get_epsilon_greedy_fn
from dizoo.box2d.lunarlander.config.lunarlander_dqn_config import main_config, create_config

# Get DI-engine form env class
def wrapped_cartpole_env():
    return DingEnvWrapper(
        gym.make(main_config['env']['env_id']),
        EasyDict(env_wrapper='default'),
    )


def main(cfg, seed=0):
    cfg['exp_name'] = 'lunarlander_dqn_eval'
    cfg = compile_config(
        cfg,
        BaseEnvManager,
        DQNPolicy,
        BaseLearner,
        SampleSerialCollector,
        InteractionSerialEvaluator,
        AdvancedReplayBuffer,
        save_cfg=True
    )
    cfg.policy.load_path = './final.pth.tar'

    # build multiple environments and use env_manager to manage them
    evaluator_env_num = cfg.env.evaluator_env_num
    evaluator_env = BaseEnvManager(env_fn=[wrapped_cartpole_env for _ in range(evaluator_env_num)], cfg=cfg.env.manager)

    # switch save replay interface
    # evaluator_env.enable_save_replay(cfg.env.replay_path)
    evaluator_env.enable_save_replay(replay_path='./lunarlander_dqn_eval/video')

    # Set random seed for all package and instance
    evaluator_env.seed(seed, dynamic_seed=False)
    set_pkg_seed(seed, use_cuda=cfg.policy.cuda)

    # Set up RL Policy
    model = DQN(**cfg.policy.model)
    policy = DQNPolicy(cfg.policy, model=model)
    policy.eval_mode.load_state_dict(torch.load(cfg.policy.load_path, map_location='cpu'))

    # Evaluate
    tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial'))
    evaluator = InteractionSerialEvaluator(
        cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name
    )
    evaluator.eval()

if __name__ == "__main__":
    main(main_config)

  def __init__(
  def __init__(
  def __init__(
  def __init__(
  def __init__(
  def __init__(
  def __init__(


RuntimeError: Env 0 step has exceeded max retries(1), and the latest exception is: too many values to unpack (expected 4)