## Eigene Umgebung

Es kann eine Klasse definiert werden, welche die Oberklasse gym.Env hat

Diese Klasse benötigt die Methoden für Umgebungen (\_\_init\_\_(), step(), reset(), ...)

Sie benötigt außerdem Action Space und Observation Space

In [14]:
import gymnasium as gym
from gymnasium.spaces import Discrete, Box
import numpy as np

### Discrete

Ein einzelner ganzzahliger Wert von X bis Y (Discrete(n, start=X) umfasst die n Werte X bis X+n-1)

In [24]:
# Discrete space with 3 values starting at -1, i.e. {-1, 0, 1}.
d = Discrete(3, start=-1)

In [27]:
# Draw one random value from the discrete space.
d.sample()

np.int64(0)

### Box

Bereich von X bis Y

Standardmäßig float32, kann angepasst werden

Kann auch mehrere Werte zurückgeben (shape=)

In [21]:
# Box space of shape (10, 3) with int8 entries, lower bound 0 and upper bound 100.
b = Box(0, 100, dtype=np.int8, shape=(10, 3))

In [22]:
# Draw one random array from the box (shape (10, 3), dtype int8).
b.sample()

array([[35, 15, 58],
       [17, 32, 81],
       [ 2, 19, 41],
       [84, 26, 81],
       [58,  4, 97],
       [24, 18, 13],
       [69, 59, 54],
       [48,  1, 52],
       [ 2, 98, 99],
       [ 2, 86, 84]], dtype=int8)

### CarDriver

Soll eine Geschwindigkeit um 50km/h halten

Kann über Actions beschleunigen, bremsen oder nichts tun

In [99]:
class CarDriver(gym.Env):
    """Toy environment in which the agent should hold a speed of about 50 km/h.

    Actions (``Discrete(3)``): 0 = brake, 1 = do nothing, 2 = accelerate.
    Observation (``Box(0, 100, dtype=int8)``): a one-element array holding
    the current speed in km/h.
    Every step whose speed lies in the 45-55 km/h band earns a reward of 1.
    An episode lasts ``end_time`` steps, so ``end_time`` is also the maximum
    achievable score.
    """

    def __init__(self):
        super().__init__()
        self.action_space = Discrete(3)  # 0: brake, 1: do nothing, 2: accelerate
        self.observation_space = Box(0, 100, dtype=np.int8)  # 0-100 km/h possible
        self.state = 50  # starting speed
        self.end_time = 100  # maximum episode length (also the maximum score)
        self.time_now = 0

    def step(self, action):
        """Apply ``action``, add random noise and advance the clock by one step.

        Args:
            action: 0 (brake), 1 (do nothing) or 2 (accelerate).

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``observation`` is a one-element int8 array with the new speed,
            ``reward`` is 1 if the speed was within 45-55 km/h right after the
            action (before noise) and 0 otherwise, and ``info["V"]`` is the
            new speed.
        """
        # Action 0/1/2 maps to a speed change of -3/0/+3 km/h.
        self.state += (action - 1) * 3

        # The reward is judged on the speed after the action but before noise.
        reward = 1 if 45 <= self.state <= 55 else 0

        # Uniform noise in {-2, ..., 2}, drawn from the environment's own RNG
        # so that reset(seed=...) makes episodes reproducible.
        self.state += self.np_random.integers(-2, 3)
        self.time_now += 1

        # Keep the speed inside the observation space bounds.
        self.state = min(max(self.state, 0), 100)

        # NOTE: both flags are raised when the time limit is hit. The loops in
        # this notebook only read the first flag, so it must stay set here.
        time_is_up = self.time_now >= self.end_time
        observation = np.array([self.state], dtype=np.int8)
        return observation, reward, time_is_up, time_is_up, {"V": self.state}

    def reset(self, seed=None, options=None):
        """Start a new episode at 50 km/h and time step 0.

        Args:
            seed: Optional seed for the environment's random generator;
                ``None`` leaves the generator untouched.
            options: Accepted for compatibility with the Gymnasium API; unused.

        Returns:
            ``(observation, info)`` with the initial one-element int8
            observation and an empty info dict.
        """
        # Lets gym.Env (re)seed self.np_random when a seed is given.
        super().reset(seed=seed)
        self.state = 50
        self.time_now = 0
        return np.array([self.state], dtype=np.int8), {}

### check_env

In [100]:
# Instantiate the custom environment.
env = CarDriver()

In [101]:
from stable_baselines3.common.env_checker import check_env

In [102]:
# Check that the environment follows the Gym interface that
# Stable-Baselines3 supports.
check_env(env)

### Eigene Umgebung Testen

In [78]:
# Run a few episodes with a purely random policy as a baseline.
anz = 10  # number of test episodes
for x in range(anz):
    # reset() returns (observation, info); keep only the start observation.
    state, _ = env.reset()
    score = 0  # sum of rewards
    done = False
    velocities = []

    while not done:
        action = env.action_space.sample()
        state, reward, terminated, truncated, info = env.step(action)
        # An episode is over as soon as either end flag is raised.
        done = terminated or truncated
        score += reward
        velocities.append(int(info["V"]))

    print(f"Punkte: {score}, {velocities}")

Punkte: 21, [50, 51, 50, 51, 55, 55, 54, 51, 52, 47, 47, 42, 41, 43, 40, 39, 38, 42, 39, 38, 40, 38, 40, 39, 39, 44, 47, 43, 45, 45, 47, 50, 48, 43, 44, 43, 42, 44, 44, 42, 39, 34, 30, 30, 33, 33, 31, 30, 25, 23, 22, 21, 22, 22, 27, 27, 26, 22, 20, 21, 23, 20, 22, 21, 25, 30, 27, 22, 27, 26, 24, 26, 24, 26, 28, 23, 27, 32, 33, 34, 38, 36, 35, 34, 29, 28, 32, 31, 29, 30, 34, 35, 37, 39, 41, 43, 42, 45, 47, 45]
Punkte: 15, [50, 54, 56, 57, 56, 55, 54, 55, 59, 57, 55, 55, 60, 63, 63, 60, 58, 59, 54, 55, 55, 60, 56, 58, 59, 64, 65, 66, 62, 67, 70, 72, 69, 71, 72, 70, 70, 71, 71, 67, 69, 66, 65, 63, 63, 62, 63, 65, 69, 70, 70, 69, 70, 68, 69, 71, 73, 71, 74, 78, 82, 81, 76, 74, 74, 72, 74, 78, 76, 71, 73, 74, 75, 73, 72, 67, 66, 61, 64, 59, 54, 55, 55, 56, 51, 55, 57, 58, 63, 64, 65, 60, 58, 59, 57, 59, 58, 57, 56, 56]
Punkte: 33, [53, 52, 47, 49, 51, 46, 43, 41, 36, 34, 38, 36, 35, 38, 38, 43, 47, 48, 53, 49, 45, 46, 45, 42, 42, 47, 49, 48, 47, 46, 43, 46, 44, 46, 48, 50, 52, 53, 58, 57, 5

### PPO

In [105]:
from stable_baselines3 import PPO
# Fresh environment instance for training.
env = CarDriver()

In [106]:
# PPO agent with an MLP policy on the custom environment; verbose=1 prints
# setup and training information.
model = PPO("MlpPolicy", env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [107]:
# Train the agent for 100,000 environment steps.
model.learn(total_timesteps=100_000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | 19.6     |
| time/              |          |
|    fps             | 2366     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 21.8        |
| time/                   |             |
|    fps                  | 1426        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.012212638 |
|    clip_fraction        | 0.0895      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 0.000634    |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x1b79643b0b0>

In [108]:
# Save the trained model to disk under Models/CarDriver.
model.save("Models/CarDriver")

In [109]:
# Create a fresh environment and load the saved model, attaching the
# environment to it.
env = CarDriver()
model = PPO.load("Models/CarDriver", env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [111]:
# Evaluate the trained PPO agent over a few episodes.
anz = 10  # number of evaluation episodes
for x in range(anz):
    # reset() returns (observation, info); keep only the start observation.
    state, _ = env.reset()
    score = 0  # sum of rewards
    done = False
    velocities = []

    while not done:
        # predict() returns (action, next hidden state); only the action is needed.
        action, _ = model.predict(state)
        state, reward, terminated, truncated, info = env.step(action)
        # An episode is over as soon as either end flag is raised.
        done = terminated or truncated
        score += reward
        velocities.append(int(info["V"]))

    print(f"Punkte: {score},\n {velocities}")

Punkte: 92,
 [52, 50, 45, 43, 48, 44, 45, 46, 50, 49, 52, 53, 49, 49, 51, 56, 54, 56, 56, 55, 53, 54, 56, 56, 51, 53, 55, 54, 55, 55, 57, 59, 54, 51, 52, 50, 48, 47, 51, 49, 51, 52, 52, 53, 55, 51, 53, 55, 53, 49, 54, 52, 48, 51, 52, 53, 55, 50, 54, 52, 54, 52, 49, 47, 47, 48, 50, 51, 49, 54, 52, 51, 48, 47, 49, 54, 54, 51, 48, 46, 46, 46, 45, 43, 45, 45, 46, 46, 45, 43, 45, 50, 50, 49, 47, 52, 53, 50, 50, 48]
Punkte: 90,
 [48, 43, 46, 48, 48, 51, 53, 49, 48, 52, 54, 58, 56, 52, 48, 53, 54, 55, 51, 48, 45, 49, 53, 50, 49, 48, 51, 46, 44, 48, 51, 49, 50, 51, 49, 49, 47, 48, 47, 49, 48, 51, 50, 51, 50, 49, 48, 47, 50, 52, 51, 52, 57, 55, 54, 51, 56, 51, 54, 53, 52, 54, 49, 50, 51, 50, 48, 48, 50, 45, 44, 42, 41, 44, 46, 45, 44, 44, 46, 48, 50, 54, 55, 55, 54, 57, 53, 54, 52, 53, 55, 53, 55, 52, 54, 54, 54, 49, 51, 50]
Punkte: 91,
 [52, 52, 50, 49, 50, 48, 44, 44, 46, 48, 50, 51, 50, 48, 46, 47, 47, 48, 50, 48, 44, 43, 47, 43, 47, 48, 46, 47, 48, 47, 49, 45, 46, 48, 50, 50, 48, 47, 51, 53