## Eigene Umgebung

In [41]:
import gymnasium as gym
import numpy as np
from gymnasium.spaces import Discrete, Box
from typing import Optional

### Typen von Spaces

Action Space und Observation Space haben jeweils einen Typ (z. B. `Discrete` oder `Box`).

In [5]:
Discrete(3)  # A single integer from 0 up to n-1 (here: 0, 1 or 2)

Discrete(3)

In [21]:
# A Box space: arrays of 4 float32 values, each bounded below by 0 and above by 10.
b = Box(0, 10, dtype=np.float32, shape=(4,))

In [22]:
# Draw one random element from the Box space defined above.
b.sample()

array([2.11341  , 5.20718  , 7.8557796, 5.66286  ], dtype=float32)

### Eigene Umgebung definieren

Autofahrer

Soll immer zwischen 45 km/h und 55 km/h fahren

Drei Actions: Bremsen, Beschleunigen, Nichts

Observations: Derzeitige Geschwindigkeit

Wenn die aktuelle Geschwindigkeit zwischen 45 km/h und 55 km/h liegt (45 <= v <= 55), soll eine Belohnung ausgeschüttet werden

State: Derzeitige Geschwindigkeit

In [82]:
class CarDriver(gym.Env):
    """Toy environment in which an agent must hold a car's speed in a target band.

    Actions (``Discrete(3)``):
        0: do nothing, 1: brake (speed - 3), 2: accelerate (speed + 3).

    Observation (``Box(0, 100, shape=(1,), dtype=int8)``):
        The current speed as a one-element array.

    Reward:
        1 for every step whose resulting speed lies in [45, 55], otherwise 0.

    Episode end:
        After ``EPISODE_LENGTH`` steps the episode is reported as terminated.

    The speed is clipped to the bounds of the observation space after every
    step, so the returned observation is always a valid member of
    ``observation_space`` and always fits into int8.
    """

    START_SPEED = 50        # speed at the beginning of every episode
    EPISODE_LENGTH = 60     # number of steps per episode
    MIN_SPEED = 0           # lower bound of the observation space
    MAX_SPEED = 100         # upper bound of the observation space
    TARGET_LOW = 45         # lower edge of the rewarded speed band (inclusive)
    TARGET_HIGH = 55        # upper edge of the rewarded speed band (inclusive)
    SPEED_STEP = 3          # speed change caused by braking / accelerating

    def __init__(self):
        # 3 actions, 0: do nothing, 1: brake, 2: accelerate
        self.action_space = Discrete(3)
        self.observation_space = Box(
            self.MIN_SPEED, self.MAX_SPEED, shape=(1,), dtype=np.int8
        )
        self.state = self.START_SPEED
        self.driveLength = self.EPISODE_LENGTH

    def _observation(self):
        """Return the current speed in the array form declared by observation_space."""
        return np.array([self.state], dtype=np.int8)

    def step(self, action):
        """Apply ``action``, add noise, and advance the episode by one step.

        Returns:
            (observation, reward, terminated, truncated, info) where
            ``truncated`` is always False and ``info`` is an empty dict.
        """
        if action == 1:
            self.state -= self.SPEED_STEP
        elif action == 2:
            self.state += self.SPEED_STEP

        # Random noise of -1, 0 or +1, drawn from the environment's own
        # generator so that reset(seed=...) makes episodes reproducible.
        self.state += int(self.np_random.integers(-1, 2))

        # Keep the speed inside the declared observation bounds; without this
        # the value can drift below 0 or above 100 (and beyond the int8 range).
        self.state = int(np.clip(self.state, self.MIN_SPEED, self.MAX_SPEED))

        reward = 1 if self.TARGET_LOW <= self.state <= self.TARGET_HIGH else 0

        self.driveLength -= 1
        terminated = self.driveLength <= 0

        return self._observation(), reward, terminated, False, {}

    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
        """Start a new episode and return ``(observation, info)``.

        Args:
            seed: Optional seed forwarded to the base class, which (re)seeds
                ``self.np_random``.
            options: Accepted for API compatibility; not used.
        """
        # Let gym.Env handle seeding of self.np_random.
        super().reset(seed=seed)
        self.state = self.START_SPEED
        self.driveLength = self.EPISODE_LENGTH
        return self._observation(), {}

In [83]:
from stable_baselines3.common.env_checker import check_env

In [84]:
# Instantiate the custom environment defined above.
env = CarDriver()

In [85]:
# Run Stable-Baselines3's environment checker on the custom environment;
# warn=True asks it to also emit its additional warnings.
check_env(env, warn=True)

### Testen

In [87]:
# Baseline: play a few episodes with uniformly random actions.
d = 5
for x in range(d):
    start, info = env.reset()  # reset() returns (observation, info)
    score = 0
    done = False

    while not done:
        # env.render() would draw the environment here; CarDriver does not implement it.
        action = env.action_space.sample()  # draw a random action
        # step() returns 5 values: new observation, reward, terminated, truncated, info
        newState, reward, terminated, truncated, info = env.step(action)
        # The episode is over when it terminates OR is truncated; checking only
        # the first flag would loop forever on a truncated episode.
        done = terminated or truncated
        score += reward
    print(f"Durchgang: {x}, Score: {score}")
# env.close()

Durchgang: 0, Score: 5
Durchgang: 1, Score: 54
Durchgang: 2, Score: 16
Durchgang: 3, Score: 9
Durchgang: 4, Score: 2


### Lernen/Trainieren

In [89]:
from stable_baselines3 import PPO

In [88]:
# Fresh environment instance used for training.
env = CarDriver()

In [90]:
# Create a PPO agent with the "MlpPolicy" policy for this environment;
# verbose=1 makes it print the setup and training logs shown below.
model = PPO("MlpPolicy", env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [91]:
# First training run: 20,000 environment timesteps.
model.learn(total_timesteps=20_000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | 23.4     |
| time/              |          |
|    fps             | 667      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 60          |
|    ep_rew_mean          | 22.8        |
| time/                   |             |
|    fps                  | 317         |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008014955 |
|    clip_fraction        | 0.0276      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -0.0012     |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x2ce3853c8c0>

In [92]:
# Train the same `model` object for a further 20,000 timesteps.
# Note: the logged total_timesteps counter starts again at 2048 (see output below).
model.learn(total_timesteps=20_000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | 37.5     |
| time/              |          |
|    fps             | 805      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 60          |
|    ep_rew_mean          | 39.1        |
| time/                   |             |
|    fps                  | 383         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008983763 |
|    clip_fraction        | 0.122       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.06       |
|    explained_variance   | 0.0459      |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x2ce3853c8c0>

In [93]:
# Train the same `model` object for a further 50,000 timesteps.
# Note: the logged total_timesteps counter starts again at 2048 (see output below).
model.learn(total_timesteps=50_000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | 49       |
| time/              |          |
|    fps             | 581      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 60          |
|    ep_rew_mean          | 49.2        |
| time/                   |             |
|    fps                  | 303         |
|    iterations           | 2           |
|    time_elapsed         | 13          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013218502 |
|    clip_fraction        | 0.103       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.04       |
|    explained_variance   | 0.0114      |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x2ce3853c8c0>

In [94]:
# Persist the trained model under the path "Models/CarDriver".
model.save("Models/CarDriver")

In [99]:
# Evaluate the trained model: let it pick the action in every step.
d = 5
for x in range(d):
    # reset() returns (observation, info); unpack it directly instead of
    # guessing from the length of the returned value.
    state, info = env.reset()
    score = 0
    done = False

    while not done:
        # env.render() would draw the environment here; CarDriver does not implement it.
        action, _ = model.predict(state)  # action chosen by the trained policy
        # step() returns 5 values: new observation, reward, terminated, truncated, info
        state, reward, terminated, truncated, info = env.step(action)
        # The episode is over when it terminates OR is truncated.
        done = terminated or truncated
        score += reward
    print(f"Durchgang: {x}, Score: {score}")
# env.close()

Durchgang: 0, Score: 52
Durchgang: 1, Score: 37
Durchgang: 2, Score: 48
Durchgang: 3, Score: 58
Durchgang: 4, Score: 46
