## Snake Modell

In [7]:
import gymnasium as gym
import numpy as np
import Snake
from gymnasium.spaces import Discrete, Box
from typing import Optional

In [20]:
class SnakePlayer(gym.Env):
    """Gymnasium environment wrapping a ``Snake.SnakeGame`` instance.

    Action space: ``Discrete(4)``; the chosen action is written to the game's
    ``nextDirection`` attribute before each game step.

    Observation space: an integer ``Box`` with the shape of the game's grid
    (``gridSize[0]`` x ``gridSize[1]``) and declared bounds ``[0, 100]``.

    Reward: ``5`` on every step in which the game's score increased, ``0``
    otherwise.
    """

    def __init__(self):
        self.game = Snake.SnakeGame()
        self.action_space = Discrete(4)
        self.observation_space = Box(0, 100, shape=(self.game.gridSize[0], self.game.gridSize[1]), dtype=np.int64)
        self.state = self.game.grid
        self.lastScore = 0

    def _observation(self):
        """Return a copy of the current grid in the dtype the observation space declares."""
        # An explicit dtype keeps the observation consistent with
        # ``observation_space`` even where NumPy's default integer is not int64.
        return np.array(self.state, dtype=np.int64)

    def _destroy_window(self):
        """Best-effort teardown of the game's UI window, if one was created."""
        window = getattr(self.game, "window", None)
        if window is None:
            return
        try:
            window.destroy()
        except Exception:
            # The window may already be gone (e.g. closed by the user).
            # Deliberately best-effort, but unlike a bare ``except`` this does
            # not swallow KeyboardInterrupt / SystemExit.
            pass

    def step(self, action):
        """Advance the game by one step using ``action`` as the next direction.

        Returns:
            The Gymnasium five-tuple ``(observation, reward, terminated,
            truncated, info)``. ``terminated`` mirrors ``game.death``;
            ``truncated`` is always ``False``; ``info`` carries the game's
            ``steps`` and ``score`` under the keys ``"Steps"`` and ``"Score"``.
        """
        self.game.nextDirection = action

        self.game.step()

        reward = 0
        if self.game.score > self.lastScore:
            reward = 5
            self.lastScore = self.game.score

        self.state = self.game.grid

        info = {"Steps": self.game.steps, "Score": self.game.score}
        return self._observation(), reward, bool(self.game.death), False, info

    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None,):
        """Start a fresh game and return ``(observation, info)``.

        Any UI window belonging to the previous game is destroyed first.
        """
        # Required by the Gymnasium API: seeds ``self.np_random`` when a seed is given.
        # NOTE(review): SnakeGame's own randomness is not visible here and is
        # presumably not driven by this seed - confirm if reproducibility matters.
        super().reset(seed=seed)

        self._destroy_window()

        self.game = Snake.SnakeGame()
        self.state = self.game.grid
        self.lastScore = 0
        return (self._observation(), {})

    def render(self):
        """Set up the game's UI and refresh its window."""
        # NOTE(review): setupUI() is invoked on every render call; whether it
        # creates a new window each time depends on SnakeGame - confirm.
        self.game.setupUI()
        self.game.window.update()

    def close(self):
        """Release the UI window (if any) so it does not outlive the environment."""
        self._destroy_window()

    def rightDirection(self):
        """Return True if the snake's current direction points toward the food.

        The horizontal offset is checked first; the vertical offset is only
        considered when the head's X equals ``foodPos[1]``.
        NOTE(review): this assumes ``foodPos`` is ordered (row/Y, column/X) and
        that directions 0/2 and 1/3 are the opposing horizontal/vertical
        pairs - confirm against SnakeGame.
        """
        game = self.game
        if game.headPosX > game.foodPos[1] and game.direction == 0:
            return True
        elif game.headPosX < game.foodPos[1] and game.direction == 2:
            return True
        elif game.headPosX == game.foodPos[1]:
            if game.headPosY > game.foodPos[0] and game.direction == 1:
                return True
            elif game.headPosY < game.foodPos[0] and game.direction == 3:
                return True
        return False

In [21]:
# Instantiate the custom Snake environment used by the random-action test run.
env = SnakePlayer()

In [22]:
durchgänge = 5  # number of episodes to play
for i in range(durchgänge):  # play the environment once per episode
    state, info = env.reset()  # reset() returns (observation, info)
    done = False
    score = 0

    while not done:
        # env.render()  # uncomment to draw the environment while playing
        action = env.action_space.sample()  # random action from the 4-way discrete action space
        # step() returns (observation, reward, terminated, truncated, info)
        state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated  # an episode ends on either flag
        score += reward
    print(f"Durchgang {i}, Score: {score}, Info: {info}")
env.close()

Durchgang 0, Score: 0, Info: {'Steps': 49, 'Score': 0}
Durchgang 1, Score: 0, Info: {'Steps': 44, 'Score': 0}
Durchgang 2, Score: 0, Info: {'Steps': 114, 'Score': 0}
Durchgang 3, Score: 0, Info: {'Steps': 45, 'Score': 0}
Durchgang 4, Score: 0, Info: {'Steps': 28, 'Score': 0}


In [23]:
from stable_baselines3.common.env_checker import check_env

In [24]:
# Validate the environment with Stable-Baselines3's environment checker,
# with its additional warnings enabled.
check_env(env, warn=True)



In [25]:
from stable_baselines3 import PPO

In [26]:
# Fresh environment instance for training.
env = SnakePlayer()

In [30]:
# Build a PPO agent with the "MlpPolicy" policy on the Snake environment;
# verbose=1 prints setup and training information.
model = PPO("MlpPolicy", env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [31]:
# Train the agent for 100,000 environment steps.
model.learn(total_timesteps=100000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 49.7     |
|    ep_rew_mean     | 0.366    |
| time/              |          |
|    fps             | 539      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 50.8        |
|    ep_rew_mean          | 0.312       |
| time/                   |             |
|    fps                  | 378         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.020792702 |
|    clip_fraction        | 0.154       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | -1.01       |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x13efec26960>

In [32]:
# Persist the trained agent under the relative path "Models/Snake".
model.save("Models/Snake")

In [33]:
# Fresh environment instance for evaluating the saved agent.
env = SnakePlayer()

In [34]:
# Reload the saved agent from "Models/Snake" and attach the evaluation environment.
model = PPO.load("Models/Snake", env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [35]:
durchgaenge = 10  # number of evaluation episodes
try:
    for d in range(durchgaenge):  # play one episode per iteration
        # reset() returns (observation, info); unpack it directly instead of
        # guessing from len(), which misfires when the grid itself has 2 rows.
        observation, info = env.reset()
        done = False
        score = 0

        while not done:
            env.render()  # draw the current game state
            # predict() returns (action, next hidden state); the state only
            # matters for recurrent policies.
            action, _states = model.predict(observation)
            observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated  # an episode ends on either flag
            score += reward
        print(f"Durchgang: {d}, Score: {score}, Steps: {info}")
finally:
    # Always release the environment, even when the run is interrupted.
    env.close()

Durchgang: 0, Score: 0, Steps: {'Steps': 53, 'Score': 0}
Durchgang: 1, Score: 0, Steps: {'Steps': 465, 'Score': 0}
Durchgang: 2, Score: 0, Steps: {'Steps': 508, 'Score': 0}
Durchgang: 3, Score: 0, Steps: {'Steps': 190, 'Score': 0}
Durchgang: 4, Score: 5, Steps: {'Steps': 130, 'Score': 1}
Durchgang: 5, Score: 5, Steps: {'Steps': 369, 'Score': 1}


KeyboardInterrupt: 