## Snake Modell

In [6]:
import gymnasium as gym
import numpy as np
import Snake
from gymnasium.spaces import Discrete, Box, Dict, Sequence
from typing import Optional

In [14]:
class SnakePlayer(gym.Env):
    """Gymnasium environment that lets an agent play ``Snake.SnakeGame``.

    Actions are the four movement directions encoded as ``0=left, 1=up,
    2=right, 3=down``. The observation is the game's grid, exposed as a 2-D
    integer array of shape ``gridSize``.

    Reward shaping per step:
      * ``+REWARD_TOWARD_FOOD`` if the snake is heading towards the food
        (see ``rightDirection``), otherwise ``-PENALTY_AWAY_FROM_FOOD``;
      * ``+REWARD_FOOD_EATEN`` whenever the game score increased.
    """

    # Reward-shaping constants (previously inline magic numbers in ``step``).
    REWARD_TOWARD_FOOD = 2
    PENALTY_AWAY_FROM_FOOD = 1
    REWARD_FOOD_EATEN = 500

    def __init__(self):
        self.action_space = Discrete(4)  # left, up, right, down
        self.game = Snake.SnakeGame()
        # NOTE(review): the bounds 0..100 are assumed to cover every cell value
        # the game writes into its grid - confirm against Snake.SnakeGame.
        self.observation_space = Box(
            low=0,
            high=100,
            shape=(self.game.gridSize[0], self.game.gridSize[1]),
            dtype=np.int32,
        )
        self.state = self.game.grid
        self.lastScore = 0

    def _observation(self):
        """Return a copy of the current grid in the dtype declared by ``observation_space``."""
        return np.array(self.state, dtype=self.observation_space.dtype)

    def _destroy_window(self):
        """Best-effort teardown of the game's UI window, if one was created."""
        window = getattr(self.game, "window", None)
        if window is None:
            return
        try:
            window.destroy()
        except Exception:
            # The window may already be gone (e.g. closed by the user); this is
            # deliberately best-effort, but KeyboardInterrupt/SystemExit must
            # still propagate, hence no bare ``except``.
            pass

    def step(self, action):
        """Advance the game by one tick using ``action`` as the next direction.

        Args:
            action: Direction index in ``[0, 3]``; NumPy scalars are accepted.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``terminated`` mirrors ``game.death``, ``truncated`` is always
            ``False`` and ``info`` carries the game's ``Steps`` and ``Score``.
        """
        reward = 0

        # Policies hand over NumPy values; the game gets a plain int.
        self.game.nextDirection = int(action)
        self.game.step()

        if self.rightDirection():
            reward += self.REWARD_TOWARD_FOOD
        else:
            reward -= self.PENALTY_AWAY_FROM_FOOD

        if self.game.score > self.lastScore:
            reward += self.REWARD_FOOD_EATEN
            self.lastScore = self.game.score

        self.state = self.game.grid

        info = {"Steps": self.game.steps, "Score": self.game.score}
        return self._observation(), reward, bool(self.game.death), False, info

    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None,):
        """Start a fresh game and return ``(observation, info)``.

        Args:
            seed: Forwarded to ``gym.Env.reset`` so the environment's own RNG
                is seeded. NOTE(review): ``Snake.SnakeGame`` is constructed
                without a seed, so the game itself is presumably not made
                deterministic by this - confirm.
            options: Accepted for API compatibility; unused.
        """
        super().reset(seed=seed)
        self._destroy_window()
        self.game = Snake.SnakeGame()
        self.state = self.game.grid
        self.lastScore = 0
        return (self._observation(), {})

    def render(self):
        """Draw the current game state in the game's own window."""
        # NOTE(review): setupUI() runs on every render call; if it creates a
        # new window each time, it should only be called once per game - confirm.
        self.game.setupUI()
        self.game.window.update()

    def close(self):
        """Release the UI window, if any, when the environment is closed."""
        self._destroy_window()

    def rightDirection(self):
        """Return ``True`` if the snake's current direction leads towards the food.

        Horizontal alignment takes priority: while the head's X differs from
        the food's column (``foodPos[1]``), only moving left (0) or right (2)
        towards it counts. Once the columns match, moving up (1) or down (3)
        towards the food's row (``foodPos[0]``) counts.
        """
        game = self.game
        food_row, food_col = game.foodPos[0], game.foodPos[1]

        if game.headPosX > food_col:
            return bool(game.direction == 0)
        if game.headPosX < food_col:
            return bool(game.direction == 2)

        # Same column as the food: only vertical movement can close the gap.
        if game.headPosY > food_row:
            return bool(game.direction == 1)
        if game.headPosY < food_row:
            return bool(game.direction == 3)
        return False
            

In [15]:
# Single shared environment instance used by all cells below.
env = SnakePlayer()

In [16]:
# Smoke test: play a few episodes with uniformly random actions.
durchgaenge = 5  # number of episodes to play
for d in range(durchgaenge):
    # reset() returns an (observation, info) pair under the Gymnasium API.
    state, info = env.reset()
    done = False
    score = 0  # accumulated reward of this episode

    while not done:
        env.render()  # draw the current game state
        action = env.action_space.sample()  # pick a random action
        n_state, reward, terminated, truncated, info = env.step(action)
        # An episode ends on either signal; ignoring `truncated` would loop
        # forever in an environment that enforces a time limit.
        done = terminated or truncated
        score += reward
    print(f"Durchgang: {d + 1}, Reward: {score}, {info}")
env.close()

Durchgang: 1, Reward: -10, {'Steps': 16, 'Score': 0}
Durchgang: 2, Reward: -45, {'Steps': 74, 'Score': 0}
Durchgang: 3, Reward: -18, {'Steps': 44, 'Score': 0}
Durchgang: 4, Reward: -19, {'Steps': 21, 'Score': 0}
Durchgang: 5, Reward: -15, {'Steps': 15, 'Score': 0}


In [10]:
from stable_baselines3.common.env_checker import check_env

In [12]:
# Run Stable-Baselines3's environment checker on the custom env, with
# additional warnings enabled (warn=True).
check_env(env, warn=True)

In [13]:
from stable_baselines3 import PPO

In [8]:
# Create a PPO agent with the "MlpPolicy" policy on the Snake environment;
# verbose=1 turns on the library's informational output.
model = PPO("MlpPolicy", env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [13]:
# Train the agent for 2.4 million environment steps (long-running cell).
model.learn(total_timesteps=2400000)

ImportError: You must install tqdm and rich in order to use the progress bar callback. It is included if you install stable-baselines with the extra packages: `pip install stable-baselines3[extra]`

In [10]:
# Persist the trained model; the path is relative to the current working directory.
model.save('Models/Snake')

In [12]:
# Evaluate the trained model over several episodes.
durchgaenge = 10  # number of episodes to play
for d in range(durchgaenge):
    # Unpack reset() directly. The previous `len(observation) == 2` check was
    # meant to strip the (observation, info) tuple, but it would also fire for
    # any grid with exactly two rows and corrupt the observation.
    observation, info = env.reset()
    done = False
    score = 0  # accumulated reward of this episode

    while not done:
        # env.render()  # uncomment to watch the agent play
        # predict() returns (action, next_state); the state is only relevant
        # for recurrent policies.
        action, _ = model.predict(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        score += reward
    print(f"Durchgang: {d + 1}, Score: {score}, Steps: {info}")
env.close()

Durchgang: 1, Score: 0, Steps: {'Steps': 1500, 'Score': 0}
Durchgang: 2, Score: 0, Steps: {'Steps': 1995, 'Score': 0}
Durchgang: 3, Score: 0, Steps: {'Steps': 546, 'Score': 0}
Durchgang: 4, Score: 0, Steps: {'Steps': 703, 'Score': 0}
Durchgang: 5, Score: 10, Steps: {'Steps': 692, 'Score': 1}
Durchgang: 6, Score: 0, Steps: {'Steps': 942, 'Score': 0}
Durchgang: 7, Score: 10, Steps: {'Steps': 562, 'Score': 1}
Durchgang: 8, Score: 0, Steps: {'Steps': 1013, 'Score': 0}
Durchgang: 9, Score: 0, Steps: {'Steps': 792, 'Score': 0}
Durchgang: 10, Score: 0, Steps: {'Steps': 50, 'Score': 0}
