In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pygame
import random

CAR_COLORS = [(255, 0, 0)]

class CrossyRoadEnv(gym.Env):
    
    def __init__(self):
        super().__init__()
        
        self.width = 40
        self.height = 25
        self.observation_space = spaces.Box(low=0, high=5, shape=(self.height, self.width), dtype=np.int32)
        self.action_space = spaces.Discrete(5)  # 0: up, 1: down, 2: left, 3: right
        
        # Inicjalizacja PyGame do renderowania
        pygame.init()
        self.screen = pygame.display.set_mode((self.width * 40, self.height * 40))
        self.clock = pygame.time.Clock()
        
        self.reset()

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)

        self.steps = 0  # Initialize step counter
        self.max_steps = 500 

        self.agent_pos = [self.width // 2, self.height - 1]
        self.cars = []
        self.logs = []
        self.water_rows = set()
        self.car_rows = set()
        self.done = False
        self.score = 0

        reserved_rows = [0, self.height - 0, self.height - 1, self.height - 2]  # Zarezerwowane wiersze
        all_valid_rows = [y for y in range(self.height - 1) if y not in reserved_rows]

        max_water = max(1, int(0.4 * len(all_valid_rows)))
        min_water = max(1, int(0.2 * len(all_valid_rows)))
        water_rand = random.randint(min_water, max_water)
        self.water_rows = set(random.sample(all_valid_rows, min(water_rand, len(all_valid_rows))))
        
        possible_car_rows = list(set(all_valid_rows) - self.water_rows)
        max_road = max(1, int(0.4 * len(possible_car_rows)))
        min_road = max(1, int(0.2 * len(possible_car_rows)))
        road_rand = random.randint(min_road, max_road)
        self.car_rows = set(random.sample(possible_car_rows, min(road_rand, len(possible_car_rows))))


        # Twórz auta – 1–5 na drogę
        for row in self.car_rows:
            direction = random.choice([-1, 1])
            num_cars = random.randint(1, 5)
            for _ in range(num_cars):
                x = random.randint(0, self.width - 1)
                color = random.choice(CAR_COLORS)
                self.cars.append([x, row, direction, color])


        # Twórz kłody – po 1–3 na rzekę (długość 3)
        prev_direction = None
        for row in sorted(self.water_rows):
            # Wymuś inny kierunek niż w poprzednim wierszu
            direction = random.choice([-1, 1])
            if prev_direction is not None:
                direction = -prev_direction
            prev_direction = direction

            for _ in range(random.randint(1, 3)):
                start_x = random.randint(0, self.width - 3)
                self.logs.append([start_x, row, direction])
                
        # Dodaj skały – np. 20 losowych
        self.rocks = []
        rocks_count = np.random.randint(0.1 * self.width * self.height, 0.15 * self.width * self.height)
        for _ in range(rocks_count):
            attempt = 0
            while attempt < 100:  # nie nieskończoność
                rx = random.randint(0, self.width - 1)
                ry = random.randint(1, self.height - 1)

                if (
                    ry in self.water_rows or
                    ry in self.car_rows or
                    ry >= self.height - 4 or
                    [rx, ry] == self.agent_pos or
                    [rx, ry] in self.rocks
                ):
                    attempt += 1
                    continue
                
                self.rocks.append([rx, ry])
                break


        return self._get_obs(), {}

    def step(self, action):
        if self.done:
            return self._get_obs(), 0, True, False, {}
        
        self.steps += 1
        if self.steps >= self.max_steps:
            return self._get_obs(), -15.0, True, False, {"TimeLimit.truncated": True}
    
        reward = 0.0
        old_x, old_y = self.agent_pos
    
        # Movement logic (unchanged)
        x, y = self.agent_pos
        if action == 0 and y > 0: y -= 1
        elif action == 1 and y < self.height - 1: y += 1
        elif action == 2 and x > 0: x -= 1
        elif action == 3 and x < self.width - 1: x += 1
        elif action == 4: pass  # no-op
        
        self.agent_pos = [x, y]
    
        # ---- New Reward Shaping ----
        # Big reward for upward movement
        if y < old_y: reward += 4.0
        
        # Penalize hesitation
        if self.agent_pos == [old_x, old_y]: reward -= 2.0
        
        # Small penalty for moving down
        elif y > old_y: reward -= 0.5
        
        # Time bonus (faster = better)
        reward += 0.1 * (self.max_steps - self.steps)
    
        # ---- Updated Failure Penalties ----
        # Collision with car (less severe)
        for car in self.cars:
            if self.agent_pos == [car[0], car[1]]:
                return self._get_obs(), -5.0, True, False, {}
        
        # Drowning (less severe)
        if self.agent_pos[1] in self.water_rows:
            on_log = False
            for log in self.logs:
                if log[1] == self.agent_pos[1] and log[0] <= self.agent_pos[0] < log[0] + 3:
                    on_log = True
                    break
            if not on_log:
                return self._get_obs(), -5.0, True, False, {}
    
        # Goal (unchanged, still awesome)
        if self.agent_pos[1] == 0:
            reward += 50.0
            return self._get_obs(), reward, True, False, {}
    
        return self._get_obs(), reward, False, False, {}


    
    def _get_obs(self):
        grid = np.zeros((self.height, self.width), dtype=np.int32)

        # Pola drogi
        for y in self.car_rows:
            for x in range(self.width):
                grid[y, x] = 5

        grid[self.agent_pos[1], self.agent_pos[0]] = 1  # Agent

        for car in self.cars:
            if 0 <= car[1] < self.height and 0 <= car[0] < self.width:
                grid[car[1], car[0]] = 2

        for log in self.logs:
            for i in range(3):
                lx = log[0] + i
                if 0 <= lx < self.width:
                    grid[log[1], lx] = 3

        # Zaznacz wodę tam, gdzie nie ma kłody
        for y in self.water_rows:
            for x in range(self.width):
                if grid[y, x] == 0:
                    grid[y, x] = 4

        # Skały jako np. wartość 5 w macierzy
        for rock in self.rocks:
            grid[rock[1], rock[0]] = 5


        return grid

    
    def render(self):
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                exit()


        self.screen.fill((170, 220 ,60))
        
        # Rysuj tło – puste pola
        for y in range(self.height):
            for x in range(self.width):
                pygame.draw.rect(self.screen, (125, 235, 80), (x * 40, y * 40, 40, 40))  # jasna zieleń

        # Rysuj wodę
        for y in self.water_rows:
            for x in range(self.width):
                if all(not (log[1] == y and log[0] <= x < log[0] + 3) for log in self.logs):
                    pygame.draw.rect(self.screen, (0, 180, 215), (x * 40, y * 40, 40, 40))

        
        # Rysuj drogę
        for y in self.car_rows:
            for x in range(self.width):
                pygame.draw.rect(self.screen, (160, 130, 100), (x * 40, y * 40, 40, 40))  # ciemnoszary asfalt
        
        # Rysuj samochody
        for car in self.cars:
            pygame.draw.rect(self.screen, (255, 55, 55), (car[0] * 40, car[1] * 40, 40, 40))

        for log in self.logs:
            for i in range(3):
                lx = log[0] + i
                if 0 <= lx < self.width:
                    pygame.draw.rect(self.screen, (145, 65, 30), (lx * 40, log[1] * 40, 40, 40))

        # Rysuj skały
        for rock in self.rocks:
            pygame.draw.rect(self.screen, (100, 100, 100), (rock[0] * 40, rock[1] * 40, 40, 40))

        
        pygame.draw.circle(self.screen, (255, 200, 0), (self.agent_pos[0] * 40 + 20, self.agent_pos[1] * 40 + 20), 20)
        
        pygame.display.flip()
        self.clock.tick(12)  # 10 FPS
    
    def close(self):
        pygame.quit()

In [4]:
class OneHotWrapper(gym.ObservationWrapper):
    def __init__(self, env, num_values=6):
        super().__init__(env)
        self.num_values = num_values
        height, width = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0.0,
            high=1.0,
            shape=(height, width, num_values),  # (H, W, C)
            dtype=np.float32,
        )

    def observation(self, obs):
        one_hot = np.eye(self.num_values)[obs]  # (H, W, C)
        return one_hot.astype(np.float32)

from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch.nn as nn
import torch as th

class CustomCNN(BaseFeaturesExtractor):
    def __init__(self, observation_space, features_dim=128):
        super().__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten()
        )
        with th.no_grad():
            sample_input = th.as_tensor(observation_space.sample()[None]).float()
            n_flatten = self.cnn(sample_input).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations):
        return self.linear(self.cnn(observations))


In [5]:
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback

env = make_vec_env(lambda: OneHotWrapper(CrossyRoadEnv()), n_envs=4)

model = A2C(
    "CnnPolicy",
    env,
    verbose=1,
    learning_rate=0.0003,
    n_steps=128,
    gamma=0.99,
    ent_coef=0.01,
    policy_kwargs=dict(
        normalize_images=False,
        features_extractor_class=CustomCNN,
    ),
)




# Callback do oceny postępów
eval_callback = EvalCallback(
    env,
    eval_freq=1000,
    best_model_save_path="./a2c_crossyroad/",
)

# Trening!
model.learn(total_timesteps=200_000, callback=eval_callback)

# Zapisz model
model.save("a2c_crossyroad")

Using cuda device
Eval num_timesteps=4000, episode_reward=11462.00 +/- 0.00
Episode length: 500.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 500      |
|    mean_reward        | 1.15e+04 |
| time/                 |          |
|    total_timesteps    | 4000     |
| train/                |          |
|    entropy_loss       | -1.6     |
|    explained_variance | 4.89e-06 |
|    learning_rate      | 0.0003   |
|    n_updates          | 7        |
|    policy_loss        | 1.5e+03  |
|    value_loss         | 1.58e+06 |
------------------------------------
New best mean reward!
Eval num_timesteps=8000, episode_reward=11462.00 +/- 0.00
Episode length: 500.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 500      |
|    mean_reward        | 1.15e+04 |
| time/                 |          |
|    total_timesteps    | 8000     |
| train/                |          |
|  

In [9]:
# Ładowanie modelu
model = A2C.load("a2c_crossyroad/best_model.zip")


env = OneHotWrapper(CrossyRoadEnv())  # Użyj wrappera, tak jak przy trenowaniu
obs, _ = env.reset()

for _ in range(1000):
    action, _ = model.predict(obs)
    obs, _, done, _, _ = env.step(action)
    env.render()
    if done:
        obs, _ = env.reset()



KeyboardInterrupt: 

In [None]:

# if __name__ == "__main__":
#     env = CrossyRoadEnv()
#     obs, _ = env.reset()
#     for _ in range(100):
#         action = env.action_space.sample()
#         obs, reward, done, _, _ = env.step(action)
#         env.render()
#         if done:
#             obs, _ = env.reset()
#     env.close()


# Uczenie

In [None]:
# from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
# from stable_baselines3.common.env_util import make_vec_env
# from stable_baselines3 import DQN

# env = make_vec_env(lambda: CrossyRoadEnv(), n_envs=1)
# env = VecTransposeImage(env)  # (C, H, W) for CNN

# # Zbuduj model DQN
# model = DQN(
#     policy="CnnPolicy",
#     env=env,
#     exploration_fraction=0.3,
#     exploration_final_eps=0.05,
#     learning_rate=1e-4,
#     buffer_size=50000,
#     learning_starts=1000,
#     batch_size=64,
#     tau=1.0,
#     gamma=0.99,
#     train_freq=4,
#     target_update_interval=1000,
#     verbose=1
# )

# # Trenuj agenta
# model.learn(total_timesteps=100_000)

# # Zapisz model
# model.save("dqn_crossy_model")

# print("Trening zakończony. Model zapisany jako 'dqn_crossy_model'.")


## Uruchomienie po learningu

In [None]:
# import time
# from stable_baselines3 import DQN
# from gymnasium.wrappers import FlattenObservation

# env = CrossyRoadEnv()
# env = FlattenObservation(env)

# model = DQN.load("dqn_crossy_model")

# obs, _ = env.reset()
# start_time = time.time()

# while True:
#     action, _ = model.predict(obs)
#     obs, reward, done, _, _ = env.step(action)
#     env.render()

#     # Sprawdzenie czasu
#     if time.time() - start_time > 60:
#         print("Koniec po 60 sekundach.")
#         break

#     if done:
#         obs, _ = env.reset()
