In [2]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import random
from enum import Enum
import pygame
import sys
from os import path

class NaveAction(Enum):
    izq = 0
    abajo = 1
    dcha = 2
    arriba = 3

class MisionEspacial:
    def __init__(self, grid_rows=4, grid_cols=5, fps=1):
        self.grid_rows = grid_rows
        self.grid_cols = grid_cols
        self.reset()
        self.fps = fps
        self.last_action = ''
        self._init_pygame()

    def _init_pygame(self):
        pygame.init()
        pygame.display.init()
        self.clock = pygame.time.Clock()
        self.action_font = pygame.font.SysFont("Calibre", 30)
        self.action_info_height = self.action_font.get_height()
        self.cell_height = 64
        self.cell_width = 64
        self.cell_size = (self.cell_width, self.cell_height)
        self.window_size = (self.cell_width * self.grid_cols, self.cell_height * self.grid_rows + self.action_info_height)
        self.window_surface = pygame.display.set_mode(self.window_size)
        self.load_sprites()

    def load_sprites(self):
        file_name = path.join("nave.png")
        img = pygame.image.load(file_name)
        self.nave_img = pygame.transform.scale(img, self.cell_size)
        file_name = path.join("cielo.png")
        img = pygame.image.load(file_name)
        self.fondo_img = pygame.transform.scale(img, self.cell_size)
        file_name = path.join("saturno.png")
        img = pygame.image.load(file_name)
        self.target_img = pygame.transform.scale(img, self.cell_size)

    def reset(self, seed=None):
        self.agent_pos = [0, 0]
        random.seed(seed)
        self.target_pos = [
            random.randint(1, self.grid_rows-1),
            random.randint(1, self.grid_cols-1)
        ]

    def perform_action(self, nave_action: NaveAction) -> bool:
        self.last_action = nave_action
        if nave_action == NaveAction.izq and self.agent_pos[1] > 0:
            self.agent_pos[1] -= 1
        elif nave_action == NaveAction.dcha and self.agent_pos[1] < self.grid_cols-1:
            self.agent_pos[1] += 1
        elif nave_action == NaveAction.arriba and self.agent_pos[0] > 0:
            self.agent_pos[0] -= 1
        elif nave_action == NaveAction.abajo and self.agent_pos[0] < self.grid_rows-1:
            self.agent_pos[0] += 1
        return self.agent_pos == self.target_pos

    def render(self, mode='human'):
        self.window_surface.fill((0,0,0))
        for r in range(self.grid_rows):
            for c in range(self.grid_cols):
                pos = (c * self.cell_width, r * self.cell_height)
                if [r, c] == self.agent_pos:
                    self.window_surface.blit(self.nave_img, pos)
                elif [r, c] == self.target_pos:
                    self.window_surface.blit(self.target_img, pos)
                else:
                    self.window_surface.blit(self.fondo_img, pos)
        text_img = self.action_font.render(f'Acción: {self.last_action}', True, (0,0,0), (255,255,255))
        text_pos = (0, self.window_size[1] - self.action_info_height)
        self.window_surface.blit(text_img, text_pos)
        pygame.display.update()
        self.clock.tick(self.fps)

    def _process_events(self):
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()
            if event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE:
                pygame.quit()
                sys.exit()

class MisionEspacialEnv(gym.Env):
    metadata = {"render_modes": ["human"], 'render_fps': 4}

    def __init__(self, grid_rows=4, grid_cols=5, render_mode=None):
        self.grid_rows = grid_rows
        self.grid_cols = grid_cols
        self.render_mode = render_mode
        self.mision_espacial = MisionEspacial(grid_rows=grid_rows, grid_cols=grid_cols, fps=self.metadata['render_fps'])
        self.action_space = spaces.Discrete(len(NaveAction))
        self.observation_space = spaces.Box(
            low=0,
            high=np.array([self.grid_rows-1, self.grid_cols-1, self.grid_rows-1, self.grid_cols-1]),
            shape=(4,),
            dtype=np.int32
        )

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.mision_espacial.reset(seed=seed)
        obs = np.concatenate((self.mision_espacial.agent_pos, self.mision_espacial.target_pos))
        info = {}
        if self.render_mode == 'human':
            self.render()
        return obs, info

    def step(self, action):
        target_reached = self.mision_espacial.perform_action(NaveAction(action))
        reward = 0
        terminated = False
        if target_reached:
            reward = 1
            terminated = True
        obs = np.concatenate((self.mision_espacial.agent_pos, self.mision_espacial.target_pos))
        info = {}
        if self.render_mode == 'human':
            print(NaveAction(action))
            self.render()
        return obs, reward, terminated, False, info

    def render(self):
        self.mision_espacial.render()


In [3]:
import gymnasium as gym
from gymnasium.envs.registration import register

register(
    id='v0_mision_espacial-v0',  
    entry_point='__main__:MisionEspacialEnv',  
)

env = gym.make('v0_mision_espacial-v0', render_mode='human')
obs = env.reset()[0]
while True:
    rand_action = env.action_space.sample()
    obs, reward, terminated, _, _ = env.step(rand_action)
    if terminated:
        break
env.close()


  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


NaveAction.arriba


  if not isinstance(terminated, (bool, np.bool8)):
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


NaveAction.arriba
NaveAction.abajo
NaveAction.arriba
NaveAction.abajo
NaveAction.arriba
NaveAction.arriba
NaveAction.izq
NaveAction.dcha
NaveAction.dcha
NaveAction.abajo
NaveAction.dcha
NaveAction.abajo
NaveAction.izq
NaveAction.dcha
NaveAction.izq
NaveAction.izq
NaveAction.arriba
NaveAction.arriba
NaveAction.izq
NaveAction.arriba
NaveAction.dcha
NaveAction.izq
NaveAction.dcha
NaveAction.arriba
NaveAction.abajo
NaveAction.dcha
NaveAction.abajo
NaveAction.abajo
NaveAction.izq
NaveAction.arriba
NaveAction.arriba
NaveAction.izq
NaveAction.arriba
NaveAction.abajo
NaveAction.izq
NaveAction.arriba
NaveAction.arriba
NaveAction.abajo
NaveAction.arriba
NaveAction.arriba
NaveAction.abajo
NaveAction.abajo
NaveAction.dcha
NaveAction.izq
NaveAction.arriba
NaveAction.dcha
NaveAction.izq
NaveAction.dcha
NaveAction.izq
NaveAction.abajo
NaveAction.izq
NaveAction.dcha
NaveAction.abajo
NaveAction.dcha
NaveAction.abajo
NaveAction.arriba
NaveAction.dcha
NaveAction.dcha
NaveAction.arriba


In [4]:
import random
from enum import Enum
import pygame
import sys
from os import path
import gymnasium as gym
from gymnasium import spaces
from gymnasium.envs.registration import register
import numpy as np

# Acciones posibles que puede hacer la nave:
class NaveAction(Enum):
    izq = 0
    abajo = 1
    dcha = 2
    arriba = 3

# Distintos elementos (casillas) que componen el juego
class Etiquetas(Enum):
    agua = 0
    nave = 1
    objetivo = 2

# Al imprimir un objeto de la clase Etiquetas, nos muestra solo el primer caracter del nombre del elemento
def __str__(self):
    return self.name[:1]

class MisionEspacialEnv(gym.Env):
    metadata = {"render_modes": ["human"], 'render_fps': 4}

    def __init__(self, grid_rows=4, grid_cols=5, render_mode=None, fps=4):
        self.grid_rows = grid_rows
        self.grid_cols = grid_cols
        self.render_mode = render_mode
        self.fps = fps
        self.last_action = ''
        self._init_pygame()
        self.reset()

        self.action_space = spaces.Discrete(len(NaveAction))
        self.observation_space = spaces.Box(
            low=0,
            high=np.array([self.grid_rows-1, self.grid_cols-1, self.grid_rows-1, self.grid_cols-1]),
            shape=(4,),
            dtype=np.int32
        )

    def _init_pygame(self):
        pygame.init()
        pygame.display.init()
        self.clock = pygame.time.Clock()
        self.action_font = pygame.font.SysFont("Calibre", 30)
        self.action_info_height = self.action_font.get_height()
        self.cell_height = 64
        self.cell_width = 64
        self.cell_size = (self.cell_width, self.cell_height)
        self.window_size = (self.cell_width * self.grid_cols, self.cell_height * self.grid_rows + self.action_info_height)
        self.window_surface = pygame.display.set_mode(self.window_size)
        self.load_sprites()

    def load_sprites(self):
        file_name = path.join("nave.png")
        img = pygame.image.load(file_name)
        self.nave_img = pygame.transform.scale(img, self.cell_size)

        file_name = path.join("cielo.png")
        img = pygame.image.load(file_name)
        self.fondo_img = pygame.transform.scale(img, self.cell_size)

        file_name = path.join("saturno.png")
        img = pygame.image.load(file_name)
        self.objetivo_img = pygame.transform.scale(img, self.cell_size)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.agent_pos = [0, 0]
        random.seed(seed)
        self.target_pos = [
            random.randint(1, self.grid_rows-1),
            random.randint(1, self.grid_cols-1)
        ]
        obs = np.concatenate((self.agent_pos, self.target_pos))
        return obs, {}

    def perform_action(self, nave_action: NaveAction) -> bool:
        self.last_action = nave_action
        if nave_action == NaveAction.izq:
            if self.agent_pos[1] > 0:
                self.agent_pos[1] -= 1
        elif nave_action == NaveAction.dcha:
            if self.agent_pos[1] < self.grid_cols-1:
                self.agent_pos[1] += 1
        elif nave_action == NaveAction.arriba:
            if self.agent_pos[0] > 0:
                self.agent_pos[0] -= 1
        elif nave_action == NaveAction.abajo:
            if self.agent_pos[0] < self.grid_rows-1:
                self.agent_pos[0] += 1
        return self.agent_pos == self.target_pos

    def step(self, action):
        target_reached = self.perform_action(NaveAction(action))
        reward = 1 if target_reached else 0
        terminated = target_reached
        obs = np.concatenate((self.agent_pos, self.target_pos))
        return obs, reward, terminated, False, {}

    def render(self):
        self.window_surface.fill((0, 0, 0))
        for r in range(self.grid_rows):
            for c in range(self.grid_cols):
                pos = (c * self.cell_width, r * self.cell_height)
                if [r, c] == self.agent_pos:
                    self.window_surface.blit(self.nave_img, pos)
                elif [r, c] == self.target_pos:
                    self.window_surface.blit(self.objetivo_img, pos)
                else:
                    self.window_surface.blit(self.fondo_img, pos)
        text_img = self.action_font.render(f'Action: {self.last_action}', True, (0, 0, 0), (255, 255, 255))
        text_pos = (0, self.window_size[1] - self.action_info_height)
        self.window_surface.blit(text_img, text_pos)
        pygame.display.update()
        self.clock.tick(self.fps)

    def close(self):
        pygame.quit()
        sys.exit()

# Registro del entorno personalizado
register(
    id='v0_mision_espacial-v0',
    entry_point='__main__:MisionEspacialEnv',
)

# Creación del entorno
env = gym.make('v0_mision_espacial-v0', render_mode='human')


  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


In [10]:
import numpy as np
import random
import math

# Assuming 'env' is imported and defined somewhere in your code
# For example:
# from my_environment import MyEnvironment
# env = MyEnvironment()

# Mocking 'env' for demonstration
class MyEnvironment:
    def __init__(self):
        self.grid_rows = 10
        self.grid_cols = 10
        self.action_space = [0, 1, 2, 3]  # Example action space

    def reset(self):
        return np.zeros((self.grid_rows, self.grid_cols)), 0

    def step(self, action):
        return np.zeros((self.grid_rows, self.grid_cols)), 0, False, {}, {}

# Configuración de hiperparámetros
episodes = 10
discount = 0.95
episodes_display = 10
learning_rate = 0.25
epsilon = 0.2

# Inicializar tabla Q con valores aleatorios
env = MyEnvironment()  # Mock environment
q_table = np.random.randn(env.grid_rows, env.grid_cols, len(env.action_space))

# Almacenamiento de las recompensas totales por episodio
ep_rewards = []
ep_rewards_table = {'ep': [], 'avg': [], 'min': [], 'max': []}

def discretised_state(state):
    return tuple(state.astype(int))

for episode in range(episodes):
    episode_reward = 0
    state_array, _ = env.reset()
    curr_discrete_state = discretised_state(state_array)
    done = False

    if episode % episodes_display == 0:
        render_state = True
    else:
        render_state = False

    while not done:
        if np.random.random() > epsilon:
            action = np.argmax(q_table[curr_discrete_state])
        else:
            action = random.choice(env.action_space)

        new_state_array, reward, done, _, _ = env.step(action)
        new_discrete_state = discretised_state(new_state_array)

        if render_state:
            print(state_array)  # Rendering the state

        if not done:
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[curr_discrete_state][:][:][action]
            new_q = current_q + learning_rate * (reward + discount * max_future_q - current_q)
            q_table[curr_discrete_state][:][:][action] = new_q

        curr_discrete_state = new_discrete_state
        episode_reward += reward

    ep_rewards.append(episode_reward)

    if not episode % episodes_display:
        avg_reward = sum(ep_rewards[-episodes_display:]) / episodes_display
        ep_rewards_table['ep'].append(episode)
        ep_rewards_table['avg'].append(avg_reward)
        ep_rewards_table['min'].append(min(ep_rewards[-episodes_display:]))
        ep_rewards_table['max'].append(max(ep_rewards[-episodes_display:]))
        print(f"Episode:{episode} avg:{avg_reward} min:{min(ep_rewards[-episodes_display:])} max:{max(ep_rewards[-episodes_display:])}")

# env.close()  # Uncomment if your environment has a close method


IndexError: too many indices for array: array is 3-dimensional, but 10 were indexed

In [9]:
import numpy as np
import random
import math

# Assuming 'env' is imported and defined somewhere in your code
# For example:
# from my_environment import MyEnvironment
# env = MyEnvironment()

# Mocking 'env' for demonstration
class MyEnvironment:
    def __init__(self):
        self.grid_rows = 10
        self.grid_cols = 10
        self.action_space = [0, 1, 2, 3]  # Example action space

    def reset(self):
        return np.zeros((self.grid_rows, self.grid_cols)), 0

    def step(self, action):
        return np.zeros((self.grid_rows, self.grid_cols)), 0, False, {}, {}

# Configuración de hiperparámetros
episodes = 10
discount = 0.95
episodes_display = 10
learning_rate = 0.25
epsilon = 0.2

# Inicializar tabla Q con valores aleatorios
env = MyEnvironment()  # Mock environment
q_table = np.random.randn(env.grid_rows, env.grid_cols, len(env.action_space))

# Almacenamiento de las recompensas totales por episodio
ep_rewards = []
ep_rewards_table = {'ep': [], 'avg': [], 'min': [], 'max': []}

def discretised_state(state):
    return tuple(state.astype(int))

for episode in range(episodes):
    episode_reward = 0
    state_array, _ = env.reset()
    curr_discrete_state = discretised_state(state_array)
    done = False

    if episode % episodes_display == 0:
        render_state = True
    else:
        render_state = False

    while not done:
        if np.random.random() > epsilon:
            action = np.argmax(q_table[curr_discrete_state])
        else:
            action = random.choice(env.action_space)

        new_state_array, reward, done, _, _ = env.step(action)
        new_discrete_state = discretised_state(new_state_array)

        if render_state:
            print(state_array)  # Rendering the state

        if not done:
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[curr_discrete_state][:][action]
            new_q = current_q + learning_rate * (reward + discount * max_future_q - current_q)
            q_table[curr_discrete_state][:][action] = new_q

        curr_discrete_state = new_discrete_state
        episode_reward += reward

    ep_rewards.append(episode_reward)

    if not episode % episodes_display:
        avg_reward = sum(ep_rewards[-episodes_display:]) / episodes_display
        ep_rewards_table['ep'].append(episode)
        ep_rewards_table['avg'].append(avg_reward)
        ep_rewards_table['min'].append(min(ep_rewards[-episodes_display:]))
        ep_rewards_table['max'].append(max(ep_rewards[-episodes_display:]))
        print(f"Episode:{episode} avg:{avg_reward} min:{min(ep_rewards[-episodes_display:])} max:{max(ep_rewards[-episodes_display:])}")

# env.close()  # Uncomment if your environment has a close method


IndexError: too many indices for array: array is 3-dimensional, but 10 were indexed

In [5]:
import numpy as np
import random
import math

# Configuración de hiperparámetros
episodes = 10
discount = 0.95
episodes_display = 10
learning_rate = 0.25
epsilon = 0.2

# Inicializar tabla Q con valores aleatorios
q_table = np.random.randn(env.grid_rows, env.grid_cols, env.grid_rows, env.grid_cols, env.action_space.n)

# Almacenamiento de las recompensas totales por episodio
ep_rewards = []
ep_rewards_table = {'ep': [], 'avg': [], 'min': [], 'max': []}

def discretised_state(state):
    return tuple(state.astype(int))

for episode in range(episodes):
    episode_reward = 0
    state_array = env.reset()[0]
    curr_discrete_state = discretised_state(state_array)
    done = False
    i = 0

    if episode % episodes_display == 0:
        render_state = True
    else:
        render_state = False

    while not done:
        if np.random.random() > epsilon:
            action = np.argmax(q_table[curr_discrete_state])
        else:
            action = np.random.randint(0, env.action_space.n)

        new_state_array, reward, done, _, _ = env.step(action)
        new_discrete_state = discretised_state(new_state_array)

        if render_state:
            env.render()

        if not done:
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[curr_discrete_state + (action,)]
            new_q = current_q + learning_rate * (reward + discount * max_future_q - current_q)
            q_table[curr_discrete_state + (action,)] = new_q

        curr_discrete_state = new_discrete_state
        episode_reward += reward

    ep_rewards.append(episode_reward)

    if not episode % episodes_display:
        avg_reward = sum(ep_rewards[-episodes_display:])/len(ep_rewards[-episodes_display:])
        ep_rewards_table['ep'].append(episode)
        ep_rewards_table['avg'].append(avg_reward)
        ep_rewards_table['min'].append(min(ep_rewards[-episodes_display:]))
        ep_rewards_table['max'].append(max(ep_rewards[-episodes_display:]))
        print(f"Episode:{episode} avg:{avg_reward} min:{min(ep_rewards[-episodes_display:])} max:{max(ep_rewards[-episodes_display:])}")

env.close()


Episode:0 avg:1.0 min:1 max:1


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
