In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import random

In [2]:
class GridWorldEnv(gym.Env):
    """Square grid world with one agent, one fixed goal and one fixed pit.

    Positions are ``[row, col]`` index pairs into a ``size x size`` grid;
    row 0 is the first line printed by :meth:`render`.

    * Goal: bottom-right cell ``[size - 1, size - 1]``.
    * Pit: centre cell ``[size // 2, size // 2]``.
      (For ``size == 2`` this is the goal cell, so the pit is effectively
      absent because the goal is checked first.)
    * Agent: starts every episode on a random cell that is neither.

    Actions (``Discrete(4)``):

    ====== =========== ===========================
    action row/col     effect in the rendered grid
    ====== =========== ===========================
    0      ``[+1, 0]`` down
    1      ``[-1, 0]`` up
    2      ``[0, +1]`` right
    3      ``[0, -1]`` left
    ====== =========== ===========================

    Rewards: ``+10.0`` for reaching the goal, ``-10.0`` for entering the pit,
    ``-0.1`` for every other step. Both goal and pit end the episode.
    """

    metadata = {'render_modes': ['human', 'ansi']}

    def __init__(self, size=5, render_mode='ansi'):
        """Create the environment.

        Args:
            size: Side length of the grid. Must be at least 2; with a single
                cell the agent has no legal start position.
            render_mode: ``None`` or one of ``metadata['render_modes']``.
                Only ``'ansi'`` produces output in :meth:`render`.

        Raises:
            ValueError: If ``size`` is smaller than 2 or ``render_mode`` is
                not a supported mode.
        """
        if size < 2:
            # With size == 1 the only cell is the goal, so reset() could never
            # find a free start cell and would loop forever.
            raise ValueError(f"size must be at least 2, got {size}")
        if render_mode is not None and render_mode not in self.metadata['render_modes']:
            raise ValueError(
                f"Unsupported render_mode {render_mode!r}; "
                f"expected None or one of {self.metadata['render_modes']}"
            )

        self.size = size
        self.window_size = 512  # not used by any method of this class
        self.observation_space = spaces.MultiDiscrete([size, size])  # agent [row, col]
        self.action_space = spaces.Discrete(4)  # 0: down, 1: up, 2: right, 3: left
        self._action_to_direction = {
            0: np.array([1, 0]),   # Down  (row + 1)
            1: np.array([-1, 0]),  # Up    (row - 1)
            2: np.array([0, 1]),   # Right (col + 1)
            3: np.array([0, -1]),  # Left  (col - 1)
        }
        self.render_mode = render_mode

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Forwarded to ``gym.Env.reset`` to seed ``self.np_random``.
            options: Accepted for API compatibility; not used.

        Returns:
            ``(observation, info)`` for the initial state.
        """
        super().reset(seed=seed)

        # Goal and pit are fixed; only the agent's start cell is random.
        self._target_location = np.array([self.size - 1, self.size - 1])
        self._pit_location = np.array([self.size // 2, self.size // 2])

        # Rejection-sample a start cell that is neither the goal nor the pit.
        # Every draw uses the generator's default integer dtype so the first
        # draw and any redraws come from the same stream.
        while True:
            self._agent_location = self.np_random.integers(0, self.size, size=2)
            on_target = np.array_equal(self._agent_location, self._target_location)
            on_pit = np.array_equal(self._agent_location, self._pit_location)
            if not (on_target or on_pit):
                break

        return self._get_obs(), self._get_info()

    def step(self, action):
        """Move the agent one cell and score the resulting position.

        Args:
            action: Key into the direction table (0: down, 1: up, 2: right,
                3: left).

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``truncated`` is always ``False``.
        """
        # Transition dynamics: moves that would leave the grid are clipped,
        # so walking into a wall leaves the agent where it is.
        direction = self._action_to_direction[action]
        self._agent_location = np.clip(self._agent_location + direction, 0, self.size - 1)

        reached_target = np.array_equal(self._agent_location, self._target_location)
        fell_in_pit = np.array_equal(self._agent_location, self._pit_location)
        terminated = reached_target or fell_in_pit

        # Reward function: goal takes precedence over pit, as in the
        # termination check above.
        if reached_target:
            reward = 10.0
        elif fell_in_pit:
            reward = -10.0
        else:
            reward = -0.1  # per-step cost, also paid when bumping into a wall

        truncated = False  # no time limit is enforced by this environment
        return self._get_obs(), reward, terminated, truncated, self._get_info()

    # Helper/Convenience functions
    def _get_obs(self):
        """Return the agent's ``[row, col]`` position.

        A copy is returned so that callers mutating the observation in place
        cannot change the environment's internal state.
        """
        return self._agent_location.copy()

    def _get_info(self):
        """Return auxiliary info: the Chebyshev distance from agent to goal."""
        return {"distance": np.max(np.abs(self._agent_location - self._target_location))}

    def render(self):
        """Print a text picture of the grid when ``render_mode == 'ansi'``.

        ``A`` is the agent, ``G`` the goal, ``P`` the pit and ``_`` an empty
        cell. Other render modes produce no output.
        """
        if self.render_mode == "ansi":
            grid = np.full((self.size, self.size), "_", dtype=str)
            # Later assignments overwrite earlier ones, so goal/pit markers
            # win over the agent marker when they share a cell.
            grid[tuple(self._agent_location)] = "A"   # Agent
            grid[tuple(self._target_location)] = "G"  # Goal
            grid[tuple(self._pit_location)] = "P"     # Pit
            print("\n".join([" ".join(row) for row in grid]))
            print("-" * (2 * self.size))

    def close(self):
        """Release resources; this environment holds none."""
        pass



In [None]:
# Testing the environment: drive it with random actions until the episode
# ends or the step budget runs out.
SEED = 42       # fixes the start cell and the action sequence across re-runs
MAX_STEPS = 35  # upper bound on the number of random steps

env = GridWorldEnv(size=5, render_mode='ansi')
env.action_space.seed(SEED)  # makes action_space.sample() reproducible
observation, info = env.reset(seed=SEED)
env.render()

for i in range(MAX_STEPS):
    action = env.action_space.sample()  # Random action
    print(f"Step {i+1}: Taking action {action}")
    observation, reward, terminated, truncated, info = env.step(action)
    env.render()
    print(f"Observation: {observation}, Reward: {reward}, Terminated: {terminated}")
    print(f"Info: {info}")

    # An episode is over when it terminates (goal/pit) or is truncated.
    if terminated or truncated:
        print("Episode finished!")
        # Show that reset() yields a fresh start state, then stop.
        observation, info = env.reset()
        env.render()
        break
env.close()

_ _ _ _ _
_ _ _ _ _
_ _ P _ _
A _ _ _ _
_ _ _ _ G
----------
Step 1: Taking action 2
_ _ _ _ _
_ _ _ _ _
_ _ P _ _
_ A _ _ _
_ _ _ _ G
----------
Observation: [3 1], Reward: -0.1, Terminated: False
Info: {'distance': np.int64(3)}
Step 2: Taking action 3
_ _ _ _ _
_ _ _ _ _
_ _ P _ _
A _ _ _ _
_ _ _ _ G
----------
Observation: [3 0], Reward: -0.1, Terminated: False
Info: {'distance': np.int64(4)}
Step 3: Taking action 1
_ _ _ _ _
_ _ _ _ _
A _ P _ _
_ _ _ _ _
_ _ _ _ G
----------
Observation: [2 0], Reward: -0.1, Terminated: False
Info: {'distance': np.int64(4)}
Step 4: Taking action 3
_ _ _ _ _
_ _ _ _ _
A _ P _ _
_ _ _ _ _
_ _ _ _ G
----------
Observation: [2 0], Reward: -0.1, Terminated: False
Info: {'distance': np.int64(4)}
Step 5: Taking action 3
_ _ _ _ _
_ _ _ _ _
A _ P _ _
_ _ _ _ _
_ _ _ _ G
----------
Observation: [2 0], Reward: -0.1, Terminated: False
Info: {'distance': np.int64(4)}
Step 6: Taking action 0
_ _ _ _ _
_ _ _ _ _
_ _ P _ _
A _ _ _ _
_ _ _ _ G
----------
Observati