In [1]:
# Import the modules used throughout the notebook
import math
import matplotlib.pyplot as plt
import numpy as np
from time import sleep
import gymnasium as gym
from gymnasium import Env, spaces, register, make
import random
import copy

In [7]:
class RandomMaze(Env):
    """Stochastic 3x4 grid world ("random maze") as a Gymnasium environment.

    States are numbered row by row::

         0    1    2    3   <- 3 is the goal (entering it pays +1.00)
         4    5#   6    7   <- 5 is a wall, 7 is the hole (entering it pays -1.00)
         8    9   10   11   <- the agent starts in 8

    Actions are numbered clockwise: 0 = up, 1 = right, 2 = down, 3 = left.
    The intended move happens with probability 0.8; with probability 0.1
    each the agent slips to one of the two perpendicular directions instead.
    A move that would leave the grid or enter the wall keeps the agent in
    place. Every transition that does not enter the goal or the hole yields
    a reward of -0.04.
    """

    # ----- 1 -----
    # Constructor and small helper functions

    def __init__(self):
        """Build the transition table and declare the Gymnasium spaces."""
        # P[state][action] is a list of possible outcomes, each a tuple
        # (transition probability, next state, reward, terminated).
        # Actions: 0 -> up, 1 -> right, 2 -> down, 3 -> left (clockwise).
        # Within each list the first entry is the intended move and the other
        # two are the perpendicular slips.
        self.P = {
            0: {
                0: [(0.8, 0, -0.04, False), (0.1, 1, -0.04, False), (0.1, 0, -0.04, False)],
                1: [(0.8, 1, -0.04, False), (0.1, 0, -0.04, False), (0.1, 4, -0.04, False)],
                2: [(0.8, 4, -0.04, False), (0.1, 1, -0.04, False), (0.1, 0, -0.04, False)],
                3: [(0.8, 0, -0.04, False), (0.1, 0, -0.04, False), (0.1, 4, -0.04, False)],
            },
            1: {
                0: [(0.8, 1, -0.04, False), (0.1, 2, -0.04, False), (0.1, 0, -0.04, False)],
                # Slipping down from state 1 runs into the wall (state 5), so
                # the agent stays in 1. State 6 is diagonal to state 1 and is
                # not reachable in a single move.
                1: [(0.8, 2, -0.04, False), (0.1, 1, -0.04, False), (0.1, 1, -0.04, False)],
                2: [(0.8, 1, -0.04, False), (0.1, 2, -0.04, False), (0.1, 0, -0.04, False)],
                3: [(0.8, 0, -0.04, False), (0.1, 1, -0.04, False), (0.1, 1, -0.04, False)],
            },
            2: {
                0: [(0.8, 2, -0.04, False), (0.1, 1, -0.04, False), (0.1, 3, 1.00, True)],
                1: [(0.8, 3, 1.00, True), (0.1, 2, -0.04, False), (0.1, 6, -0.04, False)],
                2: [(0.8, 6, -0.04, False), (0.1, 1, -0.04, False), (0.1, 3, 1.00, True)],
                3: [(0.8, 1, -0.04, False), (0.1, 2, -0.04, False), (0.1, 6, -0.04, False)],
            },
            # Goal: absorbing, no further reward.
            3: {
                0: [(1.0, 3, 0.00, True)],
                1: [(1.0, 3, 0.00, True)],
                2: [(1.0, 3, 0.00, True)],
                3: [(1.0, 3, 0.00, True)],
            },
            4: {
                0: [(0.8, 0, -0.04, False), (0.1, 4, -0.04, False), (0.1, 4, -0.04, False)],
                1: [(0.8, 4, -0.04, False), (0.1, 0, -0.04, False), (0.1, 8, -0.04, False)],
                2: [(0.8, 8, -0.04, False), (0.1, 4, -0.04, False), (0.1, 4, -0.04, False)],
                3: [(0.8, 4, -0.04, False), (0.1, 0, -0.04, False), (0.1, 8, -0.04, False)],
            },
            # Wall: no transition in this table leads here; it is encoded as
            # an absorbing state only so that every state index has an entry.
            5: {
                0: [(1.0, 5, 0.00, True)],
                1: [(1.0, 5, 0.00, True)],
                2: [(1.0, 5, 0.00, True)],
                3: [(1.0, 5, 0.00, True)],
            },
            6: {
                0: [(0.8, 2, -0.04, False), (0.1, 6, -0.04, False), (0.1, 7, -1.00, True)],
                1: [(0.8, 7, -1.00, True), (0.1, 2, -0.04, False), (0.1, 10, -0.04, False)],
                2: [(0.8, 10, -0.04, False), (0.1, 6, -0.04, False), (0.1, 7, -1.00, True)],
                3: [(0.8, 6, -0.04, False), (0.1, 2, -0.04, False), (0.1, 10, -0.04, False)],
            },
            # Hole: absorbing, no further reward.
            7: {
                0: [(1.0, 7, 0.00, True)],
                1: [(1.0, 7, 0.00, True)],
                2: [(1.0, 7, 0.00, True)],
                3: [(1.0, 7, 0.00, True)],
            },
            8: {
                0: [(0.8, 4, -0.04, False), (0.1, 9, -0.04, False), (0.1, 8, -0.04, False)],
                1: [(0.8, 9, -0.04, False), (0.1, 4, -0.04, False), (0.1, 8, -0.04, False)],
                2: [(0.8, 8, -0.04, False), (0.1, 8, -0.04, False), (0.1, 9, -0.04, False)],
                3: [(0.8, 8, -0.04, False), (0.1, 4, -0.04, False), (0.1, 8, -0.04, False)],
            },
            9: {
                0: [(0.8, 9, -0.04, False), (0.1, 10, -0.04, False), (0.1, 8, -0.04, False)],
                1: [(0.8, 10, -0.04, False), (0.1, 9, -0.04, False), (0.1, 9, -0.04, False)],
                2: [(0.8, 9, -0.04, False), (0.1, 8, -0.04, False), (0.1, 10, -0.04, False)],
                3: [(0.8, 8, -0.04, False), (0.1, 9, -0.04, False), (0.1, 9, -0.04, False)],
            },
            10: {
                0: [(0.8, 6, -0.04, False), (0.1, 9, -0.04, False), (0.1, 11, -0.04, False)],
                1: [(0.8, 11, -0.04, False), (0.1, 6, -0.04, False), (0.1, 10, -0.04, False)],
                2: [(0.8, 10, -0.04, False), (0.1, 11, -0.04, False), (0.1, 9, -0.04, False)],
                3: [(0.8, 9, -0.04, False), (0.1, 6, -0.04, False), (0.1, 10, -0.04, False)],
            },
            11: {
                0: [(0.8, 7, -1.00, True), (0.1, 10, -0.04, False), (0.1, 11, -0.04, False)],
                1: [(0.8, 11, -0.04, False), (0.1, 11, -0.04, False), (0.1, 7, -1.00, True)],
                2: [(0.8, 11, -0.04, False), (0.1, 10, -0.04, False), (0.1, 11, -0.04, False)],
                3: [(0.8, 10, -0.04, False), (0.1, 11, -0.04, False), (0.1, 7, -1.00, True)],
            },
        }

        self.size = 12  # Number of cells in the 3x4 grid (states 0..11)

        # One discrete state per grid cell.
        # NOTE(review): _get_obs() returns a dict, not an int in this space;
        # left as-is because other cells may rely on `observation_space.n`.
        self.observation_space = spaces.Discrete(self.size)

        # Four actions: 0 = up, 1 = right, 2 = down, 3 = left.
        self.action_space = spaces.Discrete(4)
        self.action_space_size_ = 4

    def _get_obs(self):
        """Return the current agent and target state indices as a dict."""
        return {
            "agent": self._agent_location,
            "target": self._target_location,
        }

    def _get_info(self):
        """Return the absolute difference between agent and target indices.

        This is a difference of flat state indices, not a distance measured
        on the 2-D grid.
        """
        return {
            "distance": abs(self._agent_location - self._target_location)
        }

    # ----- 2 -----
    # Reset: start a new episode

    def reset(self, seed=None, options=None):
        """Start a new episode with the agent in state 8.

        Args:
            seed: Optional seed forwarded to ``Env.reset`` to seed
                ``self.np_random``, which ``step`` samples from.
            options: Accepted for API compatibility; not used.

        Returns:
            ``(observation, info)`` as produced by ``_get_obs`` and
            ``_get_info``.
        """
        super().reset(seed=seed)

        self._agent_location = 8   # start state (bottom-left corner)
        self._target_location = 3  # goal state
        self._dead_state = 7       # hole state

        observation = self._get_obs()
        info = self._get_info()

        return observation, info

    # ----- 3 -----
    # Step: apply one action

    def step(self, action):
        """Apply ``action`` and sample the outcome from the transition table.

        Args:
            action: Action index, 0 = up, 1 = right, 2 = down, 3 = left.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``truncated`` is always ``False``. ``info["log"]`` records the
            previous state, the action taken and the resulting state.
        """
        prev_location = self._agent_location
        transitions = self.P[prev_location][action]
        probabilities = [outcome[0] for outcome in transitions]

        # Sample from the environment's own generator rather than the global
        # `random` module, so that `reset(seed=...)` makes the dynamics
        # reproducible.
        index = self.np_random.choice(len(transitions), p=probabilities)
        _, self._agent_location, reward, terminated = transitions[index]

        truncated = False
        observation = self._get_obs()
        info = self._get_info()

        info["log"] = {
            "current_state": prev_location,
            "action": action,
            "next_state": self._agent_location,
        }

        # Return the 5-tuple required by the Gymnasium API
        return observation, reward, terminated, truncated, info


In [8]:
# Make the maze available to `make` under the id 'RMaze'.
# Re-running this cell overrides the previous registration, so the id always
# points at the most recently defined RandomMaze class.
register(
    id='RMaze',
    entry_point=RandomMaze,
)

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


In [9]:
# Create the environment and roll out a uniformly random policy
N_STEPS = 10  # total number of environment steps, possibly spanning several episodes

environment = make('RMaze')
environment.action_space.seed(0)  # make the sampled actions repeatable as well

reward_sum = 0
# reset() returns an (observation, info) pair, so unpack both
observation, info = environment.reset(seed=0)
for _ in range(N_STEPS):
    action = environment.action_space.sample()  # this is where you would insert your policy
    observation, reward, terminated, truncated, info = environment.step(action)
    reward_sum += reward
    print(info['log'])

    if terminated:
        print("Terminated", "\n")

    if terminated or truncated:
        # Seed only the first reset; re-seeding here would restart the
        # environment's random stream at every episode boundary.
        observation, info = environment.reset()

# The loop counts steps, not episodes, so this is the mean reward per step.
print(f"Average reward per step over {N_STEPS} steps = {reward_sum / N_STEPS}")

{'current_state': 8, 'action': 1, 'next_state': 9}
{'current_state': 9, 'action': 0, 'next_state': 9}
{'current_state': 9, 'action': 3, 'next_state': 8}
{'current_state': 8, 'action': 2, 'next_state': 8}
{'current_state': 8, 'action': 1, 'next_state': 9}
{'current_state': 9, 'action': 2, 'next_state': 9}
{'current_state': 9, 'action': 1, 'next_state': 10}
{'current_state': 10, 'action': 3, 'next_state': 9}
{'current_state': 9, 'action': 2, 'next_state': 9}
{'current_state': 9, 'action': 3, 'next_state': 8}
 Average Reward over 10 episode =  -0.04
