<a href="https://colab.research.google.com/github/rahul-727/Reinforcement-Learning-/blob/main/2348544_Lab6_RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

In [2]:
class GridWorldEnv:
    """Deterministic rectangular grid world with a single goal cell.

    States are ``(row, col)`` tuples. The agent moves one cell per step with
    the actions ``'up'``, ``'down'``, ``'left'`` and ``'right'``; a move that
    would leave the grid keeps the agent in place. Reaching the goal yields a
    reward of ``+1`` and ends the episode, every other step costs ``-0.01``.
    """

    def __init__(self, grid_size=(4, 4), goal_state=(3, 3), start_state=(0, 0)):
        """Build the grid, validate the start/goal cells and cache transitions.

        Args:
            grid_size: ``(rows, cols)`` of the grid.
            goal_state: ``(row, col)`` of the terminal cell.
            start_state: ``(row, col)`` the agent occupies after ``reset``.

        Raises:
            ValueError: If ``goal_state`` or ``start_state`` is not a cell of
                the grid.
        """
        self.grid_size = grid_size
        # Store states as tuples: they are used as dict keys and compared with
        # the tuples in ``state_space``, so a list such as [0, 0] must be
        # normalised or it would be unhashable / never equal to any state.
        self.goal_state = tuple(goal_state)
        self.start_state = tuple(start_state)
        self.action_space = ['up', 'down', 'left', 'right']
        self.state_space = [(i, j) for i in range(grid_size[0]) for j in range(grid_size[1])]

        # Fail early: an out-of-grid goal can never be reached and an
        # out-of-grid start has no entry in the transition table.
        for label, cell in (("goal_state", self.goal_state),
                            ("start_state", self.start_state)):
            if cell not in self.state_space:
                raise ValueError(
                    f"{label} {cell} is outside the grid of size {grid_size}"
                )

        self.current_state = self.start_state
        self.transitions = self._create_transitions()

    def _create_transitions(self):
        """Precompute the successor state for every state-action pair.

        Returns:
            Nested dict such that ``transitions[state][action]`` is the next
            state.
        """
        return {
            state: {
                action: self._next_state(state, action)
                for action in self.action_space
            }
            for state in self.state_space
        }

    def _next_state(self, state, action):
        """Return the state reached from ``state`` by ``action``.

        Moves are clamped to the grid borders, so walking into a wall leaves
        the state unchanged.

        Raises:
            ValueError: If ``action`` is not one of the four known moves.
        """
        x, y = state
        if action == 'up':
            return max(x - 1, 0), y
        elif action == 'down':
            return min(x + 1, self.grid_size[0] - 1), y
        elif action == 'left':
            return x, max(y - 1, 0)
        elif action == 'right':
            return x, min(y + 1, self.grid_size[1] - 1)
        # Previously fell through and returned None, which would silently
        # store None as a successor state in the transition table.
        raise ValueError(f"Invalid action: {action}")

    def reset(self):
        """Move the agent back to the start state and return that state."""
        self.current_state = self.start_state
        return self.current_state

    def step(self, action):
        """Apply ``action`` and advance the environment by one step.

        Args:
            action: One of the entries of ``action_space``.

        Returns:
            Tuple ``(next_state, reward, done)`` where ``reward`` is ``1`` and
            ``done`` is ``True`` when the goal is reached, otherwise the
            reward is ``-0.01`` and ``done`` is ``False``.

        Raises:
            ValueError: If ``action`` is not in ``action_space``.
        """
        if action not in self.action_space:
            raise ValueError(f"Invalid action: {action}")

        next_state = self.transitions[self.current_state][action]
        self.current_state = next_state

        if self.current_state == self.goal_state:
            reward = 1
            done = True
        else:
            reward = -0.01  # Small penalty to encourage shorter paths
            done = False

        return next_state, reward, done

    def render(self):
        """Print the grid with the agent as ``A`` and the goal as ``G``.

        Empty cells are printed as ``-``. The agent is drawn last, so it hides
        the goal marker when standing on the goal cell. A blank line follows
        the grid.
        """
        grid = np.full(self.grid_size, '-')
        x, y = self.current_state
        gx, gy = self.goal_state
        grid[gx, gy] = 'G'
        grid[x, y] = 'A'
        print('\n'.join([' '.join(row) for row in grid]))
        print()

In [3]:
if __name__ == "__main__":
    # Demo rollout: drive the environment with a uniformly random policy and
    # print every transition until the goal cell is reached.
    env = GridWorldEnv()
    state = env.reset()
    env.render()

    while True:
        # Sample one of the environment's actions uniformly at random.
        action = np.random.choice(env.action_space)  # Random policy
        next_state, reward, done = env.step(action)
        print(f"Action: {action}, Next State: {next_state}, Reward: {reward}, Done: {done}")
        env.render()
        if done:
            # The episode ends as soon as the environment reports the goal.
            break

A - - -
- - - -
- - - -
- - - G

Action: down, Next State: (1, 0), Reward: -0.01, Done: False
- - - -
A - - -
- - - -
- - - G

Action: right, Next State: (1, 1), Reward: -0.01, Done: False
- - - -
- A - -
- - - -
- - - G

Action: right, Next State: (1, 2), Reward: -0.01, Done: False
- - - -
- - A -
- - - -
- - - G

Action: left, Next State: (1, 1), Reward: -0.01, Done: False
- - - -
- A - -
- - - -
- - - G

Action: left, Next State: (1, 0), Reward: -0.01, Done: False
- - - -
A - - -
- - - -
- - - G

Action: left, Next State: (1, 0), Reward: -0.01, Done: False
- - - -
A - - -
- - - -
- - - G

Action: up, Next State: (0, 0), Reward: -0.01, Done: False
A - - -
- - - -
- - - -
- - - G

Action: left, Next State: (0, 0), Reward: -0.01, Done: False
A - - -
- - - -
- - - -
- - - G

Action: down, Next State: (1, 0), Reward: -0.01, Done: False
- - - -
A - - -
- - - -
- - - G

Action: up, Next State: (0, 0), Reward: -0.01, Done: False
A - - -
- - - -
- - - -
- - - G

Action: up, Next State: (0, 0