**Objective:** 
Your task is to program an agent to find the optimal policy for navigating a labyrinth from a specified starting point to a goal point using the Value Iteration algorithm.

**Step 1: Familiarize with the Environment**
- Understand the structure of the `Labyrinth` class and how it represents the labyrinth environment, including walls, the starting point, and the goal.
- Familiarize yourself with how the `Agent` class is structured, and how it interacts with the labyrinth environment.

**Step 2: Implement Value Iteration**
- Create a function or method to implement the Value Iteration algorithm.
- You'll need to initialize a utility table with zeros and iteratively update the utilities of each state (i.e., each cell in the labyrinth) based on the Bellman equation.
- The stopping criterion for Value Iteration is when the maximum change in utility is less than a small threshold, say 0.01.
- Once the utilities have converged, use them to compute the optimal policy, which specifies the best action to take in each state.

**Step 3: Modify the Agent Class**
- Modify the `act` method of the `Agent` class to use the optimal policy derived from Value Iteration instead of taking random actions.
- Optionally, you can also modify the `update` method to incorporate any additional learning or updating you wish to implement.

**Step 4: Run the Simulation**
- Run the provided simulation loop, where the agent is placed in the labyrinth and must navigate to the goal.
- Observe how the agent's behavior changes as it learns the optimal policy.
- You might want to add some print statements or other logging to help visualize the agent's path through the labyrinth and how it improves over time.



In [1]:
from typing import NamedTuple

import random
import time

import numpy as np


# intruduced a nemad tuple for actions
class Action(NamedTuple):
    delta_col: int
    delta_row: int

    def __str__(self):
        return f'Action({self.delta_row}, {self.delta_col})'


class Labyrinth:
    def __init__(self, rows, cols, walls, start, goal):
        self.grid = np.zeros((rows, cols))
        for wall in walls:
            self.grid[wall] = -1  # Assign -1 for walls
        self.start = start
        self.goal = goal
        self.current_position = start

    def reset(self):
        self.current_position = self.start
        return self.current_position

    def step(self, action):
        # Assume actions are encoded as (delta_row, delta_col)
        new_position = (
            self.current_position[0] + action[0],
            self.current_position[1] + action[1],
        )
        if self.is_valid_move(new_position):
            self.current_position = new_position
        reward = 1 if self.current_position == self.goal else 0
        return self.current_position, reward

    def is_valid_move(self, position):
        rows, cols = self.grid.shape
        return (
            0 <= position[0] < rows
            and 0 <= position[1] < cols
            and self.grid[position] != -1
        )

    def done(self):
        return self.current_position == self.goal

    def render(self, episode, t, action):
        print(f'{episode=}, {t=}')
        print(action)
        grid_copy = self.grid.copy()
        grid_copy[self.current_position] = 2
        grid_copy[self.goal] = 9
        print(grid_copy)
        time.sleep(1)
        print('-' * 20)

In [2]:
class Agent:
    def __init__(self, environment: Labyrinth) -> None:
        self.start = environment.start
        self.environment = environment
        self.utility_table = np.zeros(labyrinth.grid.shape)
        for wall in environment.grid == -1:
            self.utility_table[wall] = -1

        self.utility_table[labyrinth.goal] = 1

        self.possible_moves = {
            Action(0, 1),
            Action(0, -1),
            Action(1, 0),
            Action(-1, 0),
        }
        self.curr_pos = self.start

    def reset(self):
        self.curr_pos = self.start

    def act(self, state) -> Action:
        best_action = None
        posible_moves = list(self.possible_moves)
        curr_best_pos_x, curr_best_pos_y = state
        random.shuffle(posible_moves)
        for action in posible_moves:
            new_pos_x = self.curr_pos[0] + action[0]
            new_pos_y = self.curr_pos[1] + action[1]

            if best_action is None:
                best_action = action
                curr_best_pos_x = new_pos_x
                curr_best_pos_y = new_pos_y

            if (
                self.utility_table[new_pos_x, new_pos_y]
                > self.utility_table[curr_best_pos_x, curr_best_pos_y]
            ):
                best_action = action
                curr_best_pos_x = new_pos_x
                curr_best_pos_y = new_pos_y

        return best_action

        # return random.choice([
        #     Action(0, 1),
        #     Action(0, -1),
        #     Action(1, 0),
        #     Action(-1, 0),
        # ])  # Random action for demonstration

    def update(self, action: Action, state, reward) -> None:
        self.utility_table[state] = reward
        # Update any agent state here


# Define labyrinth

walls = {(1, 1), (2, 1), (1, 2)}

labyrinth = Labyrinth(rows=4, cols=4, walls=walls, start=(0, 0), goal=(3, 3))
agent = Agent(labyrinth)

MAX_EPISODES = 1000
T = 100

for episode in range(MAX_EPISODES):
    state = labyrinth.reset()
    agent.reset()
    for t in range(T):
        action = agent.act(state)
        state, reward = labyrinth.step(action)
        agent.update(action, state, reward)
        labyrinth.render(episode=episode, t=t, action=action)
        if labyrinth.done():
            break

episode=0, t=0
Action(1, 0)
[[ 0.  2.  0.  0.]
 [ 0. -1. -1.  0.]
 [ 0. -1.  0.  0.]
 [ 0.  0.  0.  9.]]
--------------------
episode=0, t=1
Action(-1, 0)
[[ 2.  0.  0.  0.]
 [ 0. -1. -1.  0.]
 [ 0. -1.  0.  0.]
 [ 0.  0.  0.  9.]]
--------------------
episode=0, t=2
Action(1, 0)
[[ 0.  2.  0.  0.]
 [ 0. -1. -1.  0.]
 [ 0. -1.  0.  0.]
 [ 0.  0.  0.  9.]]
--------------------
episode=0, t=3
Action(0, -1)
[[ 0.  2.  0.  0.]
 [ 0. -1. -1.  0.]
 [ 0. -1.  0.  0.]
 [ 0.  0.  0.  9.]]
--------------------
episode=0, t=4
Action(0, -1)
[[ 0.  2.  0.  0.]
 [ 0. -1. -1.  0.]
 [ 0. -1.  0.  0.]
 [ 0.  0.  0.  9.]]
--------------------
episode=0, t=5
Action(1, 0)
[[ 0.  0.  2.  0.]
 [ 0. -1. -1.  0.]
 [ 0. -1.  0.  0.]
 [ 0.  0.  0.  9.]]
--------------------
episode=0, t=6
Action(-1, 0)
[[ 0.  2.  0.  0.]
 [ 0. -1. -1.  0.]
 [ 0. -1.  0.  0.]
 [ 0.  0.  0.  9.]]
--------------------
episode=0, t=7
Action(0, -1)
[[ 0.  2.  0.  0.]
 [ 0. -1. -1.  0.]
 [ 0. -1.  0.  0.]
 [ 0.  0.  0.  9.]]
--------

KeyboardInterrupt: 