In [None]:
import numpy as np

# --- Grid-world configuration -------------------------------------------------
# Grid dimensions and the action labels used by the value-iteration loop.
ROWS, COLS = 3, 4
ACTIONS = ['UP', 'DOWN', 'LEFT', 'RIGHT']

# Special cells, expressed as (row, col) pairs.
START, GOAL, RED_STATE = (0, 0), (2, 3), (1, 1)

# Rewards / costs.
STEP_COST = -0.04
REWARD_GOAL = 1
PENALTY_RED_STATE = -1

# Transition model: probability of the chosen action, and of each of the two
# perpendicular actions.
PROB_ACTION = 0.7
PROB_PERP_ACTION = 0.15

# --- Grid layout --------------------------------------------------------------
# One float32 entry per cell, laid out as ROWS x COLS.
_WORLD_LAYOUT = [
    [0, 0, 0, 1],
    [0, -1, 0, -1],
    [0, 0, 0, 0],
]
WORLD = np.array(_WORLD_LAYOUT, dtype=np.float32)
del _WORLD_LAYOUT  # keep the namespace identical to a direct np.array(...) call

# --- Utility table ------------------------------------------------------------
# Every cell starts with a utility of 0.
utilities = np.zeros(shape=(ROWS, COLS), dtype=np.float32)

def is_valid_state(state):
    """Report whether ``state`` lies inside the grid.

    Args:
        state: A ``(row, col)`` pair.

    Returns:
        True when ``0 <= row < ROWS`` and ``0 <= col < COLS``, otherwise False.
    """
    r, c = state
    return 0 <= r < ROWS and 0 <= c < COLS

# Perform Value Iteration.
# Each sweep updates every non-goal cell in place with
#     U(s) = reward(s) + max over actions of
#            PROB_ACTION * U(intended) + PROB_PERP_ACTION * (U(perp_1) + U(perp_2))
# and the sweeps repeat until the largest change seen in a sweep is <= epsilon.
# NOTE(review): REWARD_GOAL is never applied here -- the goal cell is skipped, so
# its utility keeps whatever value it had before the loop. Confirm this is intended.
delta = float('inf')  # Initialize delta to a large value so the loop is entered
epsilon = 0.0001  # Convergence threshold
while delta > epsilon:
    delta = 0  # Largest absolute utility change observed during this sweep
    for row in range(ROWS):
        for col in range(COLS):
            state = (row, col)
            if state == GOAL:
                continue  # Skip the goal state
            elif state == RED_STATE:
                reward = PENALTY_RED_STATE
            elif WORLD[row][col] == 0:
                reward = STEP_COST
            else:
                # Non-zero WORLD cells other than RED_STATE carry no step cost.
                reward = 0

            max_action_utility = float('-inf')
            for action in ACTIONS:
                row_n, col_n = state  # Next state after taking action
                if action == 'UP':
                    row_n -= 1
                elif action == 'DOWN':
                    row_n += 1
                elif action == 'LEFT':
                    col_n -= 1
                elif action == 'RIGHT':
                    col_n += 1

                # Actions whose intended target is off the grid are not considered.
                if is_valid_state((row_n, col_n)):
                    # Contribution of the intended move
                    utility = PROB_ACTION * utilities[row_n][col_n]

                    # The two perpendicular slips: left/right of a vertical move,
                    # above/below a horizontal move.
                    row_p, col_p = state  # Perpendicular state 1
                    row_pp, col_pp = state  # Perpendicular state 2
                    if action == 'UP' or action == 'DOWN':
                        col_p -= 1
                        col_pp += 1
                    elif action == 'LEFT' or action == 'RIGHT':
                        row_p -= 1
                        row_pp += 1

                    # A perpendicular slip that would leave the grid contributes
                    # nothing (its probability mass is dropped).
                    if is_valid_state((row_p, col_p)):
                        utility += PROB_PERP_ACTION * utilities[row_p][col_p]
                    if is_valid_state((row_pp, col_pp)):
                        utility += PROB_PERP_ACTION * utilities[row_pp][col_pp]

                    # Update max action utility if necessary
                    if utility > max_action_utility:
                        max_action_utility = utility

            # Update the utility value for the current state.
            # Fix: the original referenced the undefined name `max_action`.
            prev_utility = utilities[row][col]
            utilities[row][col] = reward + max_action_utility

            # Fix: track the largest change so the while-condition reflects
            # convergence; previously delta stayed 0 and the loop stopped after
            # a single sweep. The stored (float32) value is compared so rounding
            # in the array cannot keep delta artificially above epsilon.
            delta = max(delta, abs(float(utilities[row][col]) - float(prev_utility)))


In [None]:
import numpy as np

# --- Grid-world configuration -------------------------------------------------
# Grid dimensions and the action labels used by the value-iteration loop.
ROWS, COLS = 3, 4
ACTIONS = ['UP', 'DOWN', 'LEFT', 'RIGHT']

# Special cells, expressed as (row, col) pairs.
START, GOAL, RED_STATE = (0, 0), (2, 3), (1, 1)

# Rewards / costs.
STEP_COST = -0.04
REWARD_GOAL = 1
PENALTY_RED_STATE = -1

# Transition model: probability of the chosen action, and of each of the two
# perpendicular actions.
PROB_ACTION = 0.7
PROB_PERP_ACTION = 0.15

# --- Grid layout --------------------------------------------------------------
# One float32 entry per cell, laid out as ROWS x COLS.
_WORLD_LAYOUT = [
    [0, 0, 0, 1],
    [0, -1, 0, -1],
    [0, 0, 0, 0],
]
WORLD = np.array(_WORLD_LAYOUT, dtype=np.float32)
del _WORLD_LAYOUT  # keep the namespace identical to a direct np.array(...) call

# --- Utility table ------------------------------------------------------------
# Every cell starts with a utility of 0.
utilities = np.zeros(shape=(ROWS, COLS), dtype=np.float32)

def is_valid_state(state):
    """Report whether ``state`` lies inside the grid.

    Args:
        state: A ``(row, col)`` pair.

    Returns:
        True when ``0 <= row < ROWS`` and ``0 <= col < COLS``, otherwise False.
    """
    r, c = state
    return 0 <= r < ROWS and 0 <= c < COLS

# Perform Value Iteration
delta = float('inf')  # Initialize delta to a large value
epsilon = 0.0001  # Convergence threshold
iteration = 0
while delta > epsilon:
    delta = 0
    for row in range(ROWS):
        for col in range(COLS):
            state = (row, col)
            if state == GOAL:
                continue  # Skip the goal state
            elif state == RED_STATE:
                reward = PENALTY_RED_STATE
            elif WORLD[row][col] == 0:
                reward = STEP_COST
            else:
                reward = 0

            max_action_utility = float('-inf')
            for action in ACTIONS:
                row_n, col_n = state  # Next state after taking action
                if action == 'UP':
                    row_n -= 1
                elif action == 'DOWN':
                    row_n += 1
                elif action == 'LEFT':
                    col_n -= 1
                elif action == 'RIGHT':
                    col_n += 1

                # Check if next state is valid (within bounds of the grid)
                if is_valid_state((row_n, col_n)):
                    # Calculate the utility for the current action
                    utility = PROB_ACTION * utilities[row_n][col_n]

                    # Calculate the utilities for the two perpendicular actions
                    row_p, col_p = state  # Perpendicular state 1
                    row_pp, col_pp = state  # Perpendicular state 2
                    if action == 'UP' or action == 'DOWN':
                        col_p -= 1
                        col_pp += 1
                    elif action == 'LEFT' or action == 'RIGHT':
                        row_p -= 1
                        row_pp += 1

                    # Check if perpendicular states are valid (within bounds of the grid)
                    if is_valid_state((row_p, col_p)):
                        utility += PROB_PERP_ACTION * utilities[row_p][col_p]
                    if is_valid_state((row_pp, col_pp)):
                        utility += PROB_PERP_ACTION * utilities[row_pp][col_pp]

                    # Update max action utility if necessary
                    if utility > max_action_utility:
                        max_action_utility = utility

            # Update the utility value for the current state
            prev_utility = utilities[row][col]
            utilities[row][col] = reward +
