In [1]:
import numpy as np

def create_gridworld(size=4, gamma=1.0, threshold=1e-4):
    """Evaluate the uniform-random policy on a ``size`` x ``size`` gridworld.

    States are numbered row-major from 0 (top-left) to ``size*size - 1``
    (bottom-right). The bottom-right state is terminal and keeps value 0.
    From every other state the agent picks uniformly among the moves
    (up, down, left, right) that stay on the grid, and every move yields a
    reward of -1.

    Note: despite the historical name, this does not build an environment
    object and it is not value iteration (there is no max over actions).
    It is iterative policy evaluation using synchronous sweeps: every
    sweep computes the new values from the previous sweep's values only.

    Args:
        size: Side length of the square grid. Must not be negative.
        gamma: Discount factor applied to the successor state's value.
        threshold: Sweeps stop once the largest absolute change of any
            state value within a sweep drops below this number. Must be
            positive, otherwise the loop could never terminate.

    Returns:
        A 1-D NumPy array of length ``size*size`` with the state values in
        row-major order.

    Raises:
        ValueError: If ``size`` is negative or ``threshold`` is not
            positive.
    """
    if size < 0:
        raise ValueError(f"size must be non-negative, got {size!r}")
    if not threshold > 0:
        raise ValueError(f"threshold must be positive, got {threshold!r}")

    n_states = size * size
    terminal = n_states - 1

    def in_bounds_neighbours(s):
        """Return the states reachable from ``s`` in one on-grid move."""
        row, col = divmod(s, size)
        neighbours = []
        # The order (up, down, left, right) fixes the floating-point
        # summation order in the backup below; keep it stable.
        if row > 0:
            neighbours.append(s - size)
        if row < size - 1:
            neighbours.append(s + size)
        if col > 0:
            neighbours.append(s - 1)
        if col < size - 1:
            neighbours.append(s + 1)
        return neighbours

    # The successor sets never change between sweeps, so compute them once
    # (the terminal state is excluded because it is never backed up).
    successors = {
        s: in_bounds_neighbours(s) for s in range(n_states) if s != terminal
    }

    values = np.zeros(n_states)

    while True:
        max_change = 0
        # Synchronous update: read from ``values``, write to ``updated``.
        updated = values.copy()

        for s, next_states in successors.items():
            # Each available move is taken with equal probability.
            prob = 1 / len(next_states)

            # Bellman expectation backup with reward -1 per move.
            backup = 0
            for next_s in next_states:
                backup += prob * (-1 + gamma * values[next_s])

            updated[s] = backup
            max_change = max(max_change, abs(updated[s] - values[s]))

        values = updated
        if max_change < threshold:
            break

    return values

# Run the algorithm
grid_size = 4  # single source of truth for the grid's side length
V = create_gridworld(grid_size)

# Print results in a grid format: one grid row per line, every value
# formatted to two decimals and followed by a tab (including the last one).
for grid_row in V.reshape(grid_size, grid_size):
    print("".join(f"{value:.2f}\t" for value in grid_row))

-44.57	-43.57	-41.21	-38.78	
-43.57	-41.93	-38.28	-34.35	
-41.21	-38.28	-31.64	-23.00	
-38.78	-34.35	-23.00	0.00	
