# World

|       | reward + 1 | Penalty -1 |
|-------|------------|------------|
|       |            |            |
|       | wall       |            |
| start |            |            |

In [None]:
"""1. Write a code in Python implementing Value iteration for a grid world given in the
image above
Values you will require:
1. The reward for reaching the goal state = 1
2. The penalty for reaching the red state = -1
3. Step cost = -0.04
4. Probability of going in the direction of the action = 0.7
5. Probability of going in a direction perpendicular to the action = 0.15
Print the utility value of each cell in the grid after each iteration until the values
converge. (Assume the values converge when the difference between the utilities
for each cell is <= 0.0001)
Note: The agent does not change its state if it hits a wall or the boundaries"""



In [18]:
# latest

import numpy as np

# Define the grid world
WORLD = np.array([[0, 1, -1],
                  [0, 0, 0],
                  [0, float('-inf'), 0],
                  [float('-inf'), 0, 0]])
# print(WORLD)

# Define the action set
ACTIONS = ['UP', 'DOWN', 'LEFT', 'RIGHT']

# Define the probabilities for action and perpendicular action
PROB_ACTION = 0.7
PROB_PERP_ACTION = 0.15

# Define the rewards and penalties
REWARD_GOAL = 1
PENALTY_RED = -1
STEP_COST = -0.04

# Define the convergence threshold
EPSILON = 0.0001

# Function to check if a state is valid
def is_valid_state(state, world):
    row, col = state
    rows, cols = world.shape
    if row < 0 or row >= rows or col < 0 or col >= cols or world[row, col] == float('-inf'):
        return False
    return True

# Initialize utility values for each cell in the grid
utilities = np.zeros_like(WORLD)

# Perform Value Iteration
delta = float('inf')
while delta > EPSILON:
    delta = 0
    for row in range(WORLD.shape[0]):
        for col in range(WORLD.shape[1]):
            if not is_valid_state((row, col), WORLD):
                continue

            prev_utility = utilities[row, col]

            max_action_utility = float('-inf')
            for action in ACTIONS:
                next_row, next_col = row, col
                if action == 'UP':
                    next_row -= 1
                elif action == 'DOWN':
                    next_row += 1
                elif action == 'LEFT':
                    next_col -= 1
                elif action == 'RIGHT':
                    next_col += 1

                next_state = (next_row, next_col)

                # Calculate the estimated utility for the current action
                estimated_utility = 0
                for act in ACTIONS:
                    if act == action:
                        prob = PROB_ACTION
                    else:
                        prob = PROB_PERP_ACTION
                    next_row, next_col = row, col
                    if act == 'UP':
                        next_row -= 1
                    elif act == 'DOWN':
                        next_row += 1
                    elif act == 'LEFT':
                        next_col -= 1
                    elif act == 'RIGHT':
                        next_col += 1

                    next_state = (next_row, next_col)
                    if is_valid_state(next_state, WORLD):
                        estimated_utility += prob * utilities[next_row, next_col]
                    else:
                        estimated_utility += prob * prev_utility

                # Update the maximum action utility value
                max_action_utility = max(max_action_utility, estimated_utility)

            # Update the utility value for the current state
            if WORLD[row, col] == REWARD_GOAL:
                utilities[row, col] = REWARD_GOAL
            elif WORLD[row, col] == PENALTY_RED:
                utilities[row, col] = PENALTY_RED
            else:
                utilities[row, col] = STEP_COST + max_action_utility
    print(utilities)


[[-0.04       1.        -1.       ]
 [-0.046      0.6531     0.26717  ]
 [-0.0469     0.         0.147019 ]
 [ 0.        -0.04       0.0569133]]


In [6]:
import numpy as np

# Define the grid world
WORLD = np.array([[0, 1, -1],
                  [0, 0, 0],
                  [0, float('-inf'), 0],
                  ['x', 0, 0]])

# Define the action set
ACTIONS = ['UP', 'DOWN', 'LEFT', 'RIGHT']

# Define the probabilities for action and perpendicular action
PROB_ACTION = 0.7
PROB_PERP_ACTION = 0.15

# Define the rewards and penalties
REWARD_GOAL = 1
PENALTY_RED = -1
STEP_COST = -0.04

# Define the convergence threshold
EPSILON = 0.0001

# Function to check if a state is valid
def is_valid_state(state, world):
    """
    Check if a state is valid (within the bounds of the grid).
    Args:
        state (tuple): Current state (row, col) in the grid.
        world (numpy.ndarray): Grid world representation.
    Returns:
        bool: True if the state is valid, False otherwise.
    """
    row, col = state
    rows, cols = world.shape
    if row < 0 or row >= rows or col < 0 or col >= cols or world[row, col] == float('-inf'):
        return False
    return True

# Initialize utility values for each cell in the grid
utilities = np.zeros_like(WORLD)

# Set the start state
start_state = (3, 0)

# Perform Value Iteration
delta = float('inf')
while delta > EPSILON:
    delta = 0
    i = 0
    policy = np.empty_like(WORLD, dtype=np.object)
    for row in range(WORLD.shape[0]):
        for col in range(WORLD.shape[1]):
            if not is_valid_state((row, col), WORLD):
                policy[row, col] = 'WALL'
                continue

            prev_utility = utilities[row, col]

            max_action_utility = float('-inf')
            best_action = None
            estimated_utility = 0
            for act in ACTIONS:
                    if act == action:
                        prob = PROB_ACTION
                    else:
                        prob = PROB_PERP_ACTION
                    next_row, next_col = row, col
                    if act == 'UP':
                        next_row -= 1
                    elif act == 'DOWN':
                        next_row += 1
                    elif act == 'LEFT':
                        next_col -= 1
                    elif act == 'RIGHT':
                        next_col += 1

                    next_state = (next_row, next_col)
                    if is_valid_state(next_state, WORLD):
                        estimated_utility += prob * utilities[next_row, next_col]
                    else:
                        estimated_utility += prob * prev_utility

                # Update the maximum action utility value and the best action
            if estimated_utility > max_action_utility:
                    max_action_utility = estimated_utility
                    best_action = action
            policy[row, col] = best_action

            # Update the utility value for the current state
        if WORLD[row, col] == REWARD_GOAL:
                utilities[row, col] = REWARD_GOAL
        elif WORLD[row, col] == PENALTY_RED:
                utilities[row, col] = PENALTY_RED
        else:
                utilities[row, col] = STEP_COST + max_action_utility

            # Update the delta (maximum change in utility value)
        delta = max(delta, abs(prev_utility - utilities[row, col]))

    # Print the utility values after each iteration
        print("Iteration: ", i + 1)
        i += 1 
    print(utilities)

    # Check for convergence
    if delta <= EPSILON:
        print("Convergence achieved after", i + 1, "iterations.")
        break
'''
# Print the final policy
policy = np.empty_like(WORLD, dtype=np.object)
for row in range(WORLD.shape[0]):
    for col in range(WORLD.shape[1]):
        if not is_valid_state((row, col), WORLD):
            policy[row, col] = 'WALL'
        else:
            max_action_utility = float('-inf')
            best_action = None
            for action in ACTIONS:
                next_row, next_col = row, col
                if action == 'UP':
                    next_row -= 1
                elif action == 'DOWN':
                    next_row += 1
                elif action == 'LEFT':
                    next_col -= 1
                elif action == 'RIGHT':
                    next_col += 1

                next_state = (next_row, next_col)

                # Calculate the estimated utility for the current action
                estimated_utility = 0
                for act in ACTIONS:
                    if act == action:
                        prob = PROB_ACTION
                    else:
                        prob = PROB_PERP_ACTION
                    next_row, next_col = row, col
                    if act == 'UP':
                        next_row -= 1
                    elif act == 'DOWN':
                        next_row += 1
                    elif act == 'LEFT':
                        next_col -= 1
                    elif act == 'RIGHT':
                        next_col += 1

                    next_state = (next_row, next_col)
                    if is_valid_state(next_state, WORLD):
                        estimated_utility += prob * utilities[next_row, next_col]
                    else:
                        estimated_utility += prob * utilities[row, col]

                # Update the maximum action utility value and the best action
                if estimated_utility > max_action_utility:
                    max_action_utility = estimated_utility
                    best_action = action

            # Set the best action as the policy for the current state
            policy[row, col] = best_action
'''
# Print the final policy
print("Final Policy:")
print(policy)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  policy = np.empty_like(WORLD, dtype=np.object)


TypeError: can't multiply sequence by non-int of type 'float'