In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random

class GridWorld:
    def __init__(self, width, height, start, goal, obstacles):
        self.width = width
        self.height = height
        self.start = start
        self.goal = goal
        self.obstacles = obstacles
        self.grid = np.zeros((height, width))
        for obstacle in obstacles:
            self.grid[obstacle] = -1  # Mark obstacles

    def is_terminal(self, state):
        return state == self.goal

    def get_possible_actions(self, state):
        actions = []
        x, y = state
        if x > 0 and self.grid[y, x-1] != -1:
            actions.append((-1, 0))  # Left
        if x < self.width - 1 and self.grid[y, x+1] != -1:
            actions.append((1, 0))   # Right
        if y > 0 and self.grid[y-1, x] != -1:
            actions.append((0, -1))  # Up
        if y < self.height - 1 and self.grid[y+1, x] != -1:
            actions.append((0, 1))   # Down
        return actions

    def transition(self, state, action):
        return (state[0] + action[0], state[1] + action[1])

def rtdp(grid_world, max_trials=1000, discount_factor=0.9):
    V = np.zeros((grid_world.height, grid_world.width))  # Value function
    policy = np.full((grid_world.height, grid_world.width), None)  # Policy

    for _ in range(max_trials):
        state = grid_world.start
        while not grid_world.is_terminal(state):
            actions = grid_world.get_possible_actions(state)
            if not actions:
                break
            action = random.choice(actions)
            next_state = grid_world.transition(state, action)

            # Reward mechanism
            reward = 0 if next_state == grid_world.goal else -1

            # Update value function
            V[state[1], state[0]] = reward + discount_factor * V[next_state[1], next_state[0]]

            # Update policy
            policy[state[1], state[0]] = action
            state = next_state

    return V, policy

def visualize_policy(grid_world, policy):
    direction_map = {
        (-1, 0): '<',
        (1, 0): '>',
        (0, -1): '^',
        (0, 1): 'v'
    }
    for y in range(grid_world.height):
        for x in range(grid_world.width):
            if (x, y) in grid_world.obstacles:
                print('X', end=' ')
            elif (x, y) == grid_world.goal:
                print('G', end=' ')
            elif policy[y, x] is None:
                print('.', end=' ')
            else:
                print(direction_map[policy[y, x]], end=' ')
        print()

# Example usage
width, height = 5, 5
start = (0, 0)
goal = (4, 4)
obstacles = [(1, 1), (2, 2), (3, 3)]
grid_world = GridWorld(width, height, start, goal, obstacles)
V, policy = rtdp(grid_world)
print("Value Function:")
print(V)
print("Policy:")
visualize_policy(grid_world, policy)


Value Function:
[[ -9.74968445  -9.81751996 -10.          -9.99999088  -7.45813417]
 [ -9.93637315   0.          -9.0152291   -9.99915359  -8.78423345]
 [ -9.99140496  -8.49905365   0.          -8.78423345  -4.0951    ]
 [ -9.81751996  -9.52898713  -1.9          0.           0.        ]
 [ -4.0951      -6.5132156   -1.           0.           0.        ]]
Policy:
v < < > v 
v X > v v 
v < X > v 
v > v X v 
> ^ > > G 
