<a href="https://colab.research.google.com/github/sathwikreddykatla/tutorial/blob/master/RL_Assignment6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

class GridWorldMDP:
    def __init__(self, grid_size, start_state, terminal_states, rewards, actions, gamma=0.9):
        self.grid_size = grid_size
        self.start_state = start_state
        self.terminal_states = terminal_states
        self.rewards = rewards
        self.actions = actions
        self.gamma = gamma

        self.state_values = np.zeros(grid_size)
        self.policy = np.random.choice(len(actions), size=grid_size)

    def value_iteration(self, epsilon=0.01):
        while True:
            delta = 0
            for i in range(self.grid_size[0]):
                for j in range(self.grid_size[1]):
                    state = (i, j)
                    if state in self.terminal_states:
                        continue
                    old_value = self.state_values[i, j]
                    action_values = []
                    for action in self.actions:
                        next_state, reward = self.take_action(state, action)
                        action_value = reward + self.gamma * self.state_values[next_state[0], next_state[1]]
                        action_values.append(action_value)
                    self.state_values[i, j] = max(action_values)
                    delta = max(delta, abs(old_value - self.state_values[i, j]))
            if delta < epsilon:
                break

    def take_action(self, state, action):
        if state in self.terminal_states:
            return state, 0
        next_state = (state[0] + action[0], state[1] + action[1])
        if next_state[0] < 0 or next_state[0] >= self.grid_size[0] or next_state[1] < 0 or next_state[1] >= self.grid_size[1]:
            next_state = state
        reward = self.rewards.get(next_state, -1)
        return next_state, reward

    def find_optimal_policy(self):
        for i in range(self.grid_size[0]):
            for j in range(self.grid_size[1]):
                state = (i, j)
                if state in self.terminal_states:
                    continue
                action_values = []
                for action in self.actions:
                    next_state, reward = self.take_action(state, action)
                    action_value = reward + self.gamma * self.state_values[next_state[0], next_state[1]]
                    action_values.append((action, action_value))
                best_action = max(action_values, key=lambda x: x[1])[0]
                self.policy[i, j] = self.actions.index(best_action)

    def print_policy(self):
        for i in range(self.grid_size[0]):
            for j in range(self.grid_size[1]):
                state = (i, j)
                if state in self.terminal_states:
                    print('T', end=' ')
                else:
                    action = self.actions[self.policy[i, j]]
                    if action == (0, 1):
                        print('>', end=' ')
                    elif action == (0, -1):
                        print('<', end=' ')
                    elif action == (-1, 0):
                        print('^', end=' ')
                    elif action == (1, 0):
                        print('v', end=' ')
            print()

# Example Usage
grid_size = (4, 4)
start_state = (0, 0)
terminal_states = {(0, 3): 1, (3, 3): -1}  # terminal_states: {state: reward}
rewards = {}
actions = [(0, 1), (0, -1), (-1, 0), (1, 0)]  # right, left, up, down

grid_world = GridWorldMDP(grid_size, start_state, terminal_states, rewards, actions)
grid_world.value_iteration()
grid_world.find_optimal_policy()

print("State Values:")
print(grid_world.state_values)
print("\nOptimal Policy:")
grid_world.print_policy()


State Values:
[[-2.71  -1.9   -1.     0.   ]
 [-3.439 -2.71  -1.9   -1.   ]
 [-3.439 -2.71  -1.9   -1.   ]
 [-2.71  -1.9   -1.     0.   ]]

Optimal Policy:
> > > T 
> > > ^ 
> > > v 
> > > T 
