<a href="https://colab.research.google.com/github/somu-ncu/RL_21CSU409/blob/main/Practical_7_policy_and_value_iteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# Define a simple grid world environment as an example
class GridWorld:
    def __init__(self):
        self.nS = 16  # Number of states
        self.nA = 4   # Number of actions
        self.P = self._init_transitions()

    def _init_transitions(self):
        # Define the transition probabilities, rewards, and next states for each state-action pair
        P = {}
        for s in range(self.nS):
            P[s] = {a: [] for a in range(self.nA)}

        for s in range(self.nS):
            for a in range(self.nA):
                if s == 0:  # Terminal state
                    P[s][a] = [(1.0, s, 0, True)]
                else:
                    next_s, reward = self._take_action(s, a)
                    P[s][a] = [(1.0, next_s, reward, False)]

        return P

    def _take_action(self, state, action):
        # Define the state transitions and rewards for each action
        if state % 4 == 0:  # Leftmost column
            next_s = state
        else:
            next_s = state - 1

        if action == 0:  # Up
            next_s = max(next_s - 4, 0)
        elif action == 1:  # Right
            next_s = min(next_s + 1, 15)
        elif action == 2:  # Down
            next_s = min(next_s + 4, 15)

        if next_s == 15:  # Terminal state
            reward = 1
        else:
            reward = 0

        return next_s, reward

# Policy Iteration function (same as before)
def policy_iteration(env, gamma=0.9, max_iterations=1000):
    num_states = env.nS
    num_actions = env.nA

    # Initialize a random policy
    policy = np.ones((num_states, num_actions)) / num_actions

    for _ in range(max_iterations):
        # Policy Evaluation
        V = np.zeros(num_states)
        theta = 0.01

        while True:
            delta = 0
            for s in range(num_states):
                v = V[s]
                V[s] = sum(policy[s][a] * sum(p * (r + gamma * V[ns]) for (p, ns, r, _) in env.P[s][a]) for a in range(num_actions))
                delta = max(delta, abs(v - V[s]))

            if delta < theta:
                break

        # Policy Improvement
        policy_stable = True
        for s in range(num_states):
            old_action = np.argmax(policy[s])
            action_values = np.zeros(num_actions)

            for a in range(num_actions):
                action_values[a] = sum(p * (r + gamma * V[ns]) for (p, ns, r, _) in env.P[s][a])

            best_action = np.argmax(action_values)

            if old_action != best_action:
                policy_stable = False

            policy[s] = np.eye(num_actions)[best_action]

        if policy_stable:
            break

    return policy, V

# Example usage with the GridWorld environment
env = GridWorld()
policy, V = policy_iteration(env)
print("Optimal Policy:")
print(policy)
print("Optimal Value Function:")
print(V)


Optimal Policy:
[[1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]]
Optimal Value Function:
[0.         7.20272036 7.20272036 7.20272036 8.01272036 8.01272036
 8.01272036 8.01272036 8.91272036 8.91272036 8.91272036 8.91272036
 9.91272036 9.91272036 9.91272036 9.91272036]
