<a href="https://colab.research.google.com/github/rishabhsingroha/RL/blob/main/exp7_21csu454.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

# MDP setup: 4 states (described as a 2x2 grid world) and 4 actions
# (up, down, left, right).
num_states, num_actions = 4, 4

# Transition model, indexed as transition_probs[state, action, next_state].
# Every state shares the same per-action table: action `a` lands in state `a`
# with probability 0.8 and in each of the two neighbouring state indices
# (mod 4) with probability 0.1. Example: action 0 from any state gives
# [0.8, 0.1, 0.0, 0.1] over next states 0..3.
transition_probs = np.tile(
    np.array([
        [0.8, 0.1, 0.0, 0.1],
        [0.1, 0.8, 0.1, 0.0],
        [0.0, 0.1, 0.8, 0.1],
        [0.1, 0.0, 0.1, 0.8],
    ]),
    (num_states, 1, 1),
)

# Reward model, indexed as rewards[state, action, next_state].
# Every transition costs the same: a reward of -1.
rewards = np.full((num_states, num_actions, num_states), -1)

# Starting point for policy iteration: one uniformly random action per state
policy = np.random.randint(low=0, high=num_actions, size=num_states)

# Discount factor applied to future rewards
gamma = 0.9

# Policy Iteration Algorithm
def policy_iteration(policy, num_states, num_actions, transition_probs, rewards, gamma, max_iterations=1000, theta=1e-6):
    """Run policy iteration and return the resulting policy.

    Alternates two steps until the policy stops changing or
    ``max_iterations`` rounds have been performed:

    1. Policy evaluation: repeatedly apply the Bellman expectation backup
       for the action the current policy selects in each state, until the
       largest change of any state value within one sweep is below ``theta``.
    2. Policy improvement: make the policy greedy with respect to the
       evaluated state values.

    Args:
        policy: Sequence of length ``num_states``; ``policy[s]`` is the action
            index taken in state ``s``. It is updated in place.
        num_states: Number of states.
        num_actions: Number of actions.
        transition_probs: NumPy array indexed ``[state, action, next_state]``
            holding transition probabilities.
        rewards: NumPy array indexed ``[state, action, next_state]`` holding
            the reward for each transition.
        gamma: Discount factor. With ``gamma >= 1`` the evaluation loop may
            not converge.
        max_iterations: Upper bound on evaluation/improvement rounds.
        theta: Convergence tolerance for policy evaluation.

    Returns:
        The ``policy`` object that was passed in, after improvement.
    """
    # State values must survive across evaluation sweeps: re-zeroing them on
    # every sweep makes the per-sweep change equal to the backup itself, so
    # the convergence test could never pass for non-zero rewards. Keeping them
    # across rounds also warm-starts the next evaluation.
    values = np.zeros(num_states)

    def expected_return(state, action):
        # One-step lookahead: expected immediate reward plus discounted value
        # of the next state, under the current value estimates.
        probs = transition_probs[state, action, :num_states]
        payoff = rewards[state, action, :num_states] + gamma * values
        return probs @ payoff

    for _ in range(max_iterations):
        # Policy Evaluation Step: back up only the action the policy chooses.
        while True:
            delta = 0.0
            for state in range(num_states):
                v = expected_return(state, policy[state])
                delta = max(delta, abs(v - values[state]))
                values[state] = v
            if delta < theta:
                break

        # Policy Improvement Step: act greedily on the evaluated values.
        policy_stable = True
        for state in range(num_states):
            old_action = policy[state]
            action_values = [expected_return(state, action) for action in range(num_actions)]
            # np.argmax resolves ties in favour of the lowest action index.
            policy[state] = np.argmax(action_values)
            if old_action != policy[state]:
                policy_stable = False
        if policy_stable:
            break

    return policy

# Run policy iteration on the MDP defined above, starting from the random policy
optimal_policy = policy_iteration(
    policy=policy,
    num_states=num_states,
    num_actions=num_actions,
    transition_probs=transition_probs,
    rewards=rewards,
    gamma=gamma,
)

# Report the resulting policy (one action index per state)
print("Optimal Policy:", optimal_policy)
