<a href="https://colab.research.google.com/github/somu-ncu/RL_21CSU409/blob/main/Experiment_8_Value_Iteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# Grid-world MDP setup: 11 usable cells on a 3x4 grid and 4 movement actions.
num_states = 11
num_actions = 4

# State space: every (row, column) cell of the 3x4 grid except (1, 1),
# which is excluded from the set of states.
states = [
    (row, col)
    for row in range(3)
    for col in range(4)
    if (row, col) != (1, 1)
]

# Reward attached to each terminal cell, keyed by (row, column).
rewards = {(0, 3): 1, (1, 3): -1}

# Define the transition probabilities
def transition_probabilities(state, action):
    """Return the outcomes of taking ``action`` in ``state``.

    The result is a list holding a single ``(next_state, probability)`` pair
    with probability 1.0, i.e. every move is deterministic.

    * A state that has an entry in the module-level ``rewards`` is terminal
      and always maps to itself, whatever the action.
    * Otherwise the agent moves one cell in the requested direction, clamped
      to rows 0..2 and columns 0..3.  If the resulting cell is not in the
      module-level ``states``, the agent stays where it is.

    Raises:
        ValueError: if ``action`` is not 'Up', 'Down', 'Left' or 'Right'
            (only checked for non-terminal states).
    """
    row, col = state

    # Terminal cells absorb every action, so the action is not inspected.
    if state in rewards:
        return [(state, 1.0)]

    # One lazily evaluated move per action; only the requested one is computed.
    moves = {
        'Up': lambda: (max(row - 1, 0), col),
        'Down': lambda: (min(row + 1, 2), col),
        'Left': lambda: (row, max(col - 1, 0)),
        'Right': lambda: (row, min(col + 1, 3)),
    }
    try:
        move = moves[action]
    except (KeyError, TypeError):
        # TypeError covers unhashable actions, which are just as invalid.
        raise ValueError("Invalid action") from None

    target = move()
    # Moving into a cell that is not a state leaves the agent in place.
    landed = target if target in states else state
    return [(landed, 1.0)]

# Discount factor: weight applied to the value of the next state when it is
# passed to value_iteration below.
gamma = 0.9

def value_iteration(states, rewards, gamma, theta, *, transition_fn=None,
                    verbose=True):
    """Estimate state values with in-place value iteration.

    Each sweep visits ``states`` in order and replaces a state's value with
    the best action value, where an action value is the probability-weighted
    sum of ``rewards.get(next_state, 0) + gamma * V[next_state]``.  Updates
    are applied immediately, so later states in the same sweep already see
    the new values.  States that have an entry in ``rewards`` are treated as
    terminal: they are never updated during the sweeps and are assigned their
    reward as their value once the sweeps have converged.

    Args:
        states: re-iterable collection of hashable states.  When ``verbose``
            is true each state must be a ``(row, column)`` pair of
            non-negative ints, because values are printed as a matrix.
        rewards: mapping from terminal state to the reward for entering it.
        gamma: discount factor applied to the next state's value.
        theta: convergence threshold; sweeping stops once the largest change
            made during a sweep is below it.  Must be positive.
        transition_fn: callable ``(state, action)`` returning an iterable of
            ``(next_state, probability)`` pairs.  Defaults to the
            module-level ``transition_probabilities``.
        verbose: when true (the default), print the value matrix after every
            sweep.

    Returns:
        dict mapping each state (and each key of ``rewards``) to its value.

    Raises:
        ValueError: if ``theta`` is not positive, since ``delta < theta``
            could then never hold and the loop would not terminate.
    """
    if theta <= 0:
        raise ValueError("theta must be positive")
    if transition_fn is None:
        # Looked up at call time so the module's own model stays the default.
        transition_fn = transition_probabilities

    actions = ('Up', 'Down', 'Left', 'Right')
    V = {state: 0 for state in states}

    if verbose:
        # Size the printed matrix from the states instead of assuming 3x4.
        n_rows = max((i for i, _ in V), default=-1) + 1
        n_cols = max((j for _, j in V), default=-1) + 1

    iteration = 0
    while True:
        delta = 0
        for state in states:
            if state in rewards:
                continue  # terminal states keep their value during sweeps
            previous = V[state]
            # All action values are computed before V[state] is overwritten.
            V[state] = max(
                sum(
                    prob * (rewards.get(next_state, 0)
                            + gamma * V.get(next_state, 0))
                    for next_state, prob in transition_fn(state, action)
                )
                for action in actions
            )
            delta = max(delta, abs(previous - V[state]))

        if verbose:
            # Print the current value matrix
            print(f"Iteration {iteration} - Value Matrix:")
            value_matrix = np.zeros((n_rows, n_cols))
            for (i, j), value in V.items():
                value_matrix[i][j] = value
            print(value_matrix)
        iteration += 1

        if delta < theta:
            break

    # Set the value of each terminal state to its reward after convergence
    for goal_state, reward in rewards.items():
        V[goal_state] = reward

    return V

# Convergence threshold passed to value_iteration: sweeping stops once the
# largest value change in a sweep falls below it.
theta = 0.0001

# Run value iteration
optimal_values = value_iteration(states, rewards, gamma, theta)

# Print the final value function, one state per line to three decimals
print("\nOptimal Value Function:")
for state, value in optimal_values.items():
    print(f"State {state}: {value:.3f}")

Iteration 0 - Value Matrix:
[[0.    0.    1.    0.   ]
 [0.    0.    0.9   0.   ]
 [0.    0.    0.81  0.729]]
Iteration 1 - Value Matrix:
[[0.    0.9   1.    0.   ]
 [0.    0.    0.9   0.   ]
 [0.    0.729 0.81  0.729]]
Iteration 2 - Value Matrix:
[[0.81   0.9    1.     0.    ]
 [0.729  0.     0.9    0.    ]
 [0.6561 0.729  0.81   0.729 ]]
Iteration 3 - Value Matrix:
[[0.81   0.9    1.     0.    ]
 [0.729  0.     0.9    0.    ]
 [0.6561 0.729  0.81   0.729 ]]

Optimal Value Function:
State (0, 0): 0.810
State (0, 1): 0.900
State (0, 2): 1.000
State (0, 3): 1.000
State (1, 0): 0.729
State (1, 2): 0.900
State (1, 3): -1.000
State (2, 0): 0.656
State (2, 1): 0.729
State (2, 2): 0.810
State (2, 3): 0.729
