In [3]:
import numpy as np

# ================================
# POLICY ITERATION for the given MDP
# ================================

# State space: four ordinary states plus the goal state G.
states = ['S1', 'S2', 'S3', 'S4', 'G']
n_states = len(states)

# Action labels; they are the keys used inside the transition table below.
actions = ['a1', 'a2']

# Discount factor applied to the value of successor states.
gamma = 0.9

# Transition model: T[state][action] -> list of (next_state, probability).
# S4 only offers a1, and G has no outgoing actions at all.
T = {
    'S1': {'a1': [('S2', 0.8), ('S3', 0.2)],
           'a2': [('S3', 0.7), ('S4', 0.3)]},
    'S2': {'a1': [('S1', 0.5), ('S3', 0.4), ('G', 0.1)],
           'a2': [('S3', 0.9), ('S4', 0.1)]},
    'S3': {'a1': [('S2', 0.6), ('G', 0.4)],
           'a2': [('S4', 1.0)]},
    'S4': {'a1': [('G', 1.0)]},
    'G': {},
}

# Per-step reward of -2, i.e. every step costs 2.
reward = -2

# Arbitrary starting policy: pick a1 in every state except the goal.
policy = dict.fromkeys((s for s in states if s != 'G'), 'a1')

def policy_evaluation(policy, V, theta=1e-6):
    """Evaluate a fixed policy by iterating the Bellman expectation backup.

    Reads the module-level ``states``, ``T``, ``reward`` and ``gamma``.
    The state ``'G'`` is skipped, so its entry in ``V`` is never modified.
    Sweeps are done in place: a state's new value is stored immediately and
    is therefore visible to states processed later in the same sweep.

    Args:
        policy: Mapping from each non-goal state to the action taken there.
        V: Mapping from state to its current value estimate. It is updated
            in place and also returned.
        theta: Convergence threshold. Iteration stops after the first sweep
            in which the largest absolute change of any state value is
            below ``theta``.

    Returns:
        The same ``V`` mapping, holding the converged value estimates.
    """
    while True:
        delta = 0
        for s in states:
            if s == 'G':
                continue
            old_value = V[s]
            # Accumulate in a local and store once. Resetting V[s] to 0 and
            # summing into it would make a self-transition (s_next == s)
            # read the partial sum instead of the previous estimate.
            new_value = sum(
                p * (reward + gamma * V[s_next])
                for s_next, p in T[s][policy[s]]
            )
            V[s] = new_value
            delta = max(delta, abs(old_value - new_value))
        if delta < theta:
            break
    return V

def policy_improvement(V, policy, tol=1e-9):
    """Make the policy greedy with respect to the value function ``V``.

    Reads the module-level ``states``, ``T``, ``reward`` and ``gamma``.
    The state ``'G'`` is skipped. For every other state, each available
    action is scored by its one-step lookahead value and the best-scoring
    action is selected.

    The action already in the policy is kept whenever its value is within
    ``tol`` of the best one. Without this, a tie would always be resolved in
    favour of the first action in ``T[s]``, so an equally good policy would
    be reported as "changed", causing needless extra iterations or
    flip-flopping between equivalent policies.

    Args:
        V: Mapping from state to its current value estimate.
        policy: Mapping from each non-goal state to its current action.
            It is updated in place and also returned.
        tol: Non-negative margin by which another action must beat the
            current one before the policy switches to it.

    Returns:
        A ``(policy, policy_stable)`` tuple, where ``policy_stable`` is
        ``True`` only if no state's action changed.
    """
    policy_stable = True
    for s in states:
        if s == 'G':
            continue
        old_action = policy[s]
        # One-step lookahead value of every action available in s.
        action_values = {
            a: sum(p * (reward + gamma * V[s_next]) for s_next, p in outcomes)
            for a, outcomes in T[s].items()
        }
        # Rewards are negative, so the maximum value is the minimum cost.
        best_action = max(action_values, key=action_values.get)
        # Prefer the incumbent action on (near-)ties so that equally good
        # policies are not treated as an improvement.
        if (old_action in action_values
                and action_values[old_action] >= action_values[best_action] - tol):
            best_action = old_action
        policy[s] = best_action
        if old_action != best_action:
            policy_stable = False
    return policy, policy_stable

# ================================
# Main Policy Iteration Loop
# ================================
# Begin with a value estimate of zero for every state.
V = dict.fromkeys(states, 0)
iteration = 0
stable = False

# Alternate evaluation and improvement until policy_improvement reports
# that the policy is stable.
while not stable:
    iteration += 1
    V = policy_evaluation(policy, V)
    policy, stable = policy_improvement(V, policy)

# ================================
# Results
# ================================
print("Converged after", iteration, "iterations")

print("\nOptimal Policy π*:")
for state, action in policy.items():
    print(f"  {state}: {action}")

print("\nOptimal Value Function V*:")
for state, value in V.items():
    print(f"  {state}: {value:.3f}")


Converged after 2 iterations

Optimal Policy π*:
  S1: a2
  S2: a2
  S3: a2
  S4: a1

Optimal Value Function V*:
  S1: -4.934
  S2: -5.258
  S3: -3.800
  S4: -2.000
  G: 0.000
