In [1]:
# Discount factor applied to future rewards.
gamma = 0.9

# State and action labels of the toy MDP.
states = ["A", "B", "T"]
actions = ["left", "right"]

# Transition model: P[state][action] -> (next_state, reward).
P = {
    "A": {"left": ("A", 0.0), "right": ("B", 0.0)},
    "B": {"left": ("A", 0.0), "right": ("T", 1.0)},
    "T": {},  # no action is available from the terminal position
}

# Initialize every state's value to 0.
V = dict.fromkeys(states, 0.0)

In [2]:
def value_iteration(P, states, actions, gamma=0.9, theta=1e-6, max_iters=1000):
    """Compute optimal state values by synchronous value iteration.

    Parameters
    ----------
    P : dict
        Transition model indexed as ``P[state][action] -> (next_state, reward)``.
        A state that is missing from ``P`` or maps to an empty dict is treated
        as terminal.
    states : sequence
        All states of the MDP; iterated once per sweep, so it must be
        re-iterable. Every ``next_state`` in ``P`` must appear here.
    actions : sequence
        Candidate actions; an action absent from ``P[state]`` is skipped.
    gamma : float
        Discount factor applied to the value of the next state.
    theta : float
        Convergence threshold: iteration stops once the largest value change
        within a sweep is below ``theta``.
    max_iters : int
        Upper bound on the number of sweeps.

    Returns
    -------
    dict
        Mapping ``state -> value``. Terminal states, and states for which none
        of ``actions`` is available, keep the initial value ``0.0``.
    """
    V = {s: 0.0 for s in states}

    for _ in range(max_iters):
        delta = 0.0
        # Back up from the previous sweep's values (V) into a fresh copy so
        # that updates within one sweep do not influence each other.
        new_V = V.copy()
        for s in states:
            # Skip terminal states: "T" by the notebook's convention, plus any
            # state with no outgoing transitions (absent from P or empty).
            if s == "T" or not P.get(s):
                continue

            transitions = P[s]
            # Value of each available action: reward + discounted next value.
            action_values = [
                reward + gamma * V[next_state]
                for next_state, reward in (
                    transitions[a] for a in actions if a in transitions
                )
            ]

            # None of the candidate actions is defined for this state; leave
            # its value unchanged rather than calling max() on an empty list.
            if not action_values:
                continue

            # Best action value becomes the new V(s).
            best_value = max(action_values)
            new_V[s] = best_value

            # Track the largest change for the convergence check.
            delta = max(delta, abs(best_value - V[s]))

        V = new_V
        if delta < theta:
            break

    return V

In [20]:
# Compute the optimal state values, then read off the greedy policy.
# The notebook-level `gamma` is passed explicitly so that the value function
# and the one-step backup below always use the same discount factor.
V_opt = value_iteration(P, states, actions, gamma=gamma)
print("Optimal State values: ", V_opt)

# Derive the greedy policy from V_opt: for each state pick the action with the
# highest one-step backup reward + gamma * V_opt[next_state].
policy = {}
for s in states:
    # The terminal state has no action to choose.
    if s == "T":
        policy[s] = None
        continue

    best_a = None
    best_q = float("-inf")
    for a in actions:
        if a not in P[s]:
            continue  # action not available in this state
        next_state, reward = P[s][a]
        print(f"Next_state {next_state} and reward {reward} for state {s} taking action {a}:")
        q_sa = reward + gamma*V_opt[next_state]
        print("q_sa: ",q_sa)
        # Strict comparison: on a tie the earlier action in `actions` wins.
        if q_sa > best_q:
            best_q = q_sa
            best_a = a
    policy[s] = best_a

print("Greedy policy:",policy)

Optimal State values:  {'A': 0.9, 'B': 1.0, 'T': 0.0}
Next_state A and reward 0.0 for state A taking action left:
q_sa:  0.81
Next_state B and reward 0.0 for state A taking action right:
q_sa:  0.9
Next_state A and reward 0.0 for state B taking action left:
q_sa:  0.81
Next_state T and reward 1.0 for state B taking action right:
q_sa:  1.0
Greedy policy: {'A': 'right', 'B': 'right', 'T': None}


In [14]:
# NOTE(review): overwrites the terminal state's entry with "left" even though
# P["T"] defines no actions. This mutates `policy` in place, so the dict no
# longer matches what the greedy-policy cell printed.
policy["T"] = "left"

In [15]:
# Display the policy dict after the manual override of the "T" entry.
policy

{'A': 'right', 'B': 'right', 'T': 'left'}

In [19]:
# Scratch check of dict overwrite semantics: every (state, action) pair assigns
# test_policy[s] = a, so each state's entry ends up holding the last action
# iterated for it. The dict is printed after every assignment.
test_policy = dict()
for s, a in ((s, a) for s in states for a in actions):
    test_policy[s] = a
    print("test_policy: ", test_policy)

test_policy:  {'A': 'left'}
test_policy:  {'A': 'right'}
test_policy:  {'A': 'right', 'B': 'left'}
test_policy:  {'A': 'right', 'B': 'right'}
test_policy:  {'A': 'right', 'B': 'right', 'T': 'left'}
test_policy:  {'A': 'right', 'B': 'right', 'T': 'right'}
