In [3]:
# Reset the value table: every state starts from an estimate of zero.
V = dict.fromkeys(
    (
        "Facebook",
        "Class 1",
        "Class 2",
        "Class 3",
        "Pub",
        "Pass",
        "Sleep",  # terminal state
    ),
    0,
)

gamma = 0.9        # discount factor multiplied into successor-state values
threshold = 0.001  # convergence threshold on the largest per-sweep change

# Value iteration
def value_iteration(V, gamma, threshold):
    """Sweep the hard-coded expected-value backups until the estimates settle.

    Every sweep recomputes the value of each state except "Sleep" from the
    estimates of the previous sweep (all updates in a sweep read the same
    snapshot), using the transition probabilities and rewards written out in
    the body. "Sleep" is never reassigned, so it keeps the value it was given.

    Args:
        V: dict mapping the state names "Facebook", "Class 1", "Class 2",
            "Class 3", "Pub", "Pass" and "Sleep" to initial value estimates.
            The dict is left untouched, so the same initial table can be
            reused for another run.
        gamma: discount factor multiplied into successor-state values.
        threshold: sweeps continue while the largest absolute change of any
            state's value in the last sweep is greater than this number.

    Returns:
        A new dict with the same keys holding the final estimates.

    Raises:
        KeyError: if V lacks one of the state names listed above (only when
            at least one sweep runs).
    """
    # Work on a private copy: writing into the caller's dict would make a
    # second call silently resume from already-converged values.
    values = dict(V)
    delta = float('inf')
    while delta > threshold:  # keep sweeping while the last sweep still moved a value by more than threshold
        previous = values.copy()  # snapshot read by every update in this sweep
        values["Facebook"] = 0.9 * (-1 + gamma * previous["Facebook"]) + 0.1 * (-1 + gamma * previous["Class 1"])
        values["Class 1"] = 0.5 * (-2 + gamma * previous["Class 2"]) + 0.5 * (-2 + gamma * previous["Facebook"])
        values["Class 2"] = 0.2 * (-2 + gamma * previous["Sleep"]) + 0.8 * (-2 + gamma * previous["Class 3"])
        values["Class 3"] = 0.6 * (-2 + gamma * previous["Pass"]) + 0.4 * (-2 + gamma * previous["Pub"])
        values["Pub"] = 0.2 * (1 + gamma * previous["Class 1"]) + 0.4 * (1 + gamma * previous["Class 2"]) + 0.4 * (1 + gamma * previous["Class 3"])
        values["Pass"] = 10 + gamma * previous["Sleep"]
        # Largest absolute change of any state's value in this sweep.
        delta = max(abs(values[s] - previous[s]) for s in values)
    return values

# Run the iteration on the all-zero table defined above; the bare name on the
# last line makes the notebook display the resulting dict.
V_final = value_iteration(V, gamma, threshold)
V_final


{'Facebook': -7.631949946209425,
 'Class 1': -5.009590059631876,
 'Class 2': 0.9429949774771363,
 'Class 3': 4.0874269397907526,
 'Pub': 1.9093614225570108,
 'Pass': 10.0,
 'Sleep': 0}

In [1]:
import numpy as np

# State space; 'End' has no entry in `actions`, so it offers no actions.
states = ['St1', 'St2', 'St3', 'St4', 'End']

# Actions available in each state.
actions = dict(
    St1=['ToSt1', 'ToSt2'],
    St2=['ToSt1', 'ToSt3'],
    St3=['ToEnd', 'ToSt4'],
    St4=['ToEnd', 'ToPub'],
)

# Reward for each (state, action) pair, flattened from a per-state table.
R = {
    (origin, move): reward
    for origin, rewards_by_move in {
        'St1': {'ToSt1': -1, 'ToSt2': 0},
        'St2': {'ToSt1': -1, 'ToSt3': -2},
        'St3': {'ToEnd': 0, 'ToSt4': -2},
        'St4': {'ToEnd': 10, 'ToPub': 1},
    }.items()
    for move, reward in rewards_by_move.items()
}

# Transition model P(s' | s, a). Seven pairs lead to a single successor with
# probability 1.0 ...
P = {
    pair: {successor: 1.0}
    for pair, successor in {
        ('St1', 'ToSt1'): 'St1',
        ('St1', 'ToSt2'): 'St2',
        ('St2', 'ToSt1'): 'St1',
        ('St2', 'ToSt3'): 'St3',
        ('St3', 'ToEnd'): 'End',
        ('St3', 'ToSt4'): 'St4',
        ('St4', 'ToEnd'): 'End',
    }.items()
}
# ... and ('St4', 'ToPub') is the only pair spread over several successors.
P[('St4', 'ToPub')] = {'St2': 0.2, 'St3': 0.4, 'St4': 0.4}

gamma = 0.9

# Q table: one zero entry per (state, action) pair that `actions` lists.
Q = {
    (state, action): 0
    for state in states
    for action in actions.get(state, [])
}

# Iterate and update Q
def update_q_values(Q, R, P, gamma, states, actions, max_iterations=1000, tolerance=1e-6):
    """Repeatedly back up a tabular Q function until it stops changing.

    In each sweep every (state, action) pair that has a reward in R is set to
    R[pair] + gamma * sum(P[pair][s'] * max_a Q[(s', a)]), where the max is
    taken over the previous sweep's table and counts as 0 for a successor
    that has no actions.

    Args:
        Q: dict mapping (state, action) to the current estimate. It is copied
            at the start of every sweep, not written to.
        R: dict mapping (state, action) to the immediate reward.
        P: dict mapping (state, action) to a dict {next_state: probability}.
        gamma: discount factor multiplied into the expected successor value.
        states: iterable of state names to sweep over.
        actions: dict mapping a state to the list of its actions; states
            without an entry are treated as having none.
        max_iterations: upper bound on the number of sweeps.
        tolerance: sweeping stops early once the largest absolute change in a
            sweep is below this.

    Returns:
        The table produced by the last sweep (Q itself if no sweep ran).
    """

    def best_value(table, state):
        # Value of acting greedily in `state`; 0 when it offers no actions.
        options = actions.get(state)
        if not options:
            return 0
        return max(table[(state, a)] for a in options)

    for _ in range(max_iterations):
        largest_change = 0
        updated = Q.copy()
        for state in states:
            for action in actions.get(state, []):
                pair = (state, action)
                if pair not in R:
                    continue  # pairs without a reward keep their old estimate
                successors = P[pair]
                expected_value = 0
                for next_state in successors:
                    # Successor values come from Q, the previous sweep's table.
                    expected_value += successors[next_state] * best_value(Q, next_state)
                updated[pair] = R[pair] + gamma * expected_value
                largest_change = max(largest_change, abs(updated[pair] - Q[pair]))
        Q = updated
        if largest_change < tolerance:
            break
    return Q

# Rebind Q to the table returned by update_q_values.
Q = update_q_values(Q, R, P, gamma, states, actions)

# Print one line per (state, action) pair; a state with no entry in `actions`
# prints nothing.
for state in states:
    for action in actions.get(state, []):
        print(f"Q({state}, {action}) = {Q[(state, action)]}")


Q(St1, ToSt1) = 2.483
Q(St1, ToSt2) = 3.87
Q(St2, ToSt1) = 2.483
Q(St2, ToSt3) = 4.3
Q(St3, ToEnd) = 0.0
Q(St3, ToSt4) = 7.0
Q(St4, ToEnd) = 10.0
Q(St4, ToPub) = 7.894
