In [None]:
# Value Iteration (runnable Python implementation)
def value_iteration(states, actions, transition_probabilities, rewards, discount_factor, epsilon):
    """Estimate state values by in-place value iteration.

    Each sweep replaces every state's value with the largest one-step
    expected return over ``actions``. Values updated earlier in a sweep are
    used by states visited later in the same sweep. Sweeping stops once the
    largest change seen in a sweep is below ``epsilon``.

    Args:
        states: Collection of states. It is traversed repeatedly, so it must
            be re-iterable (e.g. a list, not a generator).
        actions: Collection of actions; every action is tried in every state.
        transition_probabilities: Nested table indexed as
            ``[state][action][next_state]`` giving a probability. A successor
            whose lookup raises ``KeyError`` counts as probability 0.
        rewards: Nested table indexed as ``[state][action][next_state]``
            giving a reward. A successor whose lookup raises ``KeyError``
            counts as reward 0.
        discount_factor: Multiplier applied to the successor's value.
        epsilon: Positive convergence threshold on the per-sweep change.

    Returns:
        A dict mapping each state to its estimated value.

    Raises:
        ValueError: If ``epsilon`` is not positive, or if ``actions`` is
            empty while ``states`` is not (raised by ``max``).
        KeyError: If a ``[state][action]`` entry is missing from a table.
    """
    # delta is never negative, so `delta < epsilon` could not become true and
    # the loop below would spin forever. `not epsilon > 0` also rejects NaN.
    if not epsilon > 0:
        raise ValueError("epsilon must be positive")

    V = {state: 0 for state in states}

    def expected_return(state, action):
        """Return the one-step expected return of ``action`` in ``state``."""
        probs = transition_probabilities[state][action]
        payoffs = rewards[state][action]
        total = 0
        for next_state in states:
            # Plain indexing (not .get) keeps list/array-backed tables working;
            # only a missing dict key is treated as "no such transition".
            try:
                probability = probs[next_state]
            except KeyError:
                continue
            try:
                reward = payoffs[next_state]
            except KeyError:
                reward = 0
            total += probability * (reward + discount_factor * V[next_state])
        return total

    while True:
        delta = 0
        for state in states:
            previous = V[state]
            V[state] = max(expected_return(state, action) for action in actions)
            delta = max(delta, abs(previous - V[state]))

        if delta < epsilon:
            return V

# Policy Iteration (runnable Python implementation)
def policy_iteration(states, actions, transition_probabilities, rewards, discount_factor, epsilon):
    """Find a greedy policy by alternating evaluation and improvement.

    Starts from the policy that picks ``actions[0]`` everywhere. Each round
    evaluates the current policy with ``policy_evaluation`` and then, for
    every state, switches to another action only if that action's one-step
    expected return is strictly higher than the current action's. Keeping
    the current action on ties stops the loop from flipping between equally
    good actions. The loop ends when a round changes no state.

    Args:
        states: Collection of states; traversed repeatedly, so it must be
            re-iterable.
        actions: Indexable sequence of actions (``actions[0]`` seeds the
            initial policy).
        transition_probabilities: Nested table indexed as
            ``[state][action][next_state]`` giving a probability. In the
            improvement step a successor whose lookup raises ``KeyError``
            counts as probability 0.
        rewards: Nested table indexed as ``[state][action][next_state]``
            giving a reward. In the improvement step a successor whose
            lookup raises ``KeyError`` counts as reward 0.
        discount_factor: Multiplier applied to the successor's value.
        epsilon: Positive convergence threshold passed to
            ``policy_evaluation``.

    Returns:
        A dict mapping each state to the chosen action.

    Raises:
        ValueError: If ``epsilon`` is not positive.
        IndexError: If ``actions`` is empty while ``states`` is not.
        KeyError: If a ``[state][action]`` entry is missing from a table.

    Note:
        ``policy_evaluation`` receives the same tables and applies its own
        lookup rules to them.
    """
    # Fail fast rather than hand a threshold that can never be met to the
    # evaluation loop. `not epsilon > 0` also rejects NaN.
    if not epsilon > 0:
        raise ValueError("epsilon must be positive")

    policy = {state: actions[0] for state in states}

    def expected_return(state, action, values):
        """Return the one-step expected return of ``action`` under ``values``."""
        probs = transition_probabilities[state][action]
        payoffs = rewards[state][action]
        total = 0
        for next_state in states:
            # Plain indexing keeps list/array-backed tables working; only a
            # missing dict key is treated as "no such transition".
            try:
                probability = probs[next_state]
            except KeyError:
                continue
            try:
                reward = payoffs[next_state]
            except KeyError:
                reward = 0
            total += probability * (reward + discount_factor * values[next_state])
        return total

    while True:
        V = policy_evaluation(policy, states, actions, transition_probabilities, rewards, discount_factor, epsilon)
        policy_stable = True

        for state in states:
            old_action = policy[state]
            best_action = old_action
            best_value = expected_return(state, old_action, V)

            # Strict '>' means the incumbent wins ties; among strictly better
            # actions the earliest one in `actions` with the top value wins.
            for action in actions:
                candidate_value = expected_return(state, action, V)
                if candidate_value > best_value:
                    best_action, best_value = action, candidate_value

            if best_action != old_action:
                policy[state] = best_action
                policy_stable = False

        if policy_stable:
            return policy

def policy_evaluation(policy, states, actions, transition_probabilities, rewards, discount_factor, epsilon):
    """Estimate the state values of a fixed policy by in-place sweeps.

    Each sweep replaces every state's value with the one-step expected
    return of the action ``policy`` prescribes there. Values updated earlier
    in a sweep are used by states visited later in the same sweep. Sweeping
    stops once the largest change seen in a sweep is below ``epsilon``.

    Args:
        policy: Mapping from state to the action taken in that state.
        states: Collection of states; traversed repeatedly, so it must be
            re-iterable.
        actions: Unused here; kept so the signature stays unchanged for
            existing callers.
        transition_probabilities: Nested table indexed as
            ``[state][action][next_state]`` giving a probability. A successor
            whose lookup raises ``KeyError`` counts as probability 0.
        rewards: Nested table indexed as ``[state][action][next_state]``
            giving a reward. A successor whose lookup raises ``KeyError``
            counts as reward 0.
        discount_factor: Multiplier applied to the successor's value.
        epsilon: Positive convergence threshold on the per-sweep change.

    Returns:
        A dict mapping each state to its estimated value under ``policy``.

    Raises:
        ValueError: If ``epsilon`` is not positive.
        KeyError: If ``policy`` lacks a state, or a ``[state][action]`` entry
            is missing from a table.
    """
    # delta is never negative, so `delta < epsilon` could not become true and
    # the loop below would spin forever. `not epsilon > 0` also rejects NaN.
    if not epsilon > 0:
        raise ValueError("epsilon must be positive")

    V = {state: 0 for state in states}

    while True:
        delta = 0
        for state in states:
            previous = V[state]
            action = policy[state]
            probs = transition_probabilities[state][action]
            payoffs = rewards[state][action]

            updated = 0
            for next_state in states:
                # Plain indexing keeps list/array-backed tables working; only
                # a missing dict key is treated as "no such transition".
                try:
                    probability = probs[next_state]
                except KeyError:
                    continue
                try:
                    reward = payoffs[next_state]
                except KeyError:
                    reward = 0
                updated += probability * (reward + discount_factor * V[next_state])

            # Assign only after the full sum so the state's own old value is
            # what self-transitions see during this update.
            V[state] = updated
            delta = max(delta, abs(previous - updated))

        if delta < epsilon:
            break

    return V

# Example usage: a small MDP with three states and two actions.
states = [1, 2, 3]
actions = ['left', 'right']

# Indexed as transition_probabilities[state][action][next_state].
# NOTE(review): the entries for state 2 / 'left' add up to 1.1, not 1.0.
# They are left as-is because the intended values cannot be determined here
# and the results depend on them -- confirm and correct.
transition_probabilities = {
    1: {
        'left': {1: 0.8, 2: 0.2},
        'right': {1: 0.2, 2: 0.8},
    },
    2: {
        'left': {1: 0.2, 2: 0.8, 3: 0.1},
        'right': {1: 0.1, 2: 0.1, 3: 0.8},
    },
    3: {
        'left': {2: 0.1, 3: 0.9},
        'right': {2: 0.9, 3: 0.1},
    },
}

# Indexed as rewards[state][action][next_state].
rewards = {
    1: {
        'left': {1: -1, 2: 0},
        'right': {1: 0, 2: -1},
    },
    2: {
        'left': {1: -1, 2: 0, 3: 1},
        'right': {1: 0, 2: -1, 3: -1},
    },
    3: {
        'left': {1: 0, 2: 0, 3: 0},
        'right': {1: 0, 2: 0, 3: 0},
    },
}

discount_factor = 0.9
epsilon = 0.01

# Densify both tables: any (state, action, next_state) triple that was not
# listed above gets an explicit 0.0 so every lookup below succeeds.
for state in states:
    for action in actions:
        for next_state in states:
            transition_probabilities[state][action].setdefault(next_state, 0.0)
            rewards[state][action].setdefault(next_state, 0.0)

optimal_value_function = value_iteration(
    states, actions, transition_probabilities, rewards, discount_factor, epsilon
)
print("Optimal Value Function:", optimal_value_function)

optimal_policy = policy_iteration(
    states, actions, transition_probabilities, rewards, discount_factor, epsilon
)
print("Optimal Policy:", optimal_policy)


Optimal Value Function: {1: -2.6720074413196286, 2: -1.9432847688029105, 3: -0.8822035319541259}
Optimal Policy: {1: 'right', 2: 'right', 3: 'left'}
