In [None]:
import numpy as np

In [None]:
# Value estimate for every capital level 0..100; updated in place by
# run_value_iteration.
capital = np.zeros(shape=(101,), dtype=float)
# Stake selected for every capital level; overwritten by
# output_deterministic_policy.
policy = np.ones(shape=(101,), dtype="int32")

# Probability that a single coin flip comes up heads.
p_coin_head = 0.55

In [None]:
# 2.
# Value Iteration
# Loop:
#   Δ ← 0
#   Loop for each s ∈ S:
#     v ← V(s)
#     V(s) ← max_a ∑_{s´,r} p(s´, r | s, a)[r + γV(s´)]
#     Δ ← max(Δ, |v - V(s)|)
# until Δ < θ (a small positive number determining the accuracy of estimation)

def compute_state_value(state: int, p_coin_head: float, values=None) -> float:
    """Return the one-step Bellman optimality backup for ``state``.

    Every legal stake ``1 .. min(state, goal - state)`` is evaluated and the
    largest expected value ``sum_{s', r} p(s', r | s, a) * [r + gamma * V(s')]``
    over the two coin outcomes is returned.

    Args:
        state: Current capital, used as an index into the value table.
        p_coin_head: Probability that the coin comes up heads, i.e. that the
            stake is won.
        values: Value table indexed by capital; its last index is treated as
            the goal. When omitted, the module-level ``capital`` array is
            used, so existing two-argument calls keep working.

    Returns:
        The stored table entry for the two terminal states (0 and the goal),
        otherwise the maximum backed-up value over all stakes.
    """
    if values is None:
        # Backward-compatible default: read the shared module-level table.
        values = capital

    gamma = 1
    goal = len(values) - 1

    # Terminal states are never backed up; their stored value is returned.
    if state in (0, goal):
        return values[state]

    best_value = np.float64('-inf')

    for action in range(1, min(state, goal - state) + 1):
        state_if_coin_head = state + action
        state_if_coin_tail = state - action

        # Only a winning flip can land on the goal: a losing flip moves to
        # state - action, which is always below it, so it never earns a reward.
        reward_if_coin_head = 1 if state_if_coin_head == goal else 0

        state_value = p_coin_head * (reward_if_coin_head + gamma * values[state_if_coin_head]) \
            + (1 - p_coin_head) * (gamma * values[state_if_coin_tail])

        # Strict comparison keeps the first stake that reaches the maximum.
        if state_value > best_value:
            best_value = state_value

    return best_value
    

def run_value_iteration(p_coin_head):
    """Sweep the global ``capital`` table until the values stop changing.

    Each sweep visits the states in index order and overwrites
    ``capital[state]`` in place with ``compute_state_value(state, p_coin_head)``,
    so later states in the same sweep already see the updated entries. The
    largest absolute change of a sweep is printed, and sweeping stops once it
    falls below ``theta``.

    Args:
        p_coin_head: Probability of heads, forwarded to ``compute_state_value``.
    """
    theta = 1e-9
    iteration = 0
    converged = False

    while not converged:
        iteration += 1
        delta = 0.0

        for state in range(len(capital)):
            # Remember the value before it is overwritten to measure the change.
            previous = capital[state]
            capital[state] = compute_state_value(state, p_coin_head)
            delta = max(delta, abs(previous - capital[state]))

        print(f"After iteration {iteration} - delta = {delta}")
        converged = delta < theta

    print(f"Converged after iteration {iteration}")

In [None]:
def output_deterministic_policy():
    """Fill the global ``policy`` array with a greedy stake for every state.

    For each capital level the expected value of every legal stake is computed
    from the global ``capital`` table and the module-level ``p_coin_head``.
    A stake only replaces the current best when it is better by more than
    ``epsilon``, so among near-equal stakes the smallest one is kept. States
    with no legal stake (0 and 100) receive a stake of 0.
    """
    global capital
    global policy

    gamma = 1
    epsilon = 1e-9

    for state in range(len(capital)):
        best_value = np.float64('-inf')
        best_stake = 0
        largest_stake = min(state, 100 - state)

        for stake in range(1, largest_stake + 1):
            win_state = state + stake
            lose_state = state - stake
            win_reward = int(win_state == 100)
            lose_reward = int(lose_state == 100)

            expected_if_win = p_coin_head * (win_reward + gamma * capital[win_state])
            expected_if_lose = (1 - p_coin_head) * (lose_reward + gamma * capital[lose_state])
            stake_value = expected_if_win + expected_if_lose

            # Require a margin of epsilon so floating-point noise cannot
            # promote a larger stake over an equally good smaller one.
            if stake_value > best_value + epsilon:
                best_value, best_stake = stake_value, stake

        policy[state] = best_stake


In [None]:
import matplotlib.pyplot as plt

def plot_value_function(capital, p_coin_head):
    """Plot the value estimate for every capital level.

    Args:
        capital: Sequence of value estimates indexed by capital; the x-axis
            spans ``0 .. len(capital) - 1``.
        p_coin_head: Probability of heads; only shown in the figure title.
    """
    states = np.arange(len(capital))

    fig, ax = plt.subplots(figsize=(10, 6))

    # The legend entry for the value function belongs to the curve itself.
    ax.plot(states, capital, linewidth=2, label='Final value function')
    # Dashed reference line at the largest value drawn on the y-axis.
    ax.axhline(y=1.0, color='r', linestyle='--', alpha=0.3, label='Value = 1')

    ax.set_xlabel('Capital', fontsize=12)
    ax.set_ylabel('Value estimates', fontsize=12)
    ax.set_title(f"Value Function for Gambler's Problem (p_h = {p_coin_head})", fontsize=14)
    ax.grid(True, alpha=0.3)
    ax.set_xlim(0, len(capital) - 1)
    ax.set_ylim(0, 1)

    ax.legend()
    fig.tight_layout()
    plt.show()

def plot_policy(policy, p_coin_head):
    """Draw the stake chosen at every capital level as a bar chart.

    Args:
        policy: Stakes indexed by capital 0..100 (101 entries).
        p_coin_head: Probability of heads; only shown in the figure title.
    """
    capital_levels = np.arange(0, 101)
    tick_positions = [1, 25, 50, 75, 99]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(capital_levels, policy, width=1.0, edgecolor='black', linewidth=0.5)

    ax.set_xlabel('Capital', fontsize=12)
    ax.set_ylabel('Final policy (stake)', fontsize=12)
    ax.set_title(f"Final Policy for Gambler's Problem (p_h = {p_coin_head})", fontsize=14)

    ax.set_xlim(0, 100)
    ax.set_xticks(tick_positions)
    # Horizontal grid lines only, so stake heights are easy to read off.
    ax.grid(True, alpha=0.3, axis='y')

    fig.tight_layout()
    plt.show()

In [None]:
run_value_iteration(p_coin_head)
output_deterministic_policy()

# Show the goal state with value 1 in the plot, but patch a copy: writing 1.0
# into the shared table would make a re-run of this cell count the goal twice
# (reward 1 plus a terminal value of 1) and push the estimates above 1.
value_estimates = capital.copy()
value_estimates[-1] = 1.0
plot_value_function(value_estimates, p_coin_head)
plot_policy(policy, p_coin_head)