In [6]:
import numpy as np

In [7]:
def incremental_uniform(true_means, total_steps):
    """Sample the arms in strict round-robin order and track running means.

    Step ``t`` always pulls arm ``t % n_arms``; the reward is drawn from a
    normal distribution centred on that arm's true mean with a standard
    deviation of 1.0.

    Args:
        true_means: Sequence holding the true mean reward of each arm.
        total_steps: Number of pulls to perform in total.

    Returns:
        A ``(counts, averages)`` tuple of float arrays, one entry per arm:
        how many times the arm was pulled and the sample mean of its rewards.
    """
    num_arms = len(true_means)
    pulls = np.zeros(num_arms)
    estimates = np.zeros_like(pulls)

    for step in range(total_steps):
        idx = step % num_arms
        sample = np.random.normal(true_means[idx], 1.0)
        pulls[idx] += 1
        # Incremental mean: shift the estimate by 1/n of the prediction error.
        delta = (sample - estimates[idx]) / pulls[idx]
        estimates[idx] += delta

    return pulls, estimates

In [8]:
def epsilon_greedy(true_means, total_steps, epsilon=0.1):
    """Run an epsilon-greedy bandit simulation.

    On every step a uniform random number is drawn; if it is below
    ``epsilon`` a uniformly random arm is pulled, otherwise the arm with the
    highest current estimate is pulled (``np.argmax`` picks the lowest index
    on ties, so arm 0 is chosen while all estimates are still zero).
    Rewards are normal with the arm's true mean and standard deviation 1.0.

    Args:
        true_means: Sequence holding the true mean reward of each arm.
        total_steps: Number of pulls to perform in total.
        epsilon: Probability of exploring on any given step.

    Returns:
        A ``(counts, averages)`` tuple of float arrays, one entry per arm:
        how many times the arm was pulled and the sample mean of its rewards.
    """
    num_arms = len(true_means)
    pulls = np.zeros(num_arms)
    estimates = np.zeros_like(pulls)

    for _ in range(total_steps):
        # The exploration draw happens first on every step, then (only when
        # exploring) the random arm index is drawn.
        explore = np.random.random() < epsilon
        idx = np.random.randint(num_arms) if explore else np.argmax(estimates)

        sample = np.random.normal(true_means[idx], 1.0)
        pulls[idx] += 1
        # Incremental mean: shift the estimate by 1/n of the prediction error.
        delta = (sample - estimates[idx]) / pulls[idx]
        estimates[idx] += delta

    return pulls, estimates

In [9]:
def ucb1(true_means, total_steps, c=2.0):
    """Run a UCB1-style bandit simulation.

    Each arm is first pulled once (as far as the step budget allows) so that
    every count is non-zero. After that, every step pulls the arm maximising
    ``average + c * sqrt(2 * ln(t) / count)``, where ``t`` is the number of
    pulls made so far. Rewards are normal with the arm's true mean and
    standard deviation 1.0.

    Args:
        true_means: Sequence holding the true mean reward of each arm.
        total_steps: Total number of pulls allowed, warm-up pulls included.
        c: Multiplier applied to the exploration bonus.

    Returns:
        A ``(counts, averages)`` tuple of float arrays, one entry per arm:
        how many times the arm was pulled and the sample mean of its rewards.
        When ``total_steps`` is smaller than the number of arms, only the
        first ``total_steps`` arms are pulled and the rest stay at zero.
    """
    n_arms = len(true_means)
    counts = np.zeros(n_arms)
    averages = np.zeros(n_arms)

    # Warm-up: one pull per arm, capped at total_steps so the function never
    # performs more pulls than the caller asked for.
    for arm in range(min(n_arms, total_steps)):
        reward = np.random.normal(true_means[arm], 1.0)
        counts[arm] += 1
        averages[arm] = reward

    # This loop only runs when every arm has been pulled at least once, so
    # dividing by counts is safe here.
    for t in range(n_arms, total_steps):
        exploration_term = np.sqrt((2 * np.log(t)) / counts)
        ucb_values = averages + c * exploration_term

        arm = np.argmax(ucb_values)
        reward = np.random.normal(true_means[arm], 1.0)

        counts[arm] += 1
        # Incremental mean: shift the estimate by 1/n of the prediction error.
        averages[arm] += (reward - averages[arm]) / counts[arm]

    return counts, averages

In [10]:
def print_results(name, true_means, counts, averages):
    """Print a short report for one bandit run.

    Shows the per-arm pull counts (cast to int), the estimated means rounded
    to two decimals, and the arm with the highest estimate, flagged as
    'Correct' when it is also the arm with the highest true mean.

    Args:
        name: Label printed in the report header.
        true_means: Sequence holding the true mean reward of each arm.
        counts: NumPy array of per-arm pull counts.
        averages: Array of per-arm estimated mean rewards.
    """
    chosen = np.argmax(averages)
    verdict = "Correct" if chosen == np.argmax(true_means) else "Wrong"

    report = [
        f"\n--- {name} ---",
        f"Pulls:    {counts.astype(int)}",
        f"Expected: {np.round(averages, 2)}",
        f"Choice:   Arm {chosen} ({verdict})",
    ]
    print("\n".join(report))

In [11]:
# Fixed seed so that Restart Kernel -> Run All reproduces the same pulls and
# estimates; all strategies draw from NumPy's global random state.
SEED = 42
np.random.seed(SEED)

means = [1.5, 2.8, 5.2, 3.1, 0.9]
steps = 1000

# (report label, strategy function, strategy-specific keyword arguments)
strategies = [
    ("Round Robin", incremental_uniform, {}),
    ("Epsilon-Greedy", epsilon_greedy, {"epsilon": 0.1}),
    ("UCB1", ucb1, {"c": 2.0}),
]

# Run every strategy on the same arms with the same step budget and report it.
for label, strategy, kwargs in strategies:
    cnt, avg = strategy(means, steps, **kwargs)
    print_results(label, means, cnt, avg)


--- Round Robin ---
Pulls:    [200 200 200 200 200]
Expected: [1.55 2.76 5.1  3.19 0.97]
Choice:   Arm 2 (Correct)

--- Epsilon-Greedy ---
Pulls:    [ 20  48 871  44  17]
Expected: [1.34 2.76 5.21 2.85 1.25]
Choice:   Arm 2 (Correct)

--- UCB1 ---
Pulls:    [  4   7 977   8   4]
Expected: [1.29 2.52 5.18 2.71 1.06]
Choice:   Arm 2 (Correct)
