# 10-armed Testbed

In [1]:
import sys
import os
# Make the `src` directory under the current working directory importable.
# NOTE(review): presumably this is where `bandit.py` lives; it only resolves
# if the notebook is started from the directory that contains `src` — confirm.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'src')))
import numpy as np

from tqdm import trange
import matplotlib
import matplotlib.pyplot as plt

from bandit import Bandit
# Select the non-interactive Agg backend: the figures in this notebook are
# written to image files with plt.savefig instead of being shown in a window.
matplotlib.use('Agg')

In [3]:
def simulate(runs, times, bandits):
    """
    Play every bandit for a number of independent runs and average the results over those runs.

    Each run begins by calling ``bandit.initialize()``. At every time step the bandit picks an
    action with ``bandit.act()``, the reward returned by ``bandit.step(action)`` is recorded, and
    the step is marked as a hit when the chosen action equals ``bandit.optimal_action``.

    :param runs: Number of independent runs performed for each bandit
    :param times: Number of time steps in a single run
    :param bandits: Sequence of bandit objects exposing ``initialize``, ``act``, ``step`` and ``optimal_action``
    :return: Tuple ``(optimal_action_fraction, mean_reward)``; both arrays have shape
             ``(len(bandits), times)`` and hold, per bandit and time step, the average over all runs
    """
    # One cell per (bandit, run, time step); both logs share the same shape
    log_shape = (len(bandits), runs, times)
    reward_log = np.zeros(log_shape)
    optimal_hits = np.zeros(log_shape)

    for bandit_index, bandit in enumerate(bandits):
        # trange shows one progress bar per bandit, advancing once per run
        for run_index in trange(runs):
            # Every run starts from a freshly initialized bandit
            bandit.initialize()

            for step_index in range(times):
                chosen_action = bandit.act()
                reward_log[bandit_index, run_index, step_index] = bandit.step(chosen_action)

                # Flag the step when the optimal action was chosen; other steps keep their 0
                picked_optimal = chosen_action == bandit.optimal_action
                if picked_optimal:
                    optimal_hits[bandit_index, run_index, step_index] = 1

    # Average over the run axis so each bandit is left with one curve over time
    optimal_action_fraction = optimal_hits.mean(axis=1)
    mean_reward = reward_log.mean(axis=1)
    return optimal_action_fraction, mean_reward

## 1. Reward Distribution

In [7]:
# Plot an example reward distribution: 200 unit-variance samples for each of
# the 10 actions, with every action shifted by its own normally distributed mean.
# A dedicated, seeded generator makes the figure identical on every re-run and
# does not draw from NumPy's global random state.
example_rng = np.random.default_rng(0)
example_action_means = example_rng.standard_normal(10)
example_rewards = example_rng.standard_normal((200, 10)) + example_action_means

plt.violinplot(dataset=example_rewards)
plt.title("Figure 2.1")
plt.xlabel("Action")
plt.ylabel("Reward distribution")
plt.savefig("../generated_images/figure_2_1.png")
plt.close()

## 2. Greedy Action Selection VS ε-greedy Action Selection

In [8]:
# Exploration rates to compare: pure greedy (0) and two ε-greedy settings
epsilons = [0, 0.1, 0.01]

# One sample-average bandit per exploration rate, in the same order as `epsilons`
bandits = [
    Bandit(epsilon=exploration_rate, use_sample_averages=True)
    for exploration_rate in epsilons
]


In [9]:
# Experiment size: independent runs per bandit, and time steps per run
runs, times = 2000, 1000

# Run the experiment; keep both the optimal-action frequencies and the mean rewards
optimal_actions_counts, rewards_mean = simulate(runs=runs, times=times, bandits=bandits)

100%|██████████| 2000/2000 [01:02<00:00, 32.23it/s]
100%|██████████| 2000/2000 [01:04<00:00, 31.18it/s]
100%|██████████| 2000/2000 [01:15<00:00, 26.47it/s]


In [10]:
# Plotting
# Open a tall (10 x 20 inch) figure; it becomes pyplot's current figure.
# NOTE(review): the later plt.subplot / plt.savefig cells act on whatever figure
# is current at that point, so this relies on the figure staying open across
# cells — confirm that holds for the backend in use.
plt.figure(figsize = (10, 20))

<Figure size 1000x2000 with 0 Axes>

In [11]:
# Upper panel: average reward per step, one curve per exploration rate
reward_axes = plt.subplot(2, 1, 1)
for epsilon, reward_curve in zip(epsilons, rewards_mean):
    reward_axes.plot(reward_curve, label=rf"$\epsilon = {epsilon:.2f}$")
reward_axes.set_title("Figure 2.2")
reward_axes.set_xlabel("Steps")
reward_axes.set_ylabel("Average reward")
reward_axes.legend()

<matplotlib.legend.Legend at 0x11918d3db20>

In [12]:
# Lower panel: how often the optimal action was chosen, one curve per exploration rate.
# The plotted values are run-averaged 0/1 indicators, i.e. fractions between 0 and 1.
optimal_axes = plt.subplot(2, 1, 2)
for epsilon, optimal_curve in zip(epsilons, optimal_actions_counts):
    optimal_axes.plot(optimal_curve, label=rf"$\epsilon = {epsilon:.2f}$")
optimal_axes.set_xlabel("Steps")
optimal_axes.set_ylabel("% Optimal action")
optimal_axes.legend()

<matplotlib.legend.Legend at 0x119196a8770>

In [13]:
# Write the current figure to disk, then close it so later plots start from a clean figure
plt.savefig("../generated_images/figure_2_2.png")
plt.close()

## 3. Optimistic Initial Values VS Realistic Initial Values

In [14]:
# Two constant-step-size bandits (𝛼 = 0.1) that differ in exploration and initial estimates:
# 1. optimistic and greedy:       ε = 0,   𝑄_1(𝑎) = 5
# 2. realistic and ε-greedy:      ε = 0.1, 𝑄_1(𝑎) = 0
optimistic_greedy = Bandit(epsilon=0, initial_action_value_estimates=5, step_size=0.1)
realistic_epsilon_greedy = Bandit(epsilon=0.1, initial_action_value_estimates=0, step_size=0.1)

bandits = [optimistic_greedy, realistic_epsilon_greedy]

In [15]:
# Experiment size: independent runs per bandit, and time steps per run
runs, times = 2000, 1000

# Only the optimal-action frequencies are needed here; the mean rewards are discarded
optimal_actions_counts, _ = simulate(runs=runs, times=times, bandits=bandits)

100%|██████████| 2000/2000 [00:53<00:00, 37.10it/s]
100%|██████████| 2000/2000 [00:36<00:00, 54.89it/s]


In [16]:
# Plotting: optimal-action frequency over time for the two bandits.
# The labels are raw mathtext strings so that ε renders as the Greek letter and
# Q_1 with a subscript, matching the ε labels used in Figure 2.2.
plt.plot(optimal_actions_counts[0], label = r"$\epsilon = 0, Q_1 = 5$")
plt.plot(optimal_actions_counts[1], label = r"$\epsilon = 0.1, Q_1 = 0$")
plt.title("Figure 2.3")
plt.xlabel("Steps")
plt.ylabel("% Optimal action")
plt.legend()

# Write the figure to disk and close it so the next plot starts from a clean figure
plt.savefig("../generated_images/figure_2_3.png")
plt.close()

## 4. Upper-Confidence-Bound (UCB) Action Selection

In [17]:
# Two sample-average bandits:
# 1. UCB action selection: ε = 0, 𝑐 = 2
# 2. ε-greedy baseline:    ε = 0.1
ucb_bandit = Bandit(epsilon=0, confidence_level=2, use_sample_averages=True)
epsilon_greedy_bandit = Bandit(epsilon=0.1, use_sample_averages=True)

bandits = [ucb_bandit, epsilon_greedy_bandit]

In [18]:
# Experiment size: independent runs per bandit, and time steps per run
runs, times = 2000, 1000

# Only the mean rewards are needed here; the optimal-action frequencies are discarded
_, average_rewards = simulate(runs=runs, times=times, bandits=bandits)

100%|██████████| 2000/2000 [00:59<00:00, 33.77it/s]
100%|██████████| 2000/2000 [00:47<00:00, 42.33it/s]


In [None]:
# Plotting: average reward over time for UCB against ε-greedy.
# Only the symbols are wrapped in $...$; the word "-greedy" stays outside math
# mode so it is not typeset as a minus sign followed by italic variables.
plt.plot(average_rewards[0], label = "UCB $c = 2$")
plt.plot(average_rewards[1], label = r"$\epsilon$-greedy $\epsilon = 0.1$")
plt.title("Figure 2.4")
plt.xlabel("Steps")
plt.ylabel("Average reward")
plt.legend()

# Write the figure to disk and close it so the next plot starts from a clean figure
plt.savefig("../generated_images/figure_2_4.png")
plt.close()

## 5. Gradient Bandit Algorithms (GBA)

In [None]:
# Four gradient-bandit (GBA) configurations, all created with a true expected reward of 4.
# They are built in this order, which the plot labels rely on:
# 1. 𝛼 = 0.1, with the average-reward baseline
# 2. 𝛼 = 0.1, without the baseline
# 3. 𝛼 = 0.4, with the average-reward baseline
# 4. 𝛼 = 0.4, without the baseline
bandits = [
    Bandit(use_gradient=True, step_size=alpha, use_gradient_baseline=with_baseline, true_expected_reward=4)
    for alpha in (0.1, 0.4)
    for with_baseline in (True, False)
]

In [None]:
# Experiment size: independent runs per bandit, and time steps per run
runs, times = 2000, 1000

# Only the optimal-action frequencies are needed here; the mean rewards are discarded
optimal_actions_counts, _ = simulate(runs=runs, times=times, bandits=bandits)

In [None]:
# Legend entries, listed in the same order as the gradient bandits were created
labels = [
    r"$\alpha = 0.1$, with baseline ",
    r"$\alpha = 0.1$, without baseline",
    r"$\alpha = 0.4$, with baseline",
    r"$\alpha = 0.4$, without baseline",
]

In [None]:
# Plotting: one optimal-action curve per bandit, paired with its legend label
for optimal_curve, curve_label in zip(optimal_actions_counts, labels):
    plt.plot(optimal_curve, label=curve_label)

In [None]:
# Decorate the axes that currently hold the plotted curves
current_axes = plt.gca()
current_axes.set_title("Figure 2.5")
current_axes.set_xlabel("Steps")
current_axes.set_ylabel("% Optimal action")
current_axes.legend()

# NOTE(review): the other figures are saved as `figure_2_N.png`; this file name
# uses a dot instead. Confirm nothing references this path before renaming it.
plt.savefig("../generated_images/figure_2.5.png")
plt.close()

## 6. Comparison of Bandit Algorithms with Different Parameters



In [19]:
# Legend entry for each method, in the same order as the generators and parameter ranges
labels = [
    'epsilon-greedy',
    'gradient bandit',
    'UCB',
    'optimistic initialization',
]


In [34]:
# Bandit factories: each takes the single parameter being swept for its method
def _make_epsilon_greedy(epsilon):
    return Bandit(epsilon=epsilon, use_sample_averages=True)


def _make_gradient_bandit(alpha):
    return Bandit(use_gradient=True, step_size=alpha, use_gradient_baseline=True)


def _make_ucb(coef):
    return Bandit(epsilon=0, confidence_level=coef, use_sample_averages=True)


def _make_optimistic(initial):
    return Bandit(epsilon=0, initial_action_value_estimates=initial, step_size=0.1)


generators = [_make_epsilon_greedy, _make_gradient_bandit, _make_ucb, _make_optimistic]

# Exponents x of the swept parameter values 2**x, one range per factory above.
# They are floats so that raising 2 to a negative exponent is well defined.
parameters = [
    np.arange(-7, -1, dtype=float),
    np.arange(-5, 2, dtype=float),
    np.arange(-4, 3, dtype=float),
    np.arange(-2, 3, dtype=float),
]


In [35]:
# Build one bandit per (method, exponent) pair, grouped by method in generator order;
# the actual parameter value handed to the factory is 2 raised to the exponent
bandits = [
    make_bandit(2 ** exponent)
    for make_bandit, exponents in zip(generators, parameters)
    for exponent in exponents
]

In [36]:
# Run every configured bandit; only the per-step mean rewards are needed
_, average_rewards = simulate(runs=2000, times=1000, bandits=bandits)

# Collapse the time axis: a single mean reward over all steps for each bandit
rewards = average_rewards.mean(axis=1)

print(rewards.shape)


# Plotting: `rewards` is flat, so walk through it one method-sized slice at a time
offset = 0
for label, parameter in zip(labels, parameters):
    count = len(parameter)
    method_rewards = rewards[offset:offset + count]

    # Skip a method whose slice came up short instead of failing inside plt.plot
    if len(method_rewards) != count:
        print(f"Warning: Mismatch in length for {label}.")
    else:
        plt.plot(parameter, method_rewards, label=label)

    offset += count

plt.xlabel('Parameter ($2^x$)')
plt.ylabel('Average reward')
plt.legend()

# Save the figure
plt.savefig("../generated_images/figure_2_6.png")
plt.close()

100%|██████████| 2000/2000 [01:03<00:00, 31.61it/s]
100%|██████████| 2000/2000 [00:58<00:00, 33.91it/s]
100%|██████████| 2000/2000 [00:57<00:00, 34.79it/s]
100%|██████████| 2000/2000 [00:53<00:00, 37.28it/s]
100%|██████████| 2000/2000 [00:51<00:00, 38.58it/s]
100%|██████████| 2000/2000 [00:59<00:00, 33.53it/s]
100%|██████████| 2000/2000 [01:32<00:00, 21.64it/s]
100%|██████████| 2000/2000 [01:10<00:00, 28.21it/s]
100%|██████████| 2000/2000 [00:57<00:00, 34.95it/s]
100%|██████████| 2000/2000 [01:03<00:00, 31.27it/s]
100%|██████████| 2000/2000 [01:00<00:00, 33.24it/s]
100%|██████████| 2000/2000 [01:03<00:00, 31.34it/s]
100%|██████████| 2000/2000 [00:59<00:00, 33.71it/s]
100%|██████████| 2000/2000 [00:56<00:00, 35.56it/s]
100%|██████████| 2000/2000 [01:23<00:00, 23.91it/s]
100%|██████████| 2000/2000 [01:22<00:00, 24.24it/s]
100%|██████████| 2000/2000 [01:18<00:00, 25.50it/s]
100%|██████████| 2000/2000 [01:01<00:00, 32.62it/s]
100%|██████████| 2000/2000 [01:21<00:00, 24.60it/s]
100%|███████

(25,)
