# Multi-armed Bandits: Stochastic

In [1]:
import numpy as np

## Create environment

In [2]:
def create_environment_num_bandits():
    """Reports how many bandits (arms) the environment contains.

    Returns:
        num_bandits: int, number of bandits.
    """
    # The environment size is a fixed constant for this experiment.
    return 10

In [3]:
def create_environment_bandit_means(num_bandits):
    """Draws the true mean reward of every bandit.

    Each bandit's mean is sampled independently from a normal distribution
    whose mean and variance are the "global" values returned alongside it.

    Args:
        num_bandits: int, number of bandits.
    Returns:
        global_bandit_mean_mean: float, the global mean of means across all
            bandits.
        global_bandit_mean_variance: float, the global variance of means
            across all bandits.
        bandit_mean: array[float], the means of each bandit.
    """
    prior_mean, prior_variance = 0.0, 1.0

    # np.random.normal expects a standard deviation, not a variance.
    prior_std = np.sqrt(prior_variance)
    sampled_means = np.random.normal(prior_mean, prior_std, num_bandits)

    return prior_mean, prior_variance, sampled_means

In [4]:
def create_environment_bandit_variances(num_bandits):
    """Draws the reward variance of every bandit.

    Each bandit's variance is sampled independently from a normal
    distribution whose mean and variance are the "global" values returned
    alongside it. With the global variance of variances set to zero, every
    bandit receives exactly the global mean of variances.

    Args:
        num_bandits: int, number of bandits.
    Returns:
        global_bandit_variance_mean: float, the global mean of variances
            across all bandits.
        global_bandit_variance_variance: float, the global variance of
            variances across all bandits.
        bandit_variance: array[float], the variances of each bandit.
    """
    prior_mean, prior_variance = 1.0, 0.0

    # np.random.normal expects a standard deviation, not a variance.
    prior_std = np.sqrt(prior_variance)
    sampled_variances = np.random.normal(prior_mean, prior_std, num_bandits)

    return prior_mean, prior_variance, sampled_variances

In [5]:
def create_environment_bandit_change_arrays(num_bandits):
    """Builds the bookkeeping arrays for non-stationary bandits.

    Args:
        num_bandits: int, number of bandits.
    Returns:
        bandit_change_frequencies: array[int], how often each
            bandit's statistics changes.
        bandit_change_counter: array[int], the change
            counter of each bandit.
    """
    # Every bandit shares the same change period.
    change_period = 201
    change_frequencies = np.full(num_bandits, change_period)

    # All counters start at zero.
    change_counter = np.zeros(num_bandits, dtype=np.int64)

    return change_frequencies, change_counter

In [6]:
def create_environment():
    """Assembles the complete bandit environment.

    The bandit means are sampled before the bandit variances, so the order of
    draws from NumPy's global random generator is fixed.

    Returns:
        num_bandits: int, number of bandits.
        global_bandit_mean_mean: float, the global mean of means across all
            bandits.
        global_bandit_mean_variance: float, the global variance of means
            across all bandits.
        bandit_mean: array[float], the means of each bandit.
        global_bandit_variance_mean: float, the global mean of variances
            across all bandits.
        global_bandit_variance_variance: float, the global variance of
            variances across all bandits.
        bandit_variance: array[float], the variances of each bandit.
        bandit_change_frequencies: array[int], how often each
            bandit's statistics changes.
        bandit_change_counter: array[int], the change
            counter of each bandit.
    """
    num_bandits = create_environment_num_bandits()

    mean_stats = create_environment_bandit_means(num_bandits)
    mean_mean, mean_variance, bandit_mean = mean_stats

    variance_stats = create_environment_bandit_variances(num_bandits)
    variance_mean, variance_variance, bandit_variance = variance_stats

    change_arrays = create_environment_bandit_change_arrays(num_bandits)
    change_frequencies, change_counter = change_arrays

    return (num_bandits,
            mean_mean,
            mean_variance,
            bandit_mean,
            variance_mean,
            variance_variance,
            bandit_variance,
            change_frequencies,
            change_counter)

## Set hyperparameters

In [7]:
def set_hyperparameters():
    """Provides the hyperparameters of the experiment.

    Returns:
        num_iterations: int, number of iterations.
        alpha: float, alpha > 0, learning rate.
        epsilon: float, 0 <= epsilon <= 1, exploitation-exploration trade-off,
            higher means more exploration.
        action_selection_type: int, action selection type (0 = greedy,
            1 = epsilon-greedy, 2 = upper-confidence-bound).
        action_value_update_type: int, action value update type (
            0 = sample-average, 1 = biased constant step-size,
            2 = unbiased constant step-size).
    """
    return (
        2000,  # num_iterations
        0.1,   # alpha
        0.1,   # epsilon
        1,     # action_selection_type: epsilon-greedy
        2,     # action_value_update_type: unbiased constant step-size
    )

## Create value function and policy arrays

In [8]:
def create_action_arrays(num_bandits):
    """Allocates the zero-initialised per-bandit learning arrays.

    Args:
        num_bandits: int, number of bandits.
    Returns:
        action_value_function: array[float], keeps track of the estimated
            value of each bandit.
        action_count: array[int], counts the number of times each bandit was
            actioned.
        action_trace: array[float], keeps track of the reward trace for each
            bandit.
    """
    per_bandit = (num_bandits,)

    value_estimates = np.zeros(per_bandit, dtype=np.float64)
    pull_counts = np.zeros(per_bandit, dtype=np.int64)
    traces = np.zeros(per_bandit, dtype=np.float64)

    return value_estimates, pull_counts, traces

In [9]:
def create_policy_arrays(num_bandits):
    """Creates the initial policy, uniform over all bandits.

    Args:
        num_bandits: int, number of bandits.
    Returns:
        policy: array[float], learned stochastic policy of which
            bandit to action.
    """
    # Every bandit starts with the same selection probability.
    uniform_prob = 1.0 / num_bandits

    return np.full(num_bandits, uniform_prob)

## Create algorithm

In [10]:
# Seed NumPy's global random number generator so that everything is
# reproducible: the environment statistics, the sampled actions and the
# sampled rewards are all drawn through np.random.
np.random.seed(seed=0)

In [11]:
def loop_through_iterations(
        num_iterations,
        num_bandits,
        bandit_mean,
        bandit_variance,
        bandit_change_frequencies,
        bandit_change_counter,
        global_bandit_mean_mean,
        global_bandit_mean_variance,
        global_bandit_variance_mean,
        global_bandit_variance_variance,
        action_value_function,
        action_count,
        action_trace,
        policy,
        alpha,
        epsilon,
        action_selection_type,
        action_value_update_type):
    """Runs the bandit learning loop for a fixed number of iterations.

    Every iteration refreshes the policy from the current value estimates,
    samples an action and its reward, updates the estimate of the chosen
    bandit, and finally advances the change counters, resampling the
    statistics of any bandit whose counter reaches its change frequency.

    The array arguments (bandit statistics, change counter, value estimates,
    counts and traces) are updated in place.

    Args:
        num_iterations: int, number of iterations.
        num_bandits: int, number of bandits.
        bandit_mean: array[float], the means of each bandit.
        bandit_variance: array[float], the variances of each bandit.
        bandit_change_frequencies: array[int], how often each
            bandit's statistics changes; entries that are not positive
            disable changes for that bandit.
        bandit_change_counter: array[int], the change
            counter of each bandit.
        global_bandit_mean_mean: float, the global mean of means across all
            bandits.
        global_bandit_mean_variance: float, the global variance of means
            across all bandits.
        global_bandit_variance_mean: float, the global mean of variances
            across all bandits.
        global_bandit_variance_variance: float, the global variance of
            variances across all bandits.
        action_value_function: array[float], keeps track of the estimated
            value of each bandit.
        action_count: array[int], counts the number of times each bandit was
            actioned.
        action_trace: array[float], keeps track of the reward trace for each
            bandit.
        policy: array[float], learned stochastic policy of which
            bandit to action.
        alpha: float, alpha > 0, learning rate.
        epsilon: float, 0 <= epsilon <= 1, exploitation-exploration trade-off,
            higher means more exploration.
        action_selection_type: int, action selection type (0 = greedy,
            1 = epsilon-greedy, 2 = upper-confidence-bound).
        action_value_update_type: int, action value update type (
            0 = sample-average, 1 = biased constant step-size,
            2 = unbiased constant step-size); any other value leaves the
            value estimates untouched.
    Returns:
        bandit_mean: array[float], the means of each bandit.
        bandit_variance: array[float], the variances of each bandit.
        action_value_function: array[float], keeps track of the estimated
            value of each bandit.
        policy: array[float], learned stochastic policy of which
            bandit to action.
    """
    for step in range(num_iterations):
        # Refresh the policy from the current estimates; the selection rule
        # is given the 1-based iteration number.
        policy = update_policy_from_action_value_function(
            num_bandits,
            action_value_function,
            action_count,
            step + 1,
            epsilon,
            action_selection_type,
            policy)

        # Sample which bandit to action, then the reward it pays out
        chosen = np.random.choice(a=num_bandits, p=policy)
        reward = np.random.normal(
            loc=bandit_mean[chosen],
            scale=np.sqrt(bandit_variance[chosen]))

        action_count[chosen] += 1

        # Pick the step size dictated by the update rule
        if action_value_update_type == 0:
            # Sample-average: step size shrinks with the visit count
            step_size = 1.0 / action_count[chosen]
        elif action_value_update_type == 1:
            # Biased constant step-size
            step_size = alpha
        elif action_value_update_type == 2:
            # Unbiased constant step-size: the trace moves towards 1 and
            # normalises alpha, removing the bias of the initial estimate
            action_trace[chosen] += alpha * (1.0 - action_trace[chosen])
            step_size = alpha / action_trace[chosen]
        else:
            step_size = None

        # Move the estimate of the chosen bandit towards the reward
        if step_size is not None:
            error = reward - action_value_function[chosen]
            action_value_function[chosen] += step_size * error

        # Advance the change counters and resample due bandits
        for bandit in range(num_bandits):
            if not bandit_change_frequencies[bandit] > 0:
                continue

            bandit_change_counter[bandit] += 1
            if (bandit_change_counter[bandit]
                    != bandit_change_frequencies[bandit]):
                continue

            # Mean is drawn before variance to keep the random stream order
            bandit_mean[bandit] = np.random.normal(
                loc=global_bandit_mean_mean,
                scale=np.sqrt(global_bandit_mean_variance))
            bandit_variance[bandit] = np.random.normal(
                loc=global_bandit_variance_mean,
                scale=np.sqrt(global_bandit_variance_variance))
            bandit_change_counter[bandit] = 0

    return bandit_mean, bandit_variance, action_value_function, policy

In [12]:
def update_policy_from_action_value_function(
        num_bandits,
        action_value_function,
        action_count,
        iteration_count,
        epsilon,
        action_selection_type,
        policy):
    """Updates policy as some function of action-value function.

    Selection rules:
        0 (greedy): all probability is shared equally by the bandits with the
            highest estimated value.
        1 (epsilon-greedy): the highest-valued bandits share 1 - epsilon and
            the remaining bandits share epsilon; if every bandit is tied the
            policy is uniform.
        2 (upper-confidence-bound): if some bandit has never been actioned,
            the first such bandit is selected with probability one; otherwise
            the bandits maximising
            value + epsilon * sqrt(log(iteration_count) / action_count)
            share all probability equally.

    Args:
        num_bandits: int, number of bandits.
        action_value_function: array[float], keeps track of the estimated
            value of each bandit.
        action_count: array[int], counts the number of times each bandit was
            actioned.
        iteration_count: int, current loop iteration count.
        epsilon: float, 0 <= epsilon <= 1, exploitation-exploration trade-off,
            higher means more exploration. Also used as the exploration
            coefficient of the upper-confidence-bound rule.
        action_selection_type: int, action selection type (0 = greedy,
            1 = epsilon-greedy, 2 = upper-confidence-bound).
        policy: array[float], previous policy. Accepted for interface
            compatibility; the returned policy is always built from scratch.
    Returns:
        policy: array[float], learned stochastic policy of which
            bandit to action.
    Raises:
        ValueError: if action_selection_type is not 0, 1 or 2.
    """
    if action_selection_type not in (0, 1, 2):
        raise ValueError(
            "action_selection_type must be 0 (greedy), 1 (epsilon-greedy) "
            "or 2 (upper-confidence-bound), got {!r}".format(
                action_selection_type))

    action_value_function = np.asarray(action_value_function)

    # Calculate action value depending on action selection type
    if action_selection_type == 2:
        # Upper-confidence-bound
        action_count = np.asarray(action_count)

        # A bandit that was never actioned has an unbounded confidence
        # interval, so it must be tried before the bound can be evaluated
        # (this also avoids dividing by a zero count below).
        min_count_idx = np.argmin(action_count)
        if action_count[min_count_idx] == 0:
            return np.where(
                np.arange(num_bandits) == min_count_idx, 1.0, 0.0)

        action_value = action_value_function + epsilon * np.sqrt(
            np.log(iteration_count) / action_count)
    else:
        # Greedy or epsilon-greedy
        action_value = action_value_function

    # Find the max of the values actually used for selection, and how many
    # bandits are tied at that max. Using the selection values (not the raw
    # estimates) keeps the mask below consistent under upper-confidence-bound.
    max_action_value = np.max(action_value)
    is_max_action = action_value == max_action_value
    max_action_count = np.count_nonzero(is_max_action)

    # Apportion policy probability equally across tied max-value bandits
    if action_selection_type == 1 and max_action_count != num_bandits:
        # Epsilon-greedy with at least one non-greedy bandit
        max_policy_prob_per_action = (1.0 - epsilon) / max_action_count
        remain_prob_per_action = epsilon / (num_bandits - max_action_count)
    else:
        # Greedy, upper-confidence-bound, or every bandit tied
        max_policy_prob_per_action = 1.0 / max_action_count
        remain_prob_per_action = 0.0

    policy = np.where(
        is_max_action,
        max_policy_prob_per_action,
        remain_prob_per_action)

    return policy

In [13]:
def stochastic_multi_armed_bandits(
        num_iterations,
        num_bandits,
        bandit_mean,
        bandit_variance,
        bandit_change_frequencies,
        bandit_change_counter,
        global_bandit_mean_mean,
        global_bandit_mean_variance,
        global_bandit_variance_mean,
        global_bandit_variance_variance,
        action_value_function,
        action_count,
        action_trace,
        policy,
        alpha,
        epsilon,
        action_selection_type,
        action_value_update_type):
    """Entry point of the stochastic multi-armed bandit algorithm.

    Forwards every argument unchanged to `loop_through_iterations` and hands
    back its results.

    Args:
        num_iterations: int, number of iterations.
        num_bandits: int, number of bandits.
        bandit_mean: array[float], the means of each bandit.
        bandit_variance: array[float], the variances of each bandit.
        bandit_change_frequencies: array[int], how often each
            bandit's statistics changes.
        bandit_change_counter: array[int], the change
            counter of each bandit.
        global_bandit_mean_mean: float, the global mean of means across all
            bandits.
        global_bandit_mean_variance: float, the global variance of means
            across all bandits.
        global_bandit_variance_mean: float, the global mean of variances
            across all bandits.
        global_bandit_variance_variance: float, the global variance of
            variances across all bandits.
        action_value_function: array[float], keeps track of the estimated
            value of each bandit.
        action_count: array[int], counts the number of times each bandit was
            actioned.
        action_trace: array[float], keeps track of the reward trace for each
            bandit.
        policy: array[float], learned stochastic policy of which
            bandit to action.
        alpha: float, alpha > 0, learning rate.
        epsilon: float, 0 <= epsilon <= 1, exploitation-exploration trade-off,
            higher means more exploration.
        action_selection_type: int, action selection type (greedy,
            epsilon-greedy, upper-confidence-bound).
        action_value_update_type: int, action value update type (
            sample-average, biased constant step-size, unbiased constant
            step-size).
    Returns:
        bandit_mean: array[float], the means of each bandit.
        bandit_variance: array[float], the variances of each bandit.
        action_value_function: array[float], keeps track of the estimated
            value of each bandit.
        policy: array[float], learned stochastic policy of which
            bandit to action.
    """
    final_mean, final_variance, final_values, final_policy = (
        loop_through_iterations(
            num_iterations,
            num_bandits,
            bandit_mean,
            bandit_variance,
            bandit_change_frequencies,
            bandit_change_counter,
            global_bandit_mean_mean,
            global_bandit_mean_variance,
            global_bandit_variance_mean,
            global_bandit_variance_variance,
            action_value_function,
            action_count,
            action_trace,
            policy,
            alpha,
            epsilon,
            action_selection_type,
            action_value_update_type))

    return final_mean, final_variance, final_values, final_policy

## Run algorithm

In [14]:
def run_algorithm():
    """Runs the algorithm.

    Builds the environment, hyperparameters and learning arrays, prints the
    initial state, runs the stochastic multi-armed bandit algorithm and
    prints the final state.
    """
    (num_bandits,
     mean_mean,
     mean_variance,
     bandit_mean,
     variance_mean,
     variance_variance,
     bandit_variance,
     change_frequencies,
     change_counter) = create_environment()

    (num_iterations,
     alpha,
     epsilon,
     selection_type,
     update_type) = set_hyperparameters()

    value_estimates, pull_counts, traces = create_action_arrays(num_bandits)

    policy = create_policy_arrays(num_bandits)

    # Report the state before learning. This must happen before the run
    # because the arrays are handed to the algorithm afterwards.
    for heading, values in (
            ("\nInitial bandit mean", bandit_mean),
            ("\nInitial bandit variance", bandit_variance),
            ("\nInitial action value function", value_estimates),
            ("\nInitial policy", policy)):
        print(heading)
        print(values)

    # Run the stochastic multi-armed bandit algorithm
    (bandit_mean,
     bandit_variance,
     value_estimates,
     policy) = stochastic_multi_armed_bandits(
        num_iterations,
        num_bandits,
        bandit_mean,
        bandit_variance,
        change_frequencies,
        change_counter,
        mean_mean,
        mean_variance,
        variance_mean,
        variance_variance,
        value_estimates,
        pull_counts,
        traces,
        policy,
        alpha,
        epsilon,
        selection_type,
        update_type)

    # Report the state after learning
    for heading, values in (
            ("\nFinal bandit mean", bandit_mean),
            ("\nFinal bandit variance", bandit_variance),
            ("\nFinal action value function", value_estimates),
            ("\nFinal policy", policy)):
        print(heading)
        print(values)

In [15]:
# Execute the whole experiment; it prints the initial and final bandit
# statistics, action-value estimates and policy.
run_algorithm()


Initial bandit mean
[ 1.76405235  0.40015721  0.97873798  2.2408932   1.86755799 -0.97727788
  0.95008842 -0.15135721 -0.10321885  0.4105985 ]

Initial bandit variance
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

Initial action value function
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Initial policy
[0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]

Final bandit mean
[-1.07668153 -0.81885676 -0.64743395 -0.23869974  0.23653173  0.01623212
  1.25593101 -0.67111337  0.92955301  0.31983224]

Final bandit variance
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

Final action value function
[-0.67775581 -0.7097237  -0.16982074 -0.26023497 -0.28153331 -0.35250613
 -0.64989934 -0.41892227  1.1678523  -0.04797865]

Final policy
[0.01111111 0.01111111 0.01111111 0.01111111 0.01111111 0.01111111
 0.01111111 0.01111111 0.9        0.01111111]
