# Multi-armed Gradient Bandits: Stochastic

In [1]:
import numpy as np

## Create environment

In [2]:
def create_environment_num_bandits():
    """Returns how many bandits (arms) the environment contains.

    Returns:
        num_bandits: int, number of bandits.
    """
    # The environment size is a fixed constant of this experiment.
    return 10

In [3]:
def create_environment_bandit_means(num_bandits):
    """Draws the true mean reward of every bandit.

    Each mean is sampled from a normal distribution whose own mean and
    variance are fixed inside this function.

    Args:
        num_bandits: int, number of bandits.
    Returns:
        global_bandit_mean_mean: float, the global mean of means across all
            bandits.
        global_bandit_mean_variance: float, the global variance of means
            across all bandits.
        bandit_mean: array[float], the means of each bandit.
    """
    prior_center = 0.0
    prior_variance = 1.0

    # np.random.normal is parameterised by standard deviation, not variance.
    prior_std = np.sqrt(prior_variance)
    sampled_means = np.random.normal(prior_center, prior_std, num_bandits)

    return prior_center, prior_variance, sampled_means

In [4]:
def create_environment_bandit_variances(num_bandits):
    """Draws the reward variance of every bandit.

    Each variance is sampled from a normal distribution whose mean and
    variance are fixed inside this function. The spread is currently 0.0,
    so every bandit receives exactly the global mean as its variance.

    Args:
        num_bandits: int, number of bandits.
    Returns:
        global_bandit_variance_mean: float, the global mean of variances
            across all bandits.
        global_bandit_variance_variance: float, the global variance of
            variances across all bandits.
        bandit_variance: array[float], the variances of each bandit.
    """
    variance_center = 1.0
    variance_spread = 0.0

    # np.random.normal is parameterised by standard deviation, not variance.
    # NOTE: a normal draw is unbounded below, so a nonzero spread could yield
    # negative "variances" here.
    sampled_variances = np.random.normal(
        variance_center, np.sqrt(variance_spread), num_bandits)

    return variance_center, variance_spread, sampled_variances

In [5]:
def create_environment_bandit_change_arrays(num_bandits):
    """Builds the arrays that schedule when each bandit is re-drawn.

    Args:
        num_bandits: int, number of bandits.
    Returns:
        bandit_change_frequencies: array[int], how often each
            bandit's statistics changes.
        bandit_change_counter: array[int], the change
            counter of each bandit.
    """
    # Every bandit shares the same change period.
    change_period = 201
    frequencies = np.full(num_bandits, change_period)

    # Counters start at zero for every bandit.
    counters = np.zeros((num_bandits,), dtype=np.int64)

    return frequencies, counters

In [6]:
def create_environment():
    """Assembles the complete bandit environment.

    Calls the individual environment builders in order (bandit count, means,
    variances, change arrays) and flattens their results into one tuple.

    Returns:
        num_bandits: int, number of bandits.
        global_bandit_mean_mean: float, the global mean of means across all
            bandits.
        global_bandit_mean_variance: float, the global variance of means
            across all bandits.
        bandit_mean: array[float], the means of each bandit.
        global_bandit_variance_mean: float, the global mean of variances
            across all bandits.
        global_bandit_variance_variance: float, the global variance of
            variances across all bandits.
        bandit_variance: array[float], the variances of each bandit.
        bandit_change_frequencies: array[int], how often each
            bandit's statistics changes.
        bandit_change_counter: array[int], the change
            counter of each bandit.
    """
    num_bandits = create_environment_num_bandits()

    # Means are drawn before variances; keeping this order keeps the random
    # stream, and therefore the results, reproducible.
    mean_parts = create_environment_bandit_means(num_bandits)
    variance_parts = create_environment_bandit_variances(num_bandits)
    change_parts = create_environment_bandit_change_arrays(num_bandits)

    return (num_bandits, *mean_parts, *variance_parts, *change_parts)

## Set hyperparameters

In [7]:
def set_hyperparameters():
    """Returns the fixed hyperparameters of the experiment.

    Returns:
        num_iterations: int, number of iterations.
        alpha: float, alpha > 0, learning rate.
        average_reward_update_type: int, average reward update type
            (0 = sample-average, 1 = constant step-size).
    """
    hyperparameters = (
        2000,  # num_iterations
        0.1,  # alpha
        1,  # average_reward_update_type
    )

    return hyperparameters

## Create action preference and policy arrays

In [8]:
def create_action_arrays(num_bandits):
    """Creates the zero-initialised action preference array.

    Args:
        num_bandits: int, number of bandits.
    Returns:
        action_preference: array[float], keeps track of the preference
            of each bandit.
    """
    # All preferences start equal (zero).
    return np.zeros((num_bandits,), dtype=np.float64)

In [9]:
def create_policy_arrays(num_bandits):
    """Creates the initial policy, uniform over all bandits.

    Args:
        num_bandits: int, number of bandits.
    Returns:
        policy: array[float], learned stochastic policy of which
            bandit to action.
    """
    # Every bandit starts with the same selection probability.
    uniform_probability = 1.0 / num_bandits

    return np.full(num_bandits, uniform_probability)

## Create algorithm

In [10]:
# Fix NumPy's global random seed so repeated runs give identical results
np.random.seed(0)

In [11]:
def loop_through_iterations(
        num_iterations,
        num_bandits,
        bandit_mean,
        bandit_variance,
        bandit_change_frequencies,
        bandit_change_counter,
        global_bandit_mean_mean,
        global_bandit_mean_variance,
        global_bandit_variance_mean,
        global_bandit_variance_variance,
        action_preference,
        policy,
        alpha,
        average_reward_update_type):
    """Runs the gradient bandit learning loop for a fixed number of steps.

    Each step recomputes the softmax policy from the current action
    preferences, samples a bandit from it, draws a reward, updates the
    running average reward (the baseline), moves the preferences along the
    gradient bandit update, and finally advances each bandit's change
    counter, re-drawing that bandit's mean and variance when the counter
    reaches its change frequency.

    `bandit_mean`, `bandit_variance` and `bandit_change_counter` are modified
    in place. `action_preference` is not modified in place; a new array is
    returned.

    Args:
        num_iterations: int, number of iterations.
        num_bandits: int, number of bandits.
        bandit_mean: array[float], the means of each bandit.
        bandit_variance: array[float], the variances of each bandit.
        bandit_change_frequencies: array[int], how often each
            bandit's statistics changes. Entries that are not positive
            disable changes for that bandit.
        bandit_change_counter: array[int], the change
            counter of each bandit.
        global_bandit_mean_mean: float, the global mean of means across all
            bandits.
        global_bandit_mean_variance: float, the global variance of means
            across all bandits.
        global_bandit_variance_mean: float, the global mean of variances
            across all bandits.
        global_bandit_variance_variance: float, the global variance of
            variances across all bandits.
        action_preference: array[float], keeps track of the preference
            of each bandit.
        policy: array[float], learned stochastic policy of which
            bandit to action.
        alpha: float, alpha > 0, learning rate.
        average_reward_update_type: int, average reward update type
            (0 = sample-average, 1 = constant step-size). Any other value
            leaves the average reward at its initial 0.0.
    Returns:
        bandit_mean: array[float], the means of each bandit.
        bandit_variance: array[float], the variances of each bandit.
        action_preference: array[float], keeps track of the preference
            of each bandit.
        policy: array[float], the policy used to pick the last action (it
            is computed before that step's preference update).
    """
    baseline = 0.0

    for step in range(num_iterations):
        # Turn the current preferences into selection probabilities (softmax)
        policy = update_policy_from_action_preference(
            num_bandits, action_preference, policy)

        # Sample which bandit to pull according to the policy
        chosen = np.random.choice(a=num_bandits, p=policy)

        # Draw the reward from the chosen bandit's normal distribution
        reward_std = np.sqrt(bandit_variance[chosen])
        reward = np.random.normal(loc=bandit_mean[chosen], scale=reward_std)

        # Move the baseline toward the new reward
        if average_reward_update_type == 0:  # sample-average method
            step_size = 1.0 / (step + 1)
            baseline += step_size * (reward - baseline)
        elif average_reward_update_type == 1:  # constant step-size
            baseline += alpha * (reward - baseline)

        # Gradient bandit update: H += alpha * (R - baseline) * (1[a] - pi),
        # where 1[a] is the indicator of the chosen bandit
        advantage = reward - baseline
        chosen_indicator = np.zeros_like(policy)
        chosen_indicator[chosen] = 1.0
        action_preference = (
            action_preference
            + alpha * advantage * (chosen_indicator - policy))

        # Advance the non-stationarity schedule of every bandit
        for arm in range(num_bandits):
            period = bandit_change_frequencies[arm]
            if period <= 0:
                # This bandit never changes
                continue

            bandit_change_counter[arm] += 1
            if bandit_change_counter[arm] != period:
                continue

            # Period elapsed: re-draw the mean first, then the variance
            # (this order fixes how the random stream is consumed)
            bandit_mean[arm] = np.random.normal(
                loc=global_bandit_mean_mean,
                scale=np.sqrt(global_bandit_mean_variance))
            bandit_variance[arm] = np.random.normal(
                loc=global_bandit_variance_mean,
                scale=np.sqrt(global_bandit_variance_variance))
            bandit_change_counter[arm] = 0

    return bandit_mean, bandit_variance, action_preference, policy

In [12]:
def update_policy_from_action_preference(
        num_bandits, action_preference, policy):
    """Derives the policy from the current action preferences.

    Delegates to `apply_softmax_function`, so the returned policy holds the
    softmax probabilities of the preferences.

    Args:
        num_bandits: int, number of bandits.
        action_preference: array[float], keeps track of the preference
            of each bandit.
        policy: array[float], learned stochastic policy of which
            bandit to action.
    Returns:
        policy: array[float], learned stochastic policy of which
            bandit to action.
    """
    return apply_softmax_function(num_bandits, action_preference, policy)

In [13]:
def apply_softmax_function(num_bandits, action_preference, policy):
    """Computes the softmax of the action preferences.

    Implements f(x_i) = e^(x_i - max(x)) / sum_j e^(x_j - max(x)). The
    incoming `num_bandits` and `policy` values are not read; the result is
    built solely from `action_preference`, which is left unmodified.

    Args:
        num_bandits: int, number of bandits.
        action_preference: array[float], keeps track of the preference
            of each bandit.
        policy: array[float], learned stochastic policy of which
            bandit to action.
    Returns:
        policy: array[float], learned stochastic policy of which
            bandit to action.
    """
    # Subtracting the largest preference keeps every exponent <= 0, which
    # avoids overflow without changing the resulting probabilities
    shifted = action_preference - np.max(action_preference)

    unnormalized = np.exp(shifted)
    normalizer = np.sum(unnormalized)

    return unnormalized / normalizer

In [14]:
def stochastic_multi_armed_gradient_bandits(
        num_iterations,
        num_bandits,
        bandit_mean,
        bandit_variance,
        bandit_change_frequencies,
        bandit_change_counter,
        global_bandit_mean_mean,
        global_bandit_mean_variance,
        global_bandit_variance_mean,
        global_bandit_variance_variance,
        action_preference,
        policy,
        alpha,
        average_reward_update_type):
    """Runs the stochastic multi-armed gradient bandit algorithm.

    This is the algorithm's entry point; it forwards every argument,
    unchanged and in the same order, to `loop_through_iterations` and
    returns that function's result.

    Args:
        num_iterations: int, number of iterations.
        num_bandits: int, number of bandits.
        bandit_mean: array[float], the means of each bandit.
        bandit_variance: array[float], the variances of each bandit.
        bandit_change_frequencies: array[int], how often each
            bandit's statistics changes.
        bandit_change_counter: array[int], the change
            counter of each bandit.
        global_bandit_mean_mean: float, the global mean of means across all
            bandits.
        global_bandit_mean_variance: float, the global variance of means
            across all bandits.
        global_bandit_variance_mean: float, the global mean of variances
            across all bandits.
        global_bandit_variance_variance: float, the global variance of
            variances across all bandits.
        action_preference: array[float], keeps track of the preference
            of each bandit.
        policy: array[float], learned stochastic policy of which
            bandit to action.
        alpha: float, alpha > 0, learning rate.
        average_reward_update_type: int, average reward update type (
            sample-average, constant step-size).
    Returns:
        bandit_mean: array[float], the means of each bandit.
        bandit_variance: array[float], the variances of each bandit.
        action_preference: array[float], keeps track of the preference
            of each bandit.
        policy: array[float], learned stochastic policy of which
            bandit to action.
    """
    # The iteration loop already returns the four values in the order this
    # function promises, so its result is handed back directly
    return loop_through_iterations(
        num_iterations,
        num_bandits,
        bandit_mean,
        bandit_variance,
        bandit_change_frequencies,
        bandit_change_counter,
        global_bandit_mean_mean,
        global_bandit_mean_variance,
        global_bandit_variance_mean,
        global_bandit_variance_variance,
        action_preference,
        policy,
        alpha,
        average_reward_update_type)

## Run algorithm

In [15]:
def run_algorithm():
    """Builds the environment, runs the gradient bandit, prints the results.

    Prints the bandit means, bandit variances, action preferences and policy
    once before and once after the learning loop. Returns nothing.
    """
    (num_bandits,
     mean_of_means,
     variance_of_means,
     bandit_mean,
     mean_of_variances,
     variance_of_variances,
     bandit_variance,
     change_frequencies,
     change_counter) = create_environment()

    num_iterations, alpha, average_reward_update_type = set_hyperparameters()

    action_preference = create_action_arrays(num_bandits)

    policy = create_policy_arrays(num_bandits)

    # Report the starting state
    initial_report = (
        ("\nInitial bandit mean", bandit_mean),
        ("\nInitial bandit variance", bandit_variance),
        ("\nInitial action preference function", action_preference),
        ("\nInitial policy", policy),
    )
    for heading, values in initial_report:
        print(heading)
        print(values)

    # Run the stochastic multi-armed gradient bandit algorithm
    (bandit_mean,
     bandit_variance,
     action_preference,
     policy) = stochastic_multi_armed_gradient_bandits(
        num_iterations,
        num_bandits,
        bandit_mean,
        bandit_variance,
        change_frequencies,
        change_counter,
        mean_of_means,
        variance_of_means,
        mean_of_variances,
        variance_of_variances,
        action_preference,
        policy,
        alpha,
        average_reward_update_type)

    # Report the state after learning
    final_report = (
        ("\nFinal bandit mean", bandit_mean),
        ("\nFinal bandit variance", bandit_variance),
        ("\nFinal action preference function", action_preference),
        ("\nFinal policy", policy),
    )
    for heading, values in final_report:
        print(heading)
        print(values)

In [16]:
# Run the full experiment; the initial and final arrays are printed below
run_algorithm()


Initial bandit mean
[ 1.76405235  0.40015721  0.97873798  2.2408932   1.86755799 -0.97727788
  0.95008842 -0.15135721 -0.10321885  0.4105985 ]

Initial bandit variance
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

Initial action preference function
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Initial policy
[0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]

Final bandit mean
[-1.07668153 -0.81885676 -0.64743395 -0.23869974  0.23653173  0.01623212
  1.25593101 -0.67111337  0.92955301  0.31983224]

Final bandit variance
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

Final action preference function
[-0.67786663 -1.22810691  0.54660341  4.8841117  -1.57988109 -0.29028634
 -0.82674735 -0.50718411 -0.83868299  0.51804032]

Final policy
[0.00365876 0.00211039 0.01244912 0.95220016 0.00148453 0.00539092
 0.00315264 0.00433972 0.00311523 0.01209854]
