In [1]:
import numpy as np

# Create environment

In [2]:
# Number of bandit arms; used as the length of every per-arm array created below
number_of_bandits = 10

In [3]:
# Mean and variance of the normal distribution from which each arm's mean reward is drawn
global_bandit_mean_mean = 0.0
global_bandit_mean_variance = 1.0

# Draw one mean reward per arm; np.random.normal expects a standard deviation, hence the sqrt.
# NOTE(review): this draw runs before the np.random.seed call further down the notebook,
# so these means are not covered by that seed.
bandit_mean = np.random.normal(loc = global_bandit_mean_mean, scale = np.sqrt(global_bandit_mean_variance), size = number_of_bandits)
bandit_mean

array([ 0.1957272 , -1.51936052,  0.40767963, -0.45513443, -0.48142763,
       -1.24410407,  2.60391307, -1.93834966, -2.15886687, -1.15053664])

In [4]:
# Mean and variance of the normal distribution from which each arm's reward variance is drawn
global_bandit_variance_mean = 1.0
global_bandit_variance_variance = 0.0

# Draw one reward variance per arm; with a spread of 0.0 every arm gets exactly the mean.
# NOTE(review): once global_bandit_variance_variance > 0 a normal draw can be negative,
# and the reward sampling takes np.sqrt of this value (NaN for negative inputs).
bandit_variance = np.random.normal(loc = global_bandit_variance_mean, scale = np.sqrt(global_bandit_variance_variance), size = number_of_bandits)
bandit_variance

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [5]:
# Per-arm number of iterations after which that arm's mean and variance are redrawn.
# The iteration loop only advances an arm's counter when its frequency is > 0.
bandit_stochastic_change_frequencies = np.repeat(a = 201, repeats = number_of_bandits)
bandit_stochastic_change_frequencies

array([201, 201, 201, 201, 201, 201, 201, 201, 201, 201])

In [6]:
# Per-arm count of iterations since that arm's statistics were last redrawn.
# The iteration loop increments and resets these entries in place.
bandit_stochastic_change_counter = np.zeros(shape = [number_of_bandits], dtype = np.int64)
bandit_stochastic_change_counter

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Set hyperparameters

In [7]:
# Set the number of iterations
number_of_iterations = 2000
# Set learning rate alpha: step size of the action preference update and,
# for update type 1, of the average reward update as well
alpha = 0.1
# Set average reward update type: 0 = sample-average, 1 = constant step-size (uses alpha)
average_reward_update_type = 1

# Create action preference and policy arrays

In [8]:
# Per-arm action preferences, all starting at zero; the policy is computed as their softmax
action_preference = np.zeros(shape = [number_of_bandits], dtype = np.float64)
action_preference

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [9]:
# Initial policy: the same probability for every arm.
# The iteration loop recomputes the policy from the action preferences on its first pass.
policy = np.repeat(a = 1.0 / number_of_bandits, repeats = number_of_bandits)
policy

array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])

# Create algorithm

In [10]:
# Seed NumPy's global random number generator so the random draws made from here on are repeatable.
# NOTE(review): bandit_mean and bandit_variance were already drawn in earlier cells, before
# this seed is set, so the initial environment is not reproducible across fresh kernels.
np.random.seed(seed = 0)

In [11]:
def loop_through_iterations(number_of_iterations, number_of_bandits, bandit_mean, bandit_variance, bandit_stochastic_change_frequencies, bandit_stochastic_change_counter, global_bandit_mean_mean, global_bandit_mean_variance, global_bandit_variance_mean, global_bandit_variance_variance, action_preference, policy, alpha, average_reward_update_type):
    """Run the gradient bandit update loop and return the resulting state.

    Every iteration derives the policy from the action preferences (softmax),
    samples an arm from that policy, draws a normally distributed reward for
    the arm, updates the average-reward baseline and the action preferences,
    and finally advances the per-arm counters that trigger a redraw of an
    arm's mean and variance.

    Args:
        number_of_iterations: Number of iterations to run.
        number_of_bandits: Number of arms.
        bandit_mean: Per-arm reward means. Modified in place on redraw.
        bandit_variance: Per-arm reward variances. Modified in place on redraw.
        bandit_stochastic_change_frequencies: Per-arm redraw period in
            iterations; an arm whose entry is <= 0 is never redrawn.
        bandit_stochastic_change_counter: Per-arm iteration counters.
            Modified in place (incremented, reset to 0 on redraw).
        global_bandit_mean_mean: Mean of the normal used to redraw arm means.
        global_bandit_mean_variance: Variance of that normal.
        global_bandit_variance_mean: Mean of the normal used to redraw arm
            variances.
        global_bandit_variance_variance: Variance of that normal.
        action_preference: Per-arm action preferences at the start.
        policy: Starting policy; replaced on the first iteration, so it only
            reaches the caller unchanged when number_of_iterations is 0.
        alpha: Step size of the preference update and, for update type 1,
            of the average reward update.
        average_reward_update_type: 0 for sample-average, 1 for constant
            step-size. Any other value leaves the average reward at 0.0.

    Returns:
        Tuple (bandit_mean, bandit_variance, action_preference, policy).
    """
    average_reward = 0.0
    # Loop through iterations until termination
    for t in range(0, number_of_iterations):
        # Derive the policy as the softmax of the current action preferences
        policy = update_policy_from_action_preference(number_of_bandits, action_preference, policy)

        # Get action by sampling an arm index from the policy
        action_index = np.random.choice(a = number_of_bandits, p = policy)

        # Get reward from action. A negative variance would make np.sqrt return
        # NaN and poison every later update, so it is treated as zero here.
        reward_variance = max(bandit_variance[action_index], 0.0)
        reward = np.random.normal(loc = bandit_mean[action_index], scale = np.sqrt(reward_variance))

        # Update average reward (the current reward is folded in before the
        # preference update below uses it)
        if average_reward_update_type == 0: # sample-average method
            average_reward += 1.0 / (t + 1) * (reward - average_reward)
        elif average_reward_update_type == 1: # constant step-size
            average_reward += alpha * (reward - average_reward)

        # Update action preference: the chosen arm moves by step * (1 - pi),
        # every other arm moves by -step * pi
        step = alpha * (reward - average_reward)
        action_preference = np.where(
            np.arange(number_of_bandits) == action_index,
            action_preference + step * (1.0 - policy),
            action_preference - step * policy)

        # Mutate bandit statistics
        for i in range(number_of_bandits):
            if bandit_stochastic_change_frequencies[i] > 0:
                bandit_stochastic_change_counter[i] += 1

                if bandit_stochastic_change_counter[i] == bandit_stochastic_change_frequencies[i]:
                    bandit_mean[i] = np.random.normal(loc = global_bandit_mean_mean, scale = np.sqrt(global_bandit_mean_variance))
                    # A normal draw can be negative, which is not a valid variance; clamp at zero
                    bandit_variance[i] = max(np.random.normal(loc = global_bandit_variance_mean, scale = np.sqrt(global_bandit_variance_variance)), 0.0)

                    bandit_stochastic_change_counter[i] = 0

    return bandit_mean, bandit_variance, action_preference, policy

In [12]:
def update_policy_from_action_preference(number_of_bandits, action_preference, policy):
    """Return the policy that corresponds to the given action preferences.

    Forwards all three arguments to apply_softmax_function and returns its
    result unchanged.
    """
    return apply_softmax_function(number_of_bandits, action_preference, policy)

In [13]:
def apply_softmax_function(number_of_bandits, action_preference, policy):
    """Return the softmax of action_preference.

    f(xi) = e^(xi - max(x)) / sum(e^(xj - max(x)), j, 0, n - 1)

    The largest preference is subtracted before exponentiating so the
    exponentials cannot overflow. number_of_bandits and the incoming policy
    are not read; the result is a newly computed array.
    """
    shifted_preference = action_preference - np.max(a = action_preference)
    exponentials = np.exp(shifted_preference)

    return exponentials / np.sum(a = exponentials)

In [14]:
def stochastic_multi_armed_gradient_bandits(number_of_iterations, number_of_bandits, bandit_mean, bandit_variance, bandit_stochastic_change_frequencies, bandit_stochastic_change_counter, global_bandit_mean_mean, global_bandit_mean_variance, global_bandit_variance_mean, global_bandit_variance_variance, action_preference, policy, alpha, average_reward_update_type):
    """Entry point of the algorithm: run the iteration loop and return its result.

    All arguments are passed through to loop_through_iterations in the same
    order. Returns the tuple
    (bandit_mean, bandit_variance, action_preference, policy) it produces.
    """
    return loop_through_iterations(
        number_of_iterations,
        number_of_bandits,
        bandit_mean,
        bandit_variance,
        bandit_stochastic_change_frequencies,
        bandit_stochastic_change_counter,
        global_bandit_mean_mean,
        global_bandit_mean_variance,
        global_bandit_variance_mean,
        global_bandit_variance_variance,
        action_preference,
        policy,
        alpha,
        average_reward_update_type)

# Run algorithm

In [15]:
# Print initial arrays
print("\nInitial bandit mean")
print(bandit_mean)

print("\nInitial bandit variance")
print(bandit_variance)

print("\nInitial action preference function")
print(action_preference)

print("\nInitial policy")
print(policy)

# Run the stochastic multi-armed gradient bandit algorithm.
# The results are assigned back to the same names (and the environment arrays are
# modified in place during the run), so re-running this cell continues from the
# already-updated state rather than from the initial one.
bandit_mean, bandit_variance, action_preference, policy = stochastic_multi_armed_gradient_bandits(number_of_iterations, number_of_bandits, bandit_mean, bandit_variance, bandit_stochastic_change_frequencies, bandit_stochastic_change_counter, global_bandit_mean_mean, global_bandit_mean_variance, global_bandit_variance_mean, global_bandit_variance_variance, action_preference, policy, alpha, average_reward_update_type)

# Print final results
print("\nFinal bandit mean")
print(bandit_mean)

print("\nFinal bandit variance")
print(bandit_variance)

print("\nFinal action preference function")
print(action_preference)

print("\nFinal policy")
print(policy)


Initial bandit mean
[ 0.1957272  -1.51936052  0.40767963 -0.45513443 -0.48142763 -1.24410407
  2.60391307 -1.93834966 -2.15886687 -1.15053664]

Initial bandit variance
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

Initial action preference function
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Initial policy
[0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]

Final bandit mean
[ 0.01430227 -1.07617476 -1.07668153 -0.81885676 -0.64743395 -0.23869974
  0.23653173  0.01623212  1.25593101 -0.67111337]

Final bandit variance
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

Final action preference function
[-0.18786209 -0.89910764 -1.53930444 -0.55416574 -1.1369539  -1.44614132
 -0.05565221 -0.82714036  5.95762718  0.68870053]

Final policy
[2.11096262e-03 1.03654008e-03 5.46449636e-04 1.46350504e-03
 8.17127591e-04 5.99805701e-04 2.40934943e-03 1.11388770e-03
 9.84830374e-01 5.07199772e-03]
