In [1]:
import numpy as np

# Create environment

In [2]:
# Number of bandit arms; every per-arm array in this notebook has this length
number_of_bandits = 10

In [3]:
# Parameters of the normal distribution that each bandit's true mean reward is drawn from
global_bandit_mean_mean = 0.0
global_bandit_mean_variance = 1.0

# np.random.normal takes a standard deviation as `scale`, hence the sqrt of the variance.
# NOTE(review): this draw runs before np.random.seed is called later in the notebook,
# so these means are not covered by that seed.
bandit_mean = np.random.normal(loc = global_bandit_mean_mean, scale = np.sqrt(global_bandit_mean_variance), size = number_of_bandits)
bandit_mean

array([-1.00007121, -1.29275786,  0.42196963,  1.55721517, -0.67232445,
       -0.11010087,  2.54621684, -0.23408509,  0.0900666 ,  0.22306781])

In [4]:
# Parameters of the normal distribution that each bandit's reward variance is drawn from.
# With a spread of 0.0 every bandit gets exactly global_bandit_variance_mean.
global_bandit_variance_mean = 1.0
global_bandit_variance_variance = 0.0

# NOTE(review): a normal draw can be negative once global_bandit_variance_variance > 0;
# a negative variance would make the later np.sqrt(bandit_variance[...]) NaN - confirm intent.
bandit_variance = np.random.normal(loc = global_bandit_variance_mean, scale = np.sqrt(global_bandit_variance_variance), size = number_of_bandits)
bandit_variance

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [5]:
# Every bandit is re-drawn after this many iterations (values <= 0 disable the re-draw)
bandit_stochastic_change_frequencies = np.full(shape = number_of_bandits, fill_value = 201)
bandit_stochastic_change_frequencies

array([201, 201, 201, 201, 201, 201, 201, 201, 201, 201])

In [6]:
# Per-bandit count of iterations elapsed since that bandit was last re-drawn
bandit_stochastic_change_counter = np.zeros(number_of_bandits, dtype = np.int64)
bandit_stochastic_change_counter

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Set hyperparameters

In [7]:
# Set the number of iterations (time steps) to simulate
number_of_iterations = 2000
# Set learning rate alpha (step size used by the constant step-size updates)
alpha = 0.1
# Set epsilon: exploration probability for epsilon-greedy; it is also used as the
# coefficient of the exploration bonus when upper-confidence-bound selection is chosen
epsilon = 0.1
# Set action selection type (0 = greedy, 1 = epsilon-greedy, 2 = upper-confidence-bound)
action_selection_type = 1
# Set action value update type (0 = sample-average, 1 = biased constant step-size, 2 = unbiased constant step-size)
action_value_update_type = 2

# Create value function and policy arrays

In [8]:
# Estimated value of each action, starting at zero for every bandit
action_value_function = np.zeros(number_of_bandits, dtype = np.float64)
action_value_function

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [9]:
# Number of times each action has been taken so far
action_count = np.zeros(number_of_bandits, dtype = np.int64)
action_count

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [10]:
# Per-action trace used to normalise the step size in the unbiased constant step-size update
action_trace = np.zeros(number_of_bandits, dtype = np.float64)
action_trace

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [11]:
# Start from a uniform policy: every bandit is equally likely to be pulled
policy = np.full(shape = number_of_bandits, fill_value = 1.0 / number_of_bandits)
policy

array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])

# Create algorithm

In [12]:
# Seed NumPy's global random generator so the random draws made after this point are repeatable.
# NOTE(review): bandit_mean and bandit_variance were already drawn in earlier cells,
# so the initial environment is NOT covered by this seed.
np.random.seed(seed = 0)

In [13]:
# This function loops through iterations and updates the policy
def loop_through_iterations(number_of_iterations, number_of_bandits, bandit_mean, bandit_variance, bandit_stochastic_change_frequencies, bandit_stochastic_change_counter, global_bandit_mean_mean, global_bandit_mean_variance, global_bandit_variance_mean, global_bandit_variance_variance, action_value_function, action_count, action_trace, policy, alpha, epsilon, action_selection_type, action_value_update_type):
    """Simulate a (possibly non-stationary) multi-armed bandit for a fixed number of steps.

    Each step: rebuild the policy from the current value estimates, sample an
    action from it, sample a normally distributed reward for that action,
    update the estimate of the chosen action, then advance each bandit's
    change counter and re-draw its mean/variance when the counter reaches the
    bandit's change frequency.

    The arrays passed in (bandit_mean, bandit_variance,
    bandit_stochastic_change_counter, action_value_function, action_count,
    action_trace) are updated in place.

    action_value_update_type selects the estimate update:
        0 -> sample average (step size 1 / action_count)
        1 -> constant step size alpha
        2 -> constant step size alpha normalised by the action trace
    Any other value leaves the estimates untouched.

    Returns:
        Tuple (bandit_mean, bandit_variance, action_value_function, policy).
    """
    for iteration in range(number_of_iterations):
        # Derive the behaviour policy from the current estimates (iteration count is 1-based)
        policy = update_policy_from_action_value_function(number_of_bandits, action_value_function, action_count, iteration + 1, epsilon, action_selection_type, policy)

        # Sample which bandit to pull
        action_index = np.random.choice(a = number_of_bandits, p = policy)

        # Sample the reward of the pulled bandit; scale is a standard deviation
        reward_std = np.sqrt(bandit_variance[action_index])
        reward = np.random.normal(loc = bandit_mean[action_index], scale = reward_std)

        action_count[action_index] += 1

        # Difference between the observed reward and the current estimate
        estimate_error = reward - action_value_function[action_index]

        if action_value_update_type == 0:
            # Sample-average method
            action_value_function[action_index] += (1.0 / action_count[action_index]) * estimate_error
        elif action_value_update_type == 1:
            # Biased constant step-size
            action_value_function[action_index] += alpha * estimate_error
        elif action_value_update_type == 2:
            # Unbiased constant step-size: refresh the trace, then divide alpha by it
            action_trace[action_index] += alpha * (1.0 - action_trace[action_index])
            action_value_function[action_index] += (alpha / action_trace[action_index]) * estimate_error

        # Mutate bandit statistics; draws stay in per-bandit order (mean, then variance)
        for bandit_index in range(number_of_bandits):
            change_frequency = bandit_stochastic_change_frequencies[bandit_index]
            if not (change_frequency > 0):
                # This bandit never changes
                continue

            bandit_stochastic_change_counter[bandit_index] += 1
            if bandit_stochastic_change_counter[bandit_index] != change_frequency:
                continue

            bandit_mean[bandit_index] = np.random.normal(loc = global_bandit_mean_mean, scale = np.sqrt(global_bandit_mean_variance))
            bandit_variance[bandit_index] = np.random.normal(loc = global_bandit_variance_mean, scale = np.sqrt(global_bandit_variance_variance))
            bandit_stochastic_change_counter[bandit_index] = 0

    return bandit_mean, bandit_variance, action_value_function, policy

In [14]:
# This function updates policy as some function of action-value function
def update_policy_from_action_value_function(number_of_bandits, action_value_function, action_count, iteration_count, epsilon, action_selection_type, policy):
    """Build the action-selection probabilities from the current value estimates.

    Args:
        number_of_bandits: Number of actions (length of the returned policy).
        action_value_function: Current value estimate of each action.
        action_count: Number of times each action has been taken (used by
            upper-confidence-bound selection only).
        iteration_count: 1-based index of the current iteration (used by
            upper-confidence-bound selection only).
        epsilon: Exploration probability for epsilon-greedy; for
            upper-confidence-bound it scales the exploration bonus.
        action_selection_type: 0 = greedy, 1 = epsilon-greedy,
            2 = upper-confidence-bound.
        policy: Previous policy. Kept for interface compatibility; the result
            is always rebuilt from scratch.

    Returns:
        Array of length number_of_bandits holding probabilities that sum to 1.
        Probability mass is split equally between actions tied for the best score.

    Raises:
        ValueError: If action_selection_type is not 0, 1 or 2.
    """
    # Calculate the score each action is ranked by
    if action_selection_type == 0 or action_selection_type == 1: # greedy or epsilon-greedy
        action_value = np.asarray(action_value_function)
    elif action_selection_type == 2: # upper-confidence-bound
        counts = np.asarray(action_count)

        # An action that has never been tried has an unbounded confidence
        # interval, so it is selected outright (this also avoids dividing by zero)
        least_tried_idx = np.argmin(counts)
        if counts[least_tried_idx] == 0:
            return np.where(np.arange(number_of_bandits) == least_tried_idx, 1.0, 0.0)

        action_value = np.asarray(action_value_function) + epsilon * np.sqrt(np.log(iteration_count) / counts)
    else:
        raise ValueError("action_selection_type must be 0 (greedy), 1 (epsilon-greedy) or 2 (upper-confidence-bound), got {!r}".format(action_selection_type))

    # The maximum and its tie count must come from the same scores that the
    # final comparison uses, otherwise no action matches under upper-confidence-bound
    max_action_value = np.max(action_value)
    max_action_count = np.count_nonzero(action_value == max_action_value)

    # Apportion policy probability equally across tied best actions
    if action_selection_type == 1 and max_action_count != number_of_bandits: # epsilon-greedy with at least one non-best action
        max_policy_apportioned_probability_per_action = (1.0 - epsilon) / max_action_count
        remaining_apportioned_probability_per_action = epsilon / (number_of_bandits - max_action_count)
    else: # greedy, upper-confidence-bound, or every action tied
        max_policy_apportioned_probability_per_action = 1.0 / max_action_count
        remaining_apportioned_probability_per_action = 0.0

    policy = np.where(action_value == max_action_value, max_policy_apportioned_probability_per_action, remaining_apportioned_probability_per_action)

    return policy

In [15]:
def stochastic_multi_armed_banditsloop_through_iterations(number_of_iterations, number_of_bandits, bandit_mean, bandit_variance, bandit_stochastic_change_frequencies, bandit_stochastic_change_counter, global_bandit_mean_mean, global_bandit_mean_variance, global_bandit_variance_mean, global_bandit_variance_variance, action_value_function, action_count, action_trace, policy, alpha, epsilon, action_selection_type, action_value_update_type):
    """Entry point for the bandit simulation.

    Forwards every argument unchanged to loop_through_iterations and returns
    its result: (bandit_mean, bandit_variance, action_value_function, policy).
    """
    return loop_through_iterations(
        number_of_iterations = number_of_iterations,
        number_of_bandits = number_of_bandits,
        bandit_mean = bandit_mean,
        bandit_variance = bandit_variance,
        bandit_stochastic_change_frequencies = bandit_stochastic_change_frequencies,
        bandit_stochastic_change_counter = bandit_stochastic_change_counter,
        global_bandit_mean_mean = global_bandit_mean_mean,
        global_bandit_mean_variance = global_bandit_mean_variance,
        global_bandit_variance_mean = global_bandit_variance_mean,
        global_bandit_variance_variance = global_bandit_variance_variance,
        action_value_function = action_value_function,
        action_count = action_count,
        action_trace = action_trace,
        policy = policy,
        alpha = alpha,
        epsilon = epsilon,
        action_selection_type = action_selection_type,
        action_value_update_type = action_value_update_type,
    )

# Run algorithm

In [16]:
# Show the state of the environment and the learner before the run
print("\nInitial bandit mean", bandit_mean, sep = "\n")

print("\nInitial bandit variance", bandit_variance, sep = "\n")

print("\nInitial action value function", action_value_function, sep = "\n")

print("\nInitial policy", policy, sep = "\n")

# Run the stochastic multi-armed bandit simulation
bandit_mean, bandit_variance, action_value_function, policy = stochastic_multi_armed_banditsloop_through_iterations(
    number_of_iterations = number_of_iterations,
    number_of_bandits = number_of_bandits,
    bandit_mean = bandit_mean,
    bandit_variance = bandit_variance,
    bandit_stochastic_change_frequencies = bandit_stochastic_change_frequencies,
    bandit_stochastic_change_counter = bandit_stochastic_change_counter,
    global_bandit_mean_mean = global_bandit_mean_mean,
    global_bandit_mean_variance = global_bandit_mean_variance,
    global_bandit_variance_mean = global_bandit_variance_mean,
    global_bandit_variance_variance = global_bandit_variance_variance,
    action_value_function = action_value_function,
    action_count = action_count,
    action_trace = action_trace,
    policy = policy,
    alpha = alpha,
    epsilon = epsilon,
    action_selection_type = action_selection_type,
    action_value_update_type = action_value_update_type,
)

# Show the state of the environment and the learner after the run
print("\nFinal bandit mean", bandit_mean, sep = "\n")

print("\nFinal bandit variance", bandit_variance, sep = "\n")

print("\nFinal action value function", action_value_function, sep = "\n")

print("\nFinal policy", policy, sep = "\n")


Initial bandit mean
[-1.00007121 -1.29275786  0.42196963  1.55721517 -0.67232445 -0.11010087
  2.54621684 -0.23408509  0.0900666   0.22306781]

Initial bandit variance
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

Initial action value function
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Initial policy
[0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]

Final bandit mean
[ 0.01430227 -1.07617476 -1.07668153 -0.81885676 -0.64743395 -0.23869974
  0.23653173  0.01623212  1.25593101 -0.67111337]

Final bandit variance
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

Final action value function
[-0.30564655 -0.66879949 -0.50445807 -0.3821192  -0.30001687 -0.24570694
  0.31200227 -0.24441839 -0.31581366 -0.13650139]

Final policy
[0.01111111 0.01111111 0.01111111 0.01111111 0.01111111 0.01111111
 0.9        0.01111111 0.01111111 0.01111111]
