This program implements the UCB (Upper Confidence Bound) algorithm to find 
the best arm in a multi-armed bandit problem.

In [None]:
# Importing the necessary libraries:

import gym
import gym_bandits
import numpy as np

In [None]:
# Create the bandit environment by its gym id.
# NOTE(review): the id is presumably registered by the `gym_bandits`
# import above -- confirm against that package.
env = gym.make("BanditTwoArmedHighLowFixed-v0")

In [None]:
# Check the probability distribution of the arms by printing the
# environment's `p_dist` attribute.
# NOTE(review): presumably one reward probability per arm -- confirm
# against the gym_bandits environment definition.
print(env.p_dist)

In [None]:
# Per-arm pull counter: count[k] is the number of times
# arm k has been pulled so far (both arms start at zero).
count = np.zeros(2, dtype=float)

In [None]:
# Per-arm reward accumulator: sum_rewards[k] is the total
# reward collected from arm k so far (both arms start at zero).
sum_rewards = np.zeros(2, dtype=float)

In [None]:
# Per-arm value estimate: Q[k] is the average reward
# observed for arm k so far (both arms start at zero).
Q = np.zeros(2, dtype=float)

In [None]:
# Number of rounds (iterations) to play; one arm is pulled per round:
num_rounds = 100

In [None]:
# Defining the UCB function
def UCB(i, q_values=None, pull_counts=None):
    """Select the arm to pull in round ``i`` using the UCB1 rule.

    Args:
        i: Zero-based index of the current round.
        q_values: Average reward observed so far for each arm. When
            omitted, the notebook-level array ``Q`` is used.
        pull_counts: Number of times each arm has been pulled so far.
            When omitted, the notebook-level array ``count`` is used.

    Returns:
        The zero-based index (a plain ``int``) of the arm to pull.
    """
    # Fall back to the notebook-level statistics so that the original
    # call form ``UCB(i)`` keeps working unchanged.
    q_values = np.asarray(Q if q_values is None else q_values, dtype=float)
    pull_counts = np.asarray(
        count if pull_counts is None else pull_counts, dtype=float
    )
    num_arms = len(pull_counts)

    # Before computing the UCB we explore every arm at least once, so
    # during the first `num_arms` rounds we directly select the arm
    # whose index equals the round number.
    if i < num_arms:
        return i

    # An arm that has never been pulled has an unbounded confidence
    # interval; select it explicitly instead of dividing by zero.
    unexplored = np.flatnonzero(pull_counts == 0)
    if unexplored.size:
        return int(unexplored[0])

    # Past the exploration rounds, compute for every arm
    #     UCB(a) = Q(a) + sqrt(2 * ln(total pulls) / pulls of a)
    # and return the arm with the highest upper confidence bound.
    total_pulls = pull_counts.sum()
    ucb = q_values + np.sqrt(2 * np.log(total_pulls) / pull_counts)
    return int(np.argmax(ucb))

In [None]:
# Start pulling the arms:
# play num_rounds rounds and try to find
# the best arm using the UCB method.

# The Gym Env API requires reset() to be called once before the first
# step(); newer Gym versions raise an error otherwise.
env.reset()

for i in range(num_rounds):
    
    # select the arm based on the UCB method
    arm = UCB(i)

    # pull the arm; only the reward is used below, the other values
    # returned by step() are unpacked but not needed here
    next_state, reward, done, info = env.step(arm) 

    # increment the pull count of the chosen arm by 1
    count[arm] += 1
    
    # update the sum of rewards of the arm
    sum_rewards[arm] += reward

    # update the average reward of the arm
    Q[arm] = sum_rewards[arm] / count[arm]
    

In [None]:
# After all the rounds, we take a look at the 
# average reward obtained from each of the arms
# (Q[k] is the average reward recorded for arm k):

print(Q)

In [None]:
# The arm with the highest average reward is reported as the optimal one.
# Arm indices are 0-based, so 1 is added to print a 1-based arm number.

print('The optimal arm is arm {}'.format(Q.argmax() + 1))