In [5]:
import gym_bandits
import gym
import numpy as np

In [2]:
# Build the bandit environment used by every strategy below.
# NOTE(review): the id is presumably registered as a side effect of
# `import gym_bandits` - confirm against the installed package.
env = gym.make("BanditTwoArmedHighLowFixed-v0")

In [3]:
# Number of discrete actions (arms) in the environment's action space.
print(env.action_space.n)

2


In [4]:
# Show the environment's `p_dist` attribute.
# NOTE(review): presumably the per-arm payout probabilities - confirm in gym_bandits.
print(env.p_dist)

[0.8, 0.2]


# Exploration Strategies

## Epsilon-greedy

In [6]:
# Fresh per-arm statistics for the epsilon-greedy run: three independent
# length-2 arrays (pull counts, cumulative rewards, mean-reward estimates).
count, sum_rewards, Q = (np.zeros(2) for _ in range(3))

In [7]:
# Number of arm pulls to simulate in the epsilon-greedy run.
num_rounds = 100

In [8]:
def epsilon_greedy(epsilon):
    """Pick an arm with the epsilon-greedy rule.

    A single uniform draw in [0, 1) decides the mode: when it falls below
    ``epsilon`` an arm is sampled from the environment's action space,
    otherwise the arm with the largest current estimate in the module-level
    ``Q`` array is returned.

    Args:
        epsilon: Probability of exploring instead of exploiting.

    Returns:
        Index of the selected arm.
    """
    explore = np.random.uniform(0, 1) < epsilon
    return env.action_space.sample() if explore else np.argmax(Q)

In [9]:
# Simulate num_rounds pulls with exploration probability 0.5, keeping a
# running mean reward for whichever arm was played.
for round_idx in range(num_rounds):
    arm = epsilon_greedy(epsilon=0.5)
    _obs, reward, _done, _info = env.step(arm)
    sum_rewards[arm] += reward
    count[arm] += 1
    Q[arm] = sum_rewards[arm] / count[arm]

In [10]:
# Mean observed reward per arm after the epsilon-greedy run.
Q

array([0.88461538, 0.13636364])

In [12]:
# Arms are reported 1-based, hence the +1 on the argmax index.
best_arm = np.argmax(Q) + 1
print('The optimal arm is arm {}'.format(best_arm))

The optimal arm is arm 1


## Softmax Exploration

In [13]:
# Reset the per-arm statistics before the softmax run.
count = np.zeros(2)                # pulls per arm
sum_rewards = np.zeros_like(count)  # cumulative reward per arm
Q = np.zeros_like(count)            # mean-reward estimate per arm

In [14]:
# Number of arm pulls to simulate in the softmax run.
num_rounds = 100

In [15]:
def softmax(T):
    """Return softmax (Boltzmann) selection probabilities over the arms.

    Each arm's probability is proportional to ``exp(Q[arm] / T)``, where ``Q``
    is the module-level array of estimated arm values.

    Args:
        T: Temperature; must be non-zero. For positive values, a larger T
            flattens the distribution and a smaller T concentrates it on the
            arm with the highest estimate.

    Returns:
        A numpy array with one probability per entry of ``Q``, summing to 1.
    """
    scaled = np.asarray(Q, dtype=float) / T
    # Shifting by the maximum leaves the normalised result unchanged but keeps
    # np.exp from overflowing to inf (which would turn the probabilities into
    # nan) when T is very small.
    weights = np.exp(scaled - np.max(scaled))
    return weights / np.sum(weights)

In [16]:
def choose(probs):
    """Draw one arm index at random according to ``probs``.

    Args:
        probs: Selection probability for each arm, in arm order.

    Returns:
        Index of the sampled arm, in ``range(env.action_space.n)``.
    """
    n_arms = env.action_space.n
    return np.random.choice(n_arms, p=probs)

In [17]:
def pick(T):
    """Select an arm by softmax exploration at temperature ``T``.

    Computes the selection probabilities with ``softmax(T)`` and samples an
    arm from them with ``choose``.

    Args:
        T: Temperature passed through to ``softmax``.

    Returns:
        Index of the sampled arm.
    """
    return choose(softmax(T))

In [18]:
# Starting temperature for softmax exploration (annealed during the run).
T = 50

In [19]:
# Anneal a working copy of the starting temperature rather than T itself, so
# re-running this cell always begins from T instead of silently continuing
# from the value left behind by a previous run.
temperature = T
for i in range(num_rounds):
    arm = pick(temperature)
    next_state, reward, done, info = env.step(arm)

    # Update the running mean reward of the arm that was played.
    count[arm] += 1
    sum_rewards[arm] += reward
    Q[arm] = sum_rewards[arm] / count[arm]

    # Cool down by 1% per round: exploration shrinks as estimates improve.
    temperature *= 0.99

In [20]:
# Mean observed reward per arm after the softmax run.
Q

array([0.70833333, 0.17307692])

In [21]:
# Report the arm with the highest estimate, numbered from 1.
top_index = np.argmax(Q)
print('The optimal arm is arm {}'.format(top_index + 1))

The optimal arm is arm 1


## Upper Confidence Bound

In [22]:
# Reset the per-arm statistics before the UCB run: pull counts, cumulative
# rewards and mean-reward estimates, each an independent length-2 array.
count, sum_rewards, Q = [np.zeros(2) for _ in range(3)]

In [23]:
# Number of arm pulls to simulate in the UCB run.
num_rounds = 100

In [24]:
def UCB(i):
    """Select an arm with the UCB1 rule.

    For the first ``env.action_space.n`` rounds each arm is played once, in
    index order, so every arm has a non-zero pull count before the confidence
    bonus is computed. After that the arm maximising
    ``Q[arm] + sqrt(2 * ln(total_pulls) / count[arm])`` is returned.

    Reads the module-level ``env``, ``Q`` and ``count``.

    Args:
        i: Zero-based index of the current round.

    Returns:
        Index of the arm to pull.
    """
    n_arms = env.action_space.n
    # Warm-up phase. The threshold used to be a hard-coded 2, which disagreed
    # with the per-arm loop over env.action_space.n for any other arm count.
    if i < n_arms:
        return i

    pulls = np.asarray(count, dtype=float)
    estimates = np.asarray(Q, dtype=float)
    # Exploration bonus: shrinks as an arm is pulled more often and grows
    # slowly with the total number of pulls.
    bonus = np.sqrt(2 * np.log(np.sum(pulls)) / pulls)
    return np.argmax(estimates + bonus)

In [25]:
# Simulate num_rounds pulls, letting UCB pick the arm from the round index,
# and keep a running mean reward for whichever arm was played.
for round_idx in range(num_rounds):
    arm = UCB(round_idx)
    _obs, reward, _done, _info = env.step(arm)
    sum_rewards[arm] += reward
    count[arm] += 1
    Q[arm] = sum_rewards[arm] / count[arm]

In [26]:
# Mean observed reward per arm after the UCB run.
Q

array([0.79545455, 0.25      ])

In [27]:
# Convert the zero-based argmax index into a 1-based arm number for display.
arm_number = np.argmax(Q) + 1
print('The optimal arm is arm {}'.format(arm_number))

The optimal arm is arm 1


## Thompson Sampling

In [33]:
# Reset the per-arm statistics before the Thompson sampling run.
count = np.zeros(shape=2)        # pulls per arm
sum_rewards = np.zeros(shape=2)  # cumulative reward per arm
Q = np.zeros(shape=2)            # mean-reward estimate per arm

In [34]:
# Per-arm Beta parameters, both starting at 1 for each of the two arms.
alpha, beta = np.ones(2), np.ones(2)

In [35]:
# Number of arm pulls to simulate in the Thompson sampling run.
num_rounds = 100

In [36]:
def thompson_sampling(alpha, beta):
    """Select an arm by Thompson sampling over per-arm Beta distributions.

    One value is drawn from ``Beta(alpha[k] + 1, beta[k] + 1)`` for every arm
    ``k`` and the arm with the largest draw is returned. The number of arms is
    taken from the arguments rather than being fixed at two.

    Note: 1 is added to each parameter before sampling, so callers whose
    ``alpha``/``beta`` already include a prior pseudo-count effectively sample
    with one extra pseudo-count on each side.

    Args:
        alpha: Per-arm first shape parameters (before the +1).
        beta: Per-arm second shape parameters (before the +1).

    Returns:
        Index of the arm with the highest sampled value.
    """
    samples = [np.random.beta(a + 1, b + 1) for a, b in zip(alpha, beta)]
    return np.argmax(samples)

In [37]:
# Simulate num_rounds pulls with Thompson sampling, tracking the running mean
# reward per arm and updating the Beta parameters of the arm that was played.
for round_idx in range(num_rounds):
    arm = thompson_sampling(alpha, beta)
    _obs, reward, _done, _info = env.step(arm)
    sum_rewards[arm] += reward
    count[arm] += 1
    Q[arm] = sum_rewards[arm] / count[arm]

    # A reward of exactly 1 increments alpha; any other reward increments beta.
    updated = alpha if reward == 1 else beta
    updated[arm] = updated[arm] + 1

In [38]:
# Highest-estimate arm, shifted from a zero-based index to a 1-based label.
best_index = np.argmax(Q)
print('The optimal arm is arm {}'.format(best_index + 1))

The optimal arm is arm 1


In [39]:
# Mean observed reward per arm after the Thompson sampling run.
Q

array([0.86666667, 0.5       ])