In [3]:
import numpy as np
import sys
sys.path.append("policy/")
sys.path.append("arm/")

In [4]:
from bernoulli import BernoulliArm
from gaussian import GaussianArm

In [4]:
arm1 = BernoulliArm(0.1)
arm2 = BernoulliArm(0.3)
arm3 = BernoulliArm(0.5)
arm4 = BernoulliArm(0.7)
arm5 = BernoulliArm(0.9)

bin_arms = [arm1,arm2,arm3,arm4,arm5]

In [5]:
arm1 = GaussianArm(0.1,0.1)
arm2 = GaussianArm(0.3,0.1)
arm3 = GaussianArm(0.5,0.1)
arm4 = GaussianArm(0.7,0.1)
arm5 = GaussianArm(0.9,0.1)
gauss_arms = [arm1,arm2,arm3,arm4,arm5]

In [None]:
import egreedy

In [5]:
# epsilon greedy 
epsilon_greedy = egreedy.EpsilonGreedy(5,epsilon=0.1)

total_reward = 0
for _ in range(100000):
    draw_arm = epsilon_greedy.select_arm()
    reward = bin_arms[draw_arm].pull()
    total_reward += reward
    epsilon_greedy.update_states(draw_arm,reward)

total_draw = np.sum(epsilon_greedy.drawn_counts)
regret = np.random.binomial(total_draw,0.9) - total_reward
print("epsilon:",epsilon_greedy.epsilon)
print("expected reward:",epsilon_greedy.expected_reward)
print("drawn counts:",epsilon_greedy.drawn_counts)
print("regret:",regret)

epsilon: 0.1
expected reward: [ 0.09031282  0.30493274  0.50203459  0.69713712  0.89920047]
drawn counts: [ 1982  2007  1966  1991 92054]
regret: 4050


In [6]:
# epsilon greedy with decay 
decay_egreedy = egreedy.DecayEpsilonGreedy(5)

total_reward = 0
for _ in range(100000):
    draw_arm = decay_egreedy.select_arm()
    reward = bin_arms[draw_arm].pull()
    total_reward += reward
    decay_egreedy.update_states(draw_arm,reward)

total_draw = np.sum(decay_egreedy.drawn_counts)
regret = np.random.binomial(total_draw,0.9) - total_reward
print("epsilon:",decay_egreedy.get_epsilon())
print("expected reward:",decay_egreedy.expected_reward)
print("drawn counts:",decay_egreedy.drawn_counts)
print("regret:",regret)

epsilon: 0.000499750124938
expected reward: [0.07894736842105265, 0.25757575757575757, 0.4999999999999999, 0.6567164179104478, 0.8990686658236435]
drawn counts: [76, 66, 82, 134, 99642]
regret: 239


In [6]:
import thompson

In [11]:
# Thompson sampling for binary reward

binary_ts = thompson.BinaryThompsonSampling(5)

total_play = 100000
total_reward = 0
for _ in range(total_play):
    i = binary_ts.select_arm()
    reward = bin_arms[i].pull()
    total_reward += reward
    binary_ts.update_states(i,reward)

#total_play = np.sum([binary_ts.alpha,binary_ts.beta]) - 10
regret = np.random.binomial(total_play,0.9) - total_reward
print("alpha:",binary_ts.alpha)
print("beta:",binary_ts.beta)
print("regret:",regret)

alpha: [    1     8    21     9 89845]
beta: [    6     9    14     9 10088]
regret: 186


In [7]:
# Thompson sampling for gaussian(normal) reward 

gaussian_ts = thompson.GaussianThompsonSampling(5)

total_play = 100000
total_reward = 0
for _ in range(total_play):
    i = gaussian_ts.select_arm()
    reward = gauss_arms[i].pull()
    total_reward += reward
    gaussian_ts.update_states(i,reward)
    
regret = np.random.normal(0.9,0.1)
print("mu:",gaussian_ts.mu)
print("k(the number of plays of arms):\n",gaussian_ts.k)
print("regret:",regret)

mu: [ 0.153502    0.13340338  0.26153544  0.33881326  0.36639793]
k(the number of plays of arms):
 [  1.90000000e+01   1.80000000e+01   3.70000000e+01   1.42000000e+02
   9.97840000e+04]
regret: 0.908866509835254


In [12]:
total_regret = 0
n_repeat = 100
for _ in range(n_repeat):
    epsilon_greedy = egreedy.EpsilonGreedy(5,epsilon=0.1)
    total_reward = 0
    for _ in range(10000):
        draw_arm = epsilon_greedy.select_arm()
        reward = bin_arms[draw_arm].pull()
        total_reward += reward
        epsilon_greedy.update_states(draw_arm,reward)
    total_draw = np.sum(epsilon_greedy.drawn_counts)
    regret = np.random.binomial(total_draw,0.9) - total_reward
    total_regret += regret
print("EpsilonGreedy's average regret:",total_regret/n_repeat)

EpsilonGreedy's average regret: 440.11


In [13]:
total_regret = 0
n_repeat = 100
for _ in range(n_repeat):
    decay_egreedy = egreedy.DecayEpsilonGreedy(5)
    total_reward = 0
    for _ in range(10000):
        draw_arm = decay_egreedy.select_arm()
        reward = bin_arms[draw_arm].pull()
        total_reward += reward
        decay_egreedy.update_states(draw_arm,reward)
    total_draw = np.sum(decay_egreedy.drawn_counts)
    regret = np.random.binomial(total_draw,0.9) - total_reward
    total_regret += regret
print("DecayEpsilonGreedy's average regret:",total_regret/n_repeat)

DecayEpsilonGreedy's average regret: 110.9


In [14]:
total_regret = 0
n_repeat = 100
for _ in range(n_repeat):
    tsampling = thompson.BinaryThompsonSampling(5)
    total_reward = 0
    for _ in range(10000):
        i = tsampling.select_arm()
        reward = bin_arms[i].pull()
        total_reward += reward
        tsampling.update_states(i,reward)
    total_play = np.sum([tsampling.alpha,tsampling.beta]) - 10
    regret = np.random.binomial(total_play,0.9) - total_reward
    total_regret += regret
print("TS(binary reward)'s average regret:",total_regret/n_repeat)

TS's average regret: 17.63


In [15]:
total_regret = 0
n_repeat = 100
for _ in range(n_repeat):
    tsampling = thompson.GaussianThompsonSampling(5)
    total_reward = 0
    total_play = 10000
    for _ in range(total_play):
        i = tsampling.select_arm()
        reward = gauss_arms[i].pull()
        total_reward += reward
        tsampling.update_states(i,reward)
    optimal = sum([gauss_arms[-1].pull() for _ in range(total_play)])
    regret = optimal - total_reward
    total_regret += regret
print("TS(gaussian reward)'s average regret:",total_regret/n_repeat)

TS(gaussian reward)'s average regret: 62.848398420765044


In [15]:
import multiplay_thompson

In [16]:
total_regret = 0
n_repeat = 100
for _ in range(n_repeat):
    mp_ts = multiplay_thompson.BinaryMultiplayTS(5,2)
    total_reward = 0
    for _ in range(10000):
        selections = mp_ts.select_arm()
        for i in selections:
            reward = bin_arms[i].pull()
            total_reward += reward
            mp_ts.update_states(i,reward)
    regret = np.random.binomial(10000,0.9)+np.random.binomial(10000,0.7) - total_reward
    total_regret += regret
print("MP-TS's average regret:",total_regret/n_repeat)

MP-TS's average regret: 22.78


In [3]:
import klucb

In [5]:
total_regret = 0
n_repeat = 100
for _ in range(n_repeat):
    kl_ucb = klucb.KLUCB(5)
    total_reward = 0
    for _ in range(10000):
        i = kl_ucb.select_arm()
        reward = bin_arms[i].pull()
        total_reward += reward
        kl_ucb.update_states(i,reward)
    regret = np.random.binomial(10000,0.9) - total_reward
    total_regret += regret
print("KL-UCB's average regret:",total_regret/n_repeat)

KL-UCB's average regret: 78.61


In [17]:
np.random.normal([0.1,0.5],1/np.array([5,10])+1)

array([-0.62647086,  1.12518978])