In [1]:
import numpy as np
import sys
sys.path.append("policy/")
sys.path.append("arm/")
import egreedy
from bernoulli import BernoulliArm

In [2]:
arm1 = BernoulliArm(0.1)
arm2 = BernoulliArm(0.3)
arm3 = BernoulliArm(0.5)
arm4 = BernoulliArm(0.7)
arm5 = BernoulliArm(0.9)

arms = [arm1,arm2,arm3,arm4,arm5]

In [3]:
epsilon_greedy = egreedy.EpsilonGreedy(5,epsilon=0.1)
decay_egreedy = egreedy.DecayEpsilonGreedy(5)

In [4]:
total_reward = 0
for _ in range(100000):
    draw_arm = epsilon_greedy.select_arm()
    reward = arms[draw_arm].pull()
    total_reward += reward
    epsilon_greedy.update_states(draw_arm,reward)

In [5]:
total_draw = np.sum(epsilon_greedy.drawn_counts)
regret = np.random.binomial(total_draw,0.9) - total_reward
print("epsilon:",epsilon_greedy.epsilon)
print("expected reward:",epsilon_greedy.expected_reward)
print("drawn counts:",epsilon_greedy.drawn_counts)
print("regret:",regret)

epsilon: 0.1
expected reward: [0.09849587578845229, 0.2934131736526947, 0.49081237911025083, 0.7041910331384024, 0.8986549038828047]
drawn counts: [2061, 2004, 2068, 2052, 91815]
regret: 4290


In [6]:
total_reward = 0
for _ in range(100000):
    draw_arm = decay_egreedy.select_arm()
    reward = arms[draw_arm].pull()
    total_reward += reward
    decay_egreedy.update_states(draw_arm,reward)

In [7]:
total_draw = np.sum(decay_egreedy.drawn_counts)
regret = np.random.binomial(total_draw,0.9) - total_reward
print("epsilon:",decay_egreedy.get_epsilon())
print("expected reward:",decay_egreedy.expected_reward)
print("drawn counts:",decay_egreedy.drawn_counts)
print("regret:",regret)

epsilon: 0.000499750124938
expected reward: [0.12328767123287673, 0.3026315789473686, 0.4810126582278482, 0.6989247311827957, 0.9000892866100173]
drawn counts: [73, 76, 79, 93, 99679]
regret: 192


In [8]:
import thompson

In [9]:
tsampling = thompson.BinaryThompsonSampling(5)

In [10]:
total_reward = 0
for _ in range(100000):
    i = tsampling.select_arm()
    reward = arms[i].pull()
    total_reward += reward
    tsampling.update_states(i,reward)

In [11]:
total_play = np.sum([tsampling.alpha,tsampling.beta]) - 10
regret = np.random.binomial(total_play,0.9) - total_reward
print("alpha:",tsampling.alpha)
print("beta:",tsampling.beta)
print("regret:",regret)

alpha: [1, 1, 11, 4, 90073]
beta: [5, 5, 10, 7, 9893]
regret: 36


In [12]:
total_regret = 0
for _ in range(100):
    epsilon_greedy = egreedy.EpsilonGreedy(5,epsilon=0.1)
    total_reward = 0
    for _ in range(10000):
        draw_arm = epsilon_greedy.select_arm()
        reward = arms[draw_arm].pull()
        total_reward += reward
        epsilon_greedy.update_states(draw_arm,reward)
    total_draw = np.sum(epsilon_greedy.drawn_counts)
    regret = np.random.binomial(total_draw,0.9) - total_reward
    total_regret += regret
print("EpsilonGreedy's average regret:",total_regret/100)

EpsilonGreedy's average regret: 428.36


In [13]:
total_regret = 0
for _ in range(100):
    decay_egreedy = egreedy.DecayEpsilonGreedy(5)
    total_reward = 0
    for _ in range(10000):
        draw_arm = decay_egreedy.select_arm()
        reward = arms[draw_arm].pull()
        total_reward += reward
        decay_egreedy.update_states(draw_arm,reward)
    total_draw = np.sum(decay_egreedy.drawn_counts)
    regret = np.random.binomial(total_draw,0.9) - total_reward
    total_regret += regret
print("DecayEpsilonGreedy's average regret:",total_regret/100)

DecayEpsilonGreedy's average regret: 120.39


In [3]:
total_regret = 0
for _ in range(100):
    tsampling = thompson.BinaryThompsonSampling(5)
    total_reward = 0
    for _ in range(10000):
        i = tsampling.select_arm()
        reward = arms[i].pull()
        total_reward += reward
        tsampling.update_states(i,reward)
    total_play = np.sum([tsampling.alpha,tsampling.beta]) - 10
    regret = np.random.binomial(total_play,0.9) - total_reward
    total_regret += regret
print("TS's average regret:",total_regret/100)

NameError: name 'thompson' is not defined

In [4]:
import multiplay_thompson

In [5]:
mp_ts = multiplay_thompson.BinaryMultiplayTS(5,2)
total_reward = 0
for _ in range(100000):
    selections = mp_ts.select_arm()
    for i in selections:
        reward = arms[i].pull()
        total_reward += reward
        mp_ts.update_states(i,reward)

In [6]:
total_regret = 0
for _ in range(100):
    mp_ts = multiplay_thompson.BinaryMultiplayTS(5,2)
    total_reward = 0
    for _ in range(10000):
        selections = mp_ts.select_arm()
        for i in selections:
            reward = arms[i].pull()
            total_reward += reward
            mp_ts.update_states(i,reward)
    regret = np.random.binomial(10000,0.9)+np.random.binomial(10000,0.7) - total_reward
    total_regret += regret
print("MP-TS's average regret:",total_regret/100)

MP-TS's average regret: 19.51
