In [1]:
import random
from itertools import product

In [2]:
COMBINATIONS = list(product(*[[1,2] for _ in range(3)]))

In [3]:
P_CLICK = [0.1, 0.12, 0.09, 0.095, 0.06, 0.01, 0.05, 0.08]

In [4]:
class Env:
    def play(self, action):
        return int(random.random() < P_CLICK[action])

In [5]:
env = Env()

In [6]:
action = 0
print('Combination', COMBINATIONS[action])
print('P(Click)', P_CLICK[action])

Combination (1, 1, 1)
P(Click) 0.1


In [7]:
NUMBER_OF_ARMS = len(COMBINATIONS)

In [8]:
from copy import copy

In [9]:
import math

In [10]:
class UGapEb:
    def __init__(self, ϵ, m, n, a):
        self.ϵ = ϵ
        self.m = m
        self.n = n
        self.a = a
        
        self.history = {arm: [] for arm in range(NUMBER_OF_ARMS)}
        self.play_counts = []
        self.current_play_count = [0 for _ in range(NUMBER_OF_ARMS)]
        self.env = Env()
        
        self.β_history = []
        self.μ_hat_history = []
    
    def run(self):
        for arm in range(NUMBER_OF_ARMS):
            reward = self.env.play(arm)
            self.update(arm, reward)
            
        for t in range(NUMBER_OF_ARMS, self.n):
            self.select_arm(t)
    
    def select_arm(self, t):
        B = self.compute_B()
        print(B)
    
    def update(self, arm, reward):
        self.history[arm].append(reward)
        self.current_play_count[arm] += 1
        self.play_counts.append(copy(self.current_play_count))
        
        # what is b? here it's just 1
        β = [
            0 if self.current_play_count[arm] == 0 else 1 * math.sqrt(self.a / self.current_play_count[arm])
            for arm in range(NUMBER_OF_ARMS)
        ]
        self.β_history.append(β)
        
        μ_hat = [
            0 if self.current_play_count[arm] == 0 else sum(self.history[arm]) / self.current_play_count[arm]
            for arm in range(NUMBER_OF_ARMS)
        ]
        self.μ_hat_history.append(μ_hat)
        
    def compute_B(self):
        U = [self.μ_hat_history[-2][arm] + self.β_history[-2][arm] for arm in range(NUMBER_OF_ARMS)]
        L = [self.μ_hat_history[-2][arm] - self.β_history[-2][arm] for arm in range(NUMBER_OF_ARMS)]
        
        B = []
        for arm in range(NUMBER_OF_ARMS):
            U_not_arm = []
            for i in range(NUMBER_OF_ARMS):
                if i == arm:
                    continue
                U_not_arm.append(U[arm])
                
            U_max_not_arm = max(U_not_arm)
            B.append(U_max_not_arm - L[arm])
        
        return B 

In [11]:
algo = UGapEb(1, 1, 10, 1)
algo.run()

[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0]
[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0]


In [12]:
algo.μ_hat_history

[[0.0, 0, 0, 0, 0, 0, 0, 0],
 [0.0, 1.0, 0, 0, 0, 0, 0, 0],
 [0.0, 1.0, 0.0, 0, 0, 0, 0, 0],
 [0.0, 1.0, 0.0, 0.0, 0, 0, 0, 0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0, 0, 0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0, 0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]]