In [1]:
import random

# Rewards handed out by the simulated website.
REWARD_CLICK = 1     # returned when the simulated visitor clicks
REWARD_NO_CLICK = 0  # returned otherwise


class WebsiteEnvironmentSimple:
    """Simulated website in which every action has a fixed click probability.

    `proba` is stored unchanged and is indexed by the action in `do`, so
    `proba[action]` must be the click probability of that action.
    """

    def __init__(self, proba):
        self.proba = proba

    def do(self, action):
        """Play `action` once and return the reward it produced.

        A single uniform sample is drawn with `random.random()`; the reward is
        REWARD_CLICK when the sample falls below `self.proba[action]` and
        REWARD_NO_CLICK otherwise.
        """
        clicked = random.random() < self.proba[action]
        if clicked:
            return REWARD_CLICK
        return REWARD_NO_CLICK

In [2]:
from abc import ABC, abstractmethod


class BanditBase(ABC):
    """Abstract base class for bandit agents acting on K arms.

    Keeps the arm indices 0..K-1, a reference to the environment, and a
    chronological log of every [arm, reward] pair passed to `update`.
    """

    def __init__(self, K, env):
        self.arms = [index for index in range(K)]
        self.env = env
        self.history = []

    @abstractmethod
    def get_action(self):
        """Return the arm to play next; concrete bandits must override this."""
        raise NotImplementedError()

    def update(self, arm, reward):
        """Log the observation, then hand it to the subclass hook `_update`."""
        observation = [arm, reward]
        self.history.append(observation)
        self._update(arm, reward)

    @abstractmethod
    def _update(self, arm, reward):
        """Subclass hook called by `update` once the observation is logged."""
        raise NotImplementedError()


In [75]:
import random
import numpy as np
import math
from abc import abstractmethod

class UGapEBandit(BanditBase):
    """Arm-selection logic shared by the UGapE variants (m-best-arm identification).

    For every arm k the class maintains the empirical mean of its rewards and
    a confidence interval [L_k, U_k] = [mean_k - beta_k, mean_k + beta_k],
    where the width beta_k comes from the subclass hook `get_arm_beta`.
    From these it derives B_k, an upper bound on the gap of arm k, and uses it
    to decide which arm to pull next.

    Parameters
    ----------
    ϵ : accuracy parameter; stored but not read by any method of this class.
    m : number of arms to identify; must satisfy 1 <= m < K.
    *args, **kwargs : forwarded to `BanditBase.__init__` (K, env).

    Raises
    ------
    ValueError
        If `m` is outside 1..K-1; the selection rule needs at least one arm
        inside and one arm outside the candidate set.
    """

    def __init__(self, ϵ, m, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if not 1 <= m < len(self.arms):
            raise ValueError(
                f"m must satisfy 1 <= m < K (got m={m}, K={len(self.arms)})"
            )
        self.ϵ = ϵ
        self.m = m

        # Rewards observed per arm; len(self.rewards[arm]) is the number of
        # times that arm has been played so far.
        self.rewards = {arm: [] for arm in self.arms}

    def get_action(self):
        """Return the arm to pull next.

        Every arm is played once first. Afterwards the UGapE rule is applied:
        J(t) is the set of the m arms with the smallest bound B_k, u_t is the
        arm outside J(t) with the largest upper bound, l_t is the arm inside
        J(t) with the smallest lower bound, and whichever of the two has the
        wider confidence interval is pulled.
        """
        # First, all arms have to be initialized by playing them once
        for arm, rewards in self.rewards.items():
            if len(rewards) == 0:
                return arm

        lower, upper, beta = self._get_confidence_bounds()
        B_t = self._get_regret_bound(lower, upper)

        # sorted() is stable, so ties on B_k are broken towards the lower arm index.
        J_t = sorted(self.arms, key=lambda k: B_t[k])[:self.m]
        outside_J_t = [arm for arm in self.arms if arm not in J_t]

        u_t = max(outside_J_t, key=lambda k: upper[k])
        l_t = min(J_t, key=lambda k: lower[k])

        # Pull the more uncertain of the two critical arms.
        return max((l_t, u_t), key=lambda k: beta[k])

    def get_regret_bound(self):
        """Return [B_k for each arm k], in the order of `self.arms`.

        B_k is the m-th largest upper bound among the other arms minus the
        lower bound of arm k. For m == 1 this is max_{i != k} U_i - L_k.
        Every arm must have been played at least once.
        """
        lower, upper, _ = self._get_confidence_bounds()
        B_t = self._get_regret_bound(lower, upper)
        return [B_t[arm] for arm in self.arms]

    def _get_confidence_bounds(self):
        """Return per-arm dicts (lower, upper, beta) of the confidence intervals."""
        lower, upper, beta = {}, {}, {}
        for arm in self.arms:
            beta[arm] = self.get_arm_beta(arm)
            mean_reward = np.mean(self.rewards[arm])
            lower[arm] = mean_reward - beta[arm]
            upper[arm] = mean_reward + beta[arm]
        return lower, upper, beta

    def _get_regret_bound(self, lower, upper):
        """Return {k: B_k} computed from the given lower/upper bounds."""
        B_t = {}
        for k in self.arms:
            # Upper bounds of all competitors of k, largest first.
            competitors = sorted(
                (upper[i] for i in self.arms if i != k), reverse=True
            )
            B_t[k] = competitors[self.m - 1] - lower[k]
        return B_t

    def _update(self, arm, reward):
        """Record `reward` as one more observation of `arm`."""
        self.rewards[arm].append(reward)

    @abstractmethod
    def get_arm_beta(self, arm):
        """Return the confidence-interval half-width beta_k for `arm`."""
        raise NotImplementedError()

class UGapEBudgetBandit(UGapEBandit):
    """UGapE variant whose confidence width is sqrt(a / T_k).

    Parameters
    ----------
    ϵ, m : forwarded to `UGapEBandit`.
    n : stored on the instance; not read by this class
        (presumably the pull budget - TODO confirm).
    a : exploration parameter used as the numerator of the confidence width.
    *args, **kwargs : forwarded up the chain (K, env).
    """

    def __init__(self, ϵ, m, n, a, *args, **kwargs):
        # ϵ and m are forwarded positionally: passing them as keywords ahead
        # of *args makes any positional K/env collide with ϵ/m and raise
        # "got multiple values for argument".
        super().__init__(ϵ, m, *args, **kwargs)
        self.n = n
        self.a = a

    def get_arm_beta(self, arm):
        """Return the confidence width sqrt(a / T_k) for `arm`.

        T_k is the number of rewards recorded for the arm. Raises
        ZeroDivisionError if the arm has not been played yet.
        """
        # TODO: make the scale factor b dynamic, dependent on env;
        #   in our case, b is just 1

        # T_k(t-1) is the number of times arm k has been played
        # up until t-1, which we can compute by the length
        # of the rewards list for the arm
        return math.sqrt(self.a / len(self.rewards[arm]))

class UGapEConfidenceBandit(UGapEBandit):
    """UGapE variant parameterized by a confidence level δ.

    Parameters
    ----------
    ϵ, m : forwarded to `UGapEBandit`.
    δ : confidence parameter used inside the logarithm of the width.
    c : exploration parameter scaling the width.
    *args, **kwargs : forwarded up the chain (K, env).
    """

    def __init__(self, ϵ, m, δ, c, *args, **kwargs):
        # ϵ and m are forwarded positionally: passing them as keywords ahead
        # of *args makes any positional K/env collide with ϵ/m and raise
        # "got multiple values for argument".
        super().__init__(ϵ, m, *args, **kwargs)
        self.δ = δ
        self.c = c

    def get_arm_beta(self, arm):
        """Return sqrt(c * log(4 * K * (t-1)**3 / δ) / T_k) for `arm`.

        K is the number of arms, t-1 the total number of recorded rewards and
        T_k the number recorded for `arm`. The scale factor b is taken as 1.
        Raises ZeroDivisionError if the arm has not been played yet.

        NOTE(review): formula follows the UGapEc definition in Gabillon et al.
        (2012) - confirm the constants against the paper.
        """
        K = len(self.arms)
        total_pulls = sum(len(rewards) for rewards in self.rewards.values())
        arm_pulls = len(self.rewards[arm])
        log_term = math.log(4 * K * total_pulls ** 3 / self.δ)
        return math.sqrt(self.c * log_term / arm_pulls)


In [76]:
# Click probability of each website variant; entry i is used for action i.
conversion_rates = [0.15, 0.13]
env = WebsiteEnvironmentSimple(proba=conversion_rates)

In [77]:
# K is derived from the environment's rates so the bandit's arm count cannot
# drift out of sync with the number of variants being simulated.
bandit = UGapEBudgetBandit(ϵ=0.3, m=1, n=2, a=1, K=len(conversion_rates), env=env)

In [86]:
# Five rounds of the interaction loop: ask the bandit for an arm, sample a
# reward for it from the environment, and feed the observation back.
# NOTE: every run of this cell appends five more observations to the same
# `bandit` object, so re-running it changes the statistics shown below.
for _ in range(5):
    action = bandit.get_action()
    reward = env.do(action)
    bandit.update(action, reward)

[1]
[0]
[0]
[0]
[0]


In [87]:
# Rewards recorded so far, grouped by arm.
bandit.rewards

{0: [0, 0, 0, 0, 0, 0, 0, 0], 1: [0, 0, 0, 0, 0, 1, 0]}

In [88]:
# Per-arm bound values computed from the bandit's current reward statistics.
B_t = bandit.get_regret_bound()

In [89]:
# Display the per-arm bounds (one entry per arm).
B_t

[0.8743750064596438, 0.5886607207453581]

In [90]:
# Second entry of the arms ordered by descending bound; with the two arms
# used here that is the arm with the smallest bound.
np.argsort(B_t)[::-1][1]

1

In [53]:
# Manual check of the confidence width of arm 0. The exploration parameter is
# read from the bandit instead of a free-standing `a`, which is not defined
# anywhere in the notebook and fails on a fresh kernel.
math.sqrt(bandit.a / len(bandit.rewards[0]))

0.5

In [54]:
# Scratch arithmetic; presumably a lower bound computed by hand as
# (mean reward 0.25) - (width 0.5 from the previous cell) - TODO confirm.
0.25-0.5

-0.25