In [None]:
import numpy as np
import matplotlib.pyplot as plt

from functools import partial

In [None]:
def get_distribution(mu=0, sigma=1):
    """Build a sampler bound to ``np.random.normal(mu, sigma, ...)``.

    Args:
        mu: First positional argument passed to ``np.random.normal``.
        sigma: Second positional argument passed to ``np.random.normal``.

    Returns:
        A ``functools.partial`` object; any arguments given when calling it
        (e.g. a single ``size``) are appended after ``mu`` and ``sigma``.
    """
    # Bind positionally so that a later positional `size` lands in the third slot.
    sampler = partial(np.random.normal, mu, sigma)
    return sampler

def argmax(data):
    """Return the index of the largest element of ``data``.

    Only a strictly greater value replaces the current best, so ties
    resolve to the earliest index. ``data[0]`` is read up front, which
    means an empty sequence raises ``IndexError``.
    """
    best_index, best_value = 0, data[0]

    for position, candidate in enumerate(data):
        if candidate > best_value:
            best_index, best_value = position, candidate

    return best_index

In [None]:
class Button:
    """A button that returns random rewards when called.

    The sampler is obtained from ``get_distribution(mu=..., sigma=...)`` and
    is rebuilt whenever ``mu`` or ``sigma`` is reassigned, so the two
    parameters and the sampler never get out of sync.
    """

    def __rebuild_distribution(self):
        """Recreate the sampler from the currently stored ``mu`` and ``sigma``."""
        self.__distribution = get_distribution(mu=self.__mu, sigma=self.__sigma)

    def __init__(self, mu=0, sigma=1):
        """Store the distribution parameters and build the sampler.

        Args:
            mu: Passed to ``get_distribution`` as ``mu``.
            sigma: Passed to ``get_distribution`` as ``sigma``.
        """
        self.__mu = mu
        self.__sigma = sigma

        self.__rebuild_distribution()

    def __call__(self, size):
        """Return whatever the sampler yields when called with ``size``."""
        return self.__distribution(size)

    @property
    def mu(self):
        """Current ``mu`` parameter."""
        return self.__mu

    @mu.setter
    def mu(self, value):
        # Store the *assigned* value (the setter argument), then refresh the
        # sampler so subsequent presses use the new parameter.
        self.__mu = value
        self.__rebuild_distribution()

    @property
    def sigma(self):
        """Current ``sigma`` parameter."""
        return self.__sigma

    @sigma.setter
    def sigma(self, value):
        self.__sigma = value
        self.__rebuild_distribution()

    def __repr__(self):
        return f'Button(mu={self.__mu}, sigma={self.__sigma})'

In [None]:
class Agent:
    """Base class for agents that press buttons and track value estimates.

    Holds the list of buttons, the step size ``alpha``, a log of received
    rewards and one value estimate ``Q`` per button. Subclasses implement
    the actual decision rule in ``__call__``.
    """

    def __init__(self, buttons_list, alpha):
        """
        Args:
            buttons_list: Sequence of callables the agent can press.
            alpha: Step size used by subclasses when updating ``Q``.
        """
        self.__buttons = buttons_list
        self.__alpha = alpha

        self.__log = []
        # One estimate per button actually supplied, so the agent does not
        # depend on a notebook-level constant being defined and in sync.
        self.__Q = [0] * len(buttons_list)

    def __call__(self):
        """Perform one step; the base class does nothing and returns None."""
        ...

    @property
    def log(self):
        """The list of rewards received so far (the live list, not a copy)."""
        return self.__log

    @property
    def Q(self):
        """The list of per-button value estimates (the live list, not a copy)."""
        return self.__Q

    @property
    def alpha(self):
        """Step size used for value updates."""
        return self.__alpha

    @alpha.setter
    def alpha(self, value):
        self.__alpha = value

In [None]:
class EpsilonGridyAgent(Agent):
    """Agent that presses a random button with probability ``epsilon``.

    On each call a uniform number in [0, 1) is drawn; if it is below
    ``epsilon`` a uniformly random button is pressed, otherwise the button
    with the highest current estimate in ``Q`` is pressed.
    """

    def __init__(self, buttons_list, epsilon=0.05, alpha=0.01):
        """
        Args:
            buttons_list: Sequence of callables the agent can press.
            epsilon: Probability of pressing a random button.
            alpha: Step size for the value update.
        """
        super(EpsilonGridyAgent, self).__init__(buttons_list, alpha)

        self.__epsilon = epsilon

    @property
    def epsilon(self):
        """Probability of pressing a random button."""
        return self.__epsilon

    @epsilon.setter
    def epsilon(self, value):
        # Values outside [0, 1] are ignored and the previous epsilon is kept.
        if 0 <= value <= 1:
            self.__epsilon = value

    def __move(self, button):
        """Press ``button`` once, log the reward and update its estimate."""
        # The base class exposes no public accessor for the buttons, hence
        # the name-mangled attribute.
        value = self._Agent__buttons[button](1)[0]
        self.log.append(value)

        estimates = self.Q
        estimates[button] = estimates[button] + self.alpha * (value - estimates[button])

    def __explore(self):
        """Press a uniformly random button."""
        # Size the draw by the agent's own buttons, not a notebook global.
        move = np.random.randint(len(self._Agent__buttons))
        self.__move(move)

    def __exploit(self):
        """Press the button with the highest current estimate."""
        maximum = argmax(self.Q)
        self.__move(maximum)

    def __call__(self):
        """Perform one step: random button with probability ``epsilon``."""
        value = np.random.uniform(0, 1)

        if value < self.__epsilon:
            self.__explore()
        else:
            self.__exploit()

In [None]:
class OptimisticInitialValueAgent(Agent):
    """Agent that always presses the button with the highest estimate.

    All estimates start at the same ``initial_value``; each press moves the
    pressed button's estimate towards the observed reward by ``alpha``.
    """

    def __init__(self, buttons_list, alpha=0.01, initial_value=None):
        """
        Args:
            buttons_list: Sequence of callables the agent can press.
            alpha: Step size for the value update.
            initial_value: Starting estimate for every button. When omitted,
                ``MAXIMUM_MU + 1`` is used.
        """
        # The base initializer already stores alpha; no extra attribute needed.
        super(OptimisticInitialValueAgent, self).__init__(buttons_list, alpha)

        if initial_value is None:
            initial_value = MAXIMUM_MU + 1

        # Overwrite in place so the base class keeps owning the same list.
        estimates = self.Q
        estimates[:] = [initial_value] * len(estimates)

    def __call__(self):
        """Press the currently best-rated button once and update its estimate."""
        estimates = self.Q
        button = argmax(estimates)
        value = self._Agent__buttons[button](1)[0]

        self.log.append(value)
        estimates[button] = estimates[button] + self.alpha * (value - estimates[button])
        

In [None]:
class UpperConfidenceBoundAgnent(Agent):
    """Agent that adds an exploration bonus to each estimate before choosing.

    For button ``i`` the score is
    ``Q[i] + c * sqrt(log(counter) / N[i])``, where ``counter`` is the number
    of calls so far and ``N[i]`` starts at 1 and grows by one each time the
    button is pressed.
    """

    def __init__(self, buttons_list, c=1, alpha=0.01):
        """
        Args:
            buttons_list: Sequence of callables the agent can press.
            c: Weight of the exploration bonus.
            alpha: Step size for the value update.
        """
        super(UpperConfidenceBoundAgnent, self).__init__(buttons_list, alpha)

        self.__c = c
        # Counts start at 1 so the bonus never divides by zero.
        self.__N = [1] * len(buttons_list)
        self.__counter = 0

    @property
    def c(self):
        """Weight of the exploration bonus."""
        return self.__c

    @c.setter
    def c(self, value):
        # Must write the attribute the bonus formula actually reads.
        self.__c = value

    def __new_Q(self, index):
        """Return the estimate of button ``index`` plus its exploration bonus."""
        bonus = np.sqrt(np.log(self.__counter) / self.__N[index])
        return self.Q[index] + self.__c * bonus

    def __call__(self):
        """Press the button with the highest bonus-adjusted score once."""
        self.__counter += 1
        estimates = self.Q
        new_values = [self.__new_Q(i) for i in range(len(estimates))]

        button = argmax(new_values)
        value = self._Agent__buttons[button](1)[0]
        self.__N[button] += 1

        self.log.append(value)

        estimates[button] = estimates[button] + self.alpha * (value - estimates[button])

In [None]:
# Experiment configuration: how many buttons exist, the range their means
# are drawn from, and how many rounds every agent plays.
BUTTONS_COUNT = 4
MINIMUM_MU, MAXIMUM_MU = 0, 10
ITERATIONS = 10_000

In [None]:
# Each button gets a mean drawn uniformly between MINIMUM_MU and MAXIMUM_MU;
# sigma keeps the Button default.
buttons = [
    Button(mu=np.random.uniform(low=MINIMUM_MU, high=MAXIMUM_MU))
    for _ in range(BUTTONS_COUNT)
]

In [None]:
# All three agents share the same buttons and use their default hyperparameters.
epsilon_greedy = EpsilonGridyAgent(buttons_list=buttons)
optimistic_initial_value = OptimisticInitialValueAgent(buttons_list=buttons)
UCB = UpperConfidenceBoundAgnent(buttons_list=buttons)

In [None]:
def log():
    """Print the mean reward each agent has collected so far.

    The means are computed straight from the agents' reward logs, so the
    function can be called at any point after the agents exist, including
    before any of them has made a move (an empty log prints ``nan``).
    """
    def mean_reward(agent):
        rewards = agent.log
        # np.mean of an empty sequence emits a RuntimeWarning; return nan quietly.
        return np.mean(rewards) if len(rewards) else float('nan')

    print(f'epsilon greedy: {mean_reward(epsilon_greedy)}')
    print(f'optimistic initial value {mean_reward(optimistic_initial_value)}')
    print(f'UCB {mean_reward(UCB)}')

In [None]:
# Let every agent make one move per round; every 500 rounds print the round
# number followed by the running report.
for i in range(ITERATIONS):
    if not i % 500:
        print(i)
        log()
        print('-' * 50)

    for agent in (epsilon_greedy, optimistic_initial_value, UCB):
        agent()

In [None]:
# Snapshot each agent's reward log as a NumPy array.
epsilon_greedy_log, optimistic_initial_value_log, UCB_log = (
    np.array(agent.log)
    for agent in (epsilon_greedy, optimistic_initial_value, UCB)
)

In [None]:
# Plot the reward each agent received at every step. Labels and a legend make
# the figure readable on its own; colors are unchanged.
fig, ax = plt.subplots()
ax.plot(epsilon_greedy.log, color='blue', label='epsilon greedy')
ax.plot(optimistic_initial_value.log, color='red', label='optimistic initial value')
ax.plot(UCB.log, color='green', label='UCB')
ax.set(xlabel='iteration', ylabel='reward', title='Reward per iteration')
ax.legend();

In [None]:
# Print the mean reward collected by each agent.
log()

In [None]:
# For every button, show its parameters next to the estimate each agent
# ended up with.
for i in range(BUTTONS_COUNT):
    print(
        buttons[i],
        f'epsilon greedy: {epsilon_greedy.Q[i]}',
        f'optimistic initial value: {optimistic_initial_value.Q[i]}',
        f'UCB: {UCB.Q[i]}',
        '-' * 50,
        sep='\n',
    )