In [None]:
# Install/upgrade kaggle-environments with the %pip magic so the package lands
# in the environment of the running kernel (a `!pip` subshell may resolve to a
# different interpreter).
%pip install --upgrade -q kaggle-environments

# $\varepsilon$-Greedy

In [None]:
%%writefile epsilon_greedy.py

import math
import random
import numpy as np

# Probability of taking a uniformly random (exploratory) action.
epsilon = 0.15

# Per-episode bookkeeping; agent() fills these in on step 0.
Q = rewards = n_chosen = last_action = None
total_reward = 0

# Seed the stdlib RNG that agent() draws from.
random.seed(2020)

# agent
def agent(observation, configuration):
    """Epsilon-greedy agent for the multi-armed-bandit environment.

    Keeps a sample-average value estimate ``Q[a]`` per bandit and, on every
    call, explores with probability ``epsilon`` or otherwise exploits the
    bandit with the highest estimate.

    Args:
        observation: object exposing ``step`` (0 on the first call) and
            ``reward``; ``reward`` is treated as the cumulative reward so far,
            so the last step's reward is its difference to ``total_reward``.
        configuration: object exposing ``banditCount``.

    Returns:
        int: index of the bandit to pull, in ``[0, banditCount)``.
    """
    global Q, rewards, total_reward, n_chosen, last_action

    n_bandits = configuration.banditCount

    if observation.step == 0:
        # Fresh episode: reset all statistics. No action has been taken yet,
        # so there is nothing to learn from on this call.
        n_chosen = [0] * n_bandits
        rewards = [0] * n_bandits
        Q = [0] * n_bandits
        total_reward = 0
        last_action = None
    else:
        # Credit the reward earned since the previous call to the arm we
        # pulled then. Doing this only after a real pull avoids recording a
        # phantom zero-reward sample, which would bias that arm's average.
        r = observation.reward - total_reward
        n_chosen[last_action] += 1
        # incremental sample-average update
        Q[last_action] += (r - Q[last_action]) / n_chosen[last_action]
        total_reward += r
        rewards[last_action] += r

    # epsilon-greedy action selection
    if random.random() < epsilon:
        # random action, prob epsilon
        action = random.randint(0, n_bandits - 1)
    else:
        # greedy action: max Q (np.argmax returns the lowest index on ties)
        action = int(np.argmax(Q))

    last_action = action

    return action

## Simulation

In [None]:
# Create the "mab" environment once; the simulation cells further down reuse
# this `env` object (they call env.reset() before each run).
# NOTE(review): debug=True is forwarded to make(); presumably it surfaces agent
# errors/prints during a run — confirm against the kaggle-environments docs.
from kaggle_environments import make
env = make("mab", debug=True)

In [None]:
# Reset the shared environment, run the two agent files against each other,
# and render the result inline in the notebook.
# epsilon_greedy.py is the file written by the %%writefile cell above.
# NOTE(review): assumes ../input/santa-2020/submission.py exists (i.e. the
# corresponding Kaggle input is attached) — verify before running elsewhere.
env.reset()
env.run(["../input/santa-2020/submission.py", "epsilon_greedy.py"])
env.render(mode="ipython", width=800, height=500)

# Upper-Confidence-Bound

In [None]:
%%writefile ucb.py

import math
import random
import numpy as np

# params
# c scales the exploration bonus; eps keeps the bonus denominator non-zero
# for bandits that have not been pulled yet.
c = 2
eps = 1e-9

# Per-episode bookkeeping; agent() fills these in on step 0.
Q = rewards = n_chosen = last_action = None
total_reward = 0

# Seed both RNG streams.
np.random.seed(2020)
random.seed(2020)

# agent
def agent(observation, configuration):
    """Upper-confidence-bound agent for the multi-armed-bandit environment.

    Keeps a sample-average value estimate ``Q[a]`` and a pull count
    ``n_chosen[a]`` per bandit, and picks the bandit maximising
    ``Q[a] + c * sqrt(log(t + 1) / (n_chosen[a] + eps))``.

    Args:
        observation: object exposing ``step`` (0 on the first call) and
            ``reward``; ``reward`` is treated as the cumulative reward so far,
            so the last step's reward is its difference to ``total_reward``.
        configuration: object exposing ``banditCount``.

    Returns:
        int: index of the bandit to pull, in ``[0, banditCount)``.
    """
    global Q, rewards, total_reward, n_chosen, last_action

    n_bandits = configuration.banditCount

    if observation.step == 0:
        # Fresh episode: reset all statistics. No action has been taken yet,
        # so there is nothing to learn from on this call.
        n_chosen = [0] * n_bandits
        rewards = [0] * n_bandits
        Q = [0] * n_bandits
        total_reward = 0
        last_action = None
    else:
        # Credit the reward earned since the previous call to the arm pulled
        # then. Updating only after a real pull keeps the counts honest: a
        # phantom pull would distort both Q and the confidence bonus.
        r = observation.reward - total_reward
        n_chosen[last_action] += 1
        # incremental sample-average update
        Q[last_action] += (r - Q[last_action]) / n_chosen[last_action]
        total_reward += r
        rewards[last_action] += r

    # ucb
    t = observation.step
    # t + 1 keeps the logarithm defined on the very first step (log(1) == 0).
    log_t = math.log(t + 1)
    # eps makes the bonus of never-pulled bandits huge instead of dividing by 0,
    # so every bandit gets tried before the estimates matter.
    Q_ucb = [qa + c * math.sqrt(log_t / (na + eps)) for qa, na in zip(Q, n_chosen)]
    action = int(np.argmax(Q_ucb))

    last_action = action

    return action

## Simulation

In [None]:
# Reset the shared environment, run the two agent files against each other,
# and render the result inline in the notebook.
# ucb.py is the file written by the %%writefile cell above.
# NOTE(review): assumes ../input/santa-2020/submission.py exists (i.e. the
# corresponding Kaggle input is attached) — verify before running elsewhere.
env.reset()
env.run(["../input/santa-2020/submission.py", "ucb.py"])
env.render(mode="ipython", width=800, height=500)

# Gradient Bandit

In [None]:
%%writefile gradient_bandit.py

import math
import random
import numpy as np

# params
# alpha is the step size applied to the preference updates in agent().
alpha = 0.1
eps = 1e-9

# Per-episode bookkeeping; agent() fills in what it uses on step 0.
H = rewards = n_chosen = last_action = pi = None
total_reward = 0

# Seed both RNG streams.
np.random.seed(2020)
random.seed(2020)

def policy(H):
    """Return the softmax distribution over the action preferences ``H``.

    Args:
        H: sequence/array of numeric preferences, one per action.

    Returns:
        numpy.ndarray: probabilities ``exp(H) / sum(exp(H))`` with the same
        shape as ``H``; an empty array if ``H`` is empty.
    """
    prefs = np.asarray(H, dtype=float)
    if prefs.size == 0:
        return prefs
    # Softmax is invariant to a constant shift; subtracting the maximum keeps
    # every exponent <= 0 so large preferences cannot overflow to inf/nan.
    weights = np.exp(prefs - np.max(prefs))
    return weights / np.sum(weights)

# agent
def agent(observation, configuration):
    """Gradient-bandit agent for the multi-armed-bandit environment.

    Maintains a numeric preference ``H[a]`` per bandit, turns the preferences
    into a softmax distribution ``pi`` via ``policy`` and samples the next
    action from it. After each reward the preferences are moved by
    stochastic gradient ascent, using the mean reward so far as baseline.

    Args:
        observation: object exposing ``step`` (0 on the first call) and
            ``reward``; ``reward`` is treated as the cumulative reward so far,
            so the last step's reward is its difference to ``total_reward``.
        configuration: object exposing ``banditCount``.

    Returns:
        int: index of the bandit to pull, in ``[0, banditCount)``.
    """
    global H, rewards, total_reward, n_chosen, last_action, pi

    n_bandits = configuration.banditCount

    if observation.step == 0:
        # Fresh episode: zero preferences give a uniform starting policy.
        n_chosen = [0] * n_bandits
        rewards = [0] * n_bandits
        H = np.zeros(n_bandits)
        total_reward = 0
    else:
        # Reward earned by the action chosen on the previous call.
        r = observation.reward - total_reward
        # Baseline: mean of the `step` rewards observed so far (step >= 1 here).
        r_bar = observation.reward / observation.step
        advantage = r - r_bar

        # Preference update, using the policy `pi` the action was drawn from:
        #   H[a] += alpha * advantage * (1 - pi[a])   for the chosen action
        #   H[a] -= alpha * advantage * pi[a]         for every other action
        H = H - alpha * advantage * pi
        H[last_action] += alpha * advantage

        # update rewards
        total_reward += r
        rewards[last_action] += r
        n_chosen[last_action] += 1

    # update policy
    pi = policy(H)
    # Sample from the policy; always taking the arg-max would never explore
    # and would lock onto bandit 0 under the initial uniform policy.
    action = int(np.random.choice(len(pi), p=pi))
    last_action = action

    return action

## Simulation

In [None]:
# Reset the shared environment, run the two agent files against each other,
# and render the result inline in the notebook.
# gradient_bandit.py is the file written by the %%writefile cell above.
# NOTE(review): assumes ../input/santa-2020/submission.py exists (i.e. the
# corresponding Kaggle input is attached) — verify before running elsewhere.
env.reset()
env.run(["../input/santa-2020/submission.py", "gradient_bandit.py"])
env.render(mode="ipython", width=800, height=500)