References:
* [Santa 2020 starter](https://www.kaggle.com/isaienkov/santa-2020-starter/): Re-used writefile magic command and make_env function for creating a simulation.  
* [Lilian's blog post](https://lilianweng.github.io/lil-log/2018/01/23/the-multi-armed-bandit-problem-and-its-solutions.html): Bayesian Implementation of UCB

In [None]:
!pip install kaggle-environments --upgrade -q

## UCB with Decay

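The classic UCB implementation (derived from the original implementation), extended with a decay factor: each observed per-step reward is multiplied by `decay` before it is added to the chosen bandit's reward sum.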

In [None]:
%%writefile ucb_decay.py

import numpy as np

# Factor applied to every per-step reward before it is credited to the bandit
# that produced it.
decay = 0.97
# Value of observation.reward seen on the previous step. The agent treats
# observation.reward as a running total and differences it to get the
# per-step reward.
total_reward = 0
# Index of the bandit chosen on the previous step (None before the first step).
bandit = None
# Per-bandit statistics; (re)allocated at step 0 of every episode.
reward_sums = None
n_selections = None


def agent(observation, configuration):
    """Choose a bandit with an upper-confidence-bound rule on decayed rewards.

    Args:
        observation: Object exposing ``step`` (index of the current step) and
            ``reward`` (treated here as the running total reward).
        configuration: Object exposing ``banditCount``.

    Returns:
        int: Index of the bandit to pull on this step.
    """
    global reward_sums, n_selections, total_reward, bandit

    n_bandits = configuration.banditCount

    if observation.step == 0:
        # Tiny positive values avoid division by zero below. From step 1 on,
        # a bandit that was never pulled gets an enormous exploration bonus,
        # so it is tried before any bandit is repeated.
        n_selections, reward_sums = np.full((2, n_bandits), 1e-32)
        # Start every episode from a clean running total; otherwise a reused
        # module would difference the first reward against the previous
        # episode's final total.
        total_reward = 0
    else:
        # Credit the reward earned by the previous pull to that bandit.
        reward_sums[bandit] += decay * (observation.reward - total_reward)
        total_reward = observation.reward

    avg_reward = reward_sums / n_selections
    # Exploration bonus: shrinks as a bandit is pulled more often and grows
    # slowly with the step count.
    delta_i = np.sqrt(2 * np.log(observation.step + 1) / n_selections)
    bandit = int(np.argmax(avg_reward + delta_i))

    # Count the pull now; its reward is only observed on the next call.
    n_selections[bandit] += 1

    return bandit

## Bayesian UCB

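Based on Lilian's blog post: each bandit keeps a Beta posterior over its reward probability, and the agent picks the bandit with the highest posterior mean plus `c` posterior standard deviations.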

In [None]:
%%writefile bayesian_ucb.py

import numpy as np
from scipy.stats import beta

# Per-bandit Beta posterior parameters (allocated at step 0) and the index of
# the bandit chosen on the previous step.
post_a, post_b, bandit = [None] * 3
# Value of observation.reward seen on the previous step. The agent treats
# observation.reward as a running total and differences it to get the
# per-step reward.
total_reward = 0
# Number of posterior standard deviations added to the posterior mean.
c = 3


def agent(observation, configuration):
    """Choose a bandit with a Bayesian upper-confidence-bound rule.

    Each bandit carries a Beta(post_a, post_b) posterior. The bandit whose
    posterior mean plus ``c`` posterior standard deviations is largest is
    selected.

    Args:
        observation: Object exposing ``step`` (index of the current step) and
            ``reward`` (treated here as the running total reward).
        configuration: Object exposing ``banditCount``.

    Returns:
        int: Index of the bandit to pull on this step.
    """
    global total_reward, bandit, post_a, post_b

    if observation.step == 0:
        # Beta(1, 1) prior for every bandit.
        post_a, post_b = np.ones((2, configuration.banditCount))
        # Start every episode from a clean running total; otherwise a reused
        # module would difference the first reward against the previous
        # episode's final total.
        total_reward = 0
    else:
        r = observation.reward - total_reward
        total_reward = observation.reward
        # Update the Beta posterior of the bandit pulled last step: the reward
        # goes to the first parameter, its complement to the second.
        post_a[bandit] += r
        post_b[bandit] += 1 - r

    # Posterior mean plus c posterior standard deviations.
    bound = post_a / (post_a + post_b) + beta.std(post_a, post_b) * c
    bandit = int(np.argmax(bound))

    return bandit

## Simulations

In [None]:
from kaggle_environments import make

# Environment instance shared by the single-match cells that follow.
env = make("mab", debug=True)

Default vs UCB+Decay:

In [None]:
# Reset the shared environment, play one match between the default submission
# agent and the UCB+decay agent, then render the result.
env.reset()
env.run(["../input/santa-2020/submission.py", "ucb_decay.py"])
env.render(mode="ipython", width=800, height=500)

Default vs Bayesian UCB:

In [None]:
# Reset the shared environment, play one match between the default submission
# agent and the Bayesian UCB agent, then render the result.
env.reset()
env.run(["../input/santa-2020/submission.py", "bayesian_ucb.py"])
env.render(mode="ipython", width=800, height=500)

## 5-round comparison

In [None]:
def print_rounds(file1, file2, N=5):
    """Play ``N`` matches between two agent files and print each final score.

    Args:
        file1: Path to the first agent's source file.
        file2: Path to the second agent's source file.
        N: Number of matches to play.
    """
    env = make("mab", debug=True)

    for round_no in range(1, N + 1):
        env.run([file1, file2])
        # The last recorded step holds one entry per agent; its 'reward'
        # field is reported as that agent's score for the round.
        final_step = env.steps[-1]
        p1_score = final_step[0]['reward']
        p2_score = final_step[1]['reward']
        env.reset()
        print(f"Round {round_no}: {p1_score} - {p2_score}")

In [None]:
# Print a header, then play print_rounds' default number of rounds (N=5)
# between the default submission agent and the UCB+decay agent.
print('Default vs UCB+decay')
print_rounds("../input/santa-2020/submission.py", "ucb_decay.py")

In [None]:
# Print a header, then play print_rounds' default number of rounds (N=5)
# between the default submission agent and the Bayesian UCB agent.
print('Default vs BayesianUCB')
print_rounds("../input/santa-2020/submission.py", "bayesian_ucb.py")