References:
* [Santa 2020 starter](https://www.kaggle.com/isaienkov/santa-2020-starter/comments): Re-used `writefile` magic command and `make_env` function for creating a simulation.

In [None]:
!pip install kaggle-environments --upgrade -q

## $\epsilon$-greedy without decay

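This modifies the default `submission.py` provided by the competition to use the $\epsilon$-greedy algorithm: with probability $\epsilon = 0.1$ the agent pulls a bandit chosen uniformly at random; otherwise it pulls the bandit with the highest average observed reward.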

In [None]:
%%writefile epsilon_greedy.py

import math
import random

epsilon = 0.1  # probability of pulling a uniformly random bandit (exploration)

# Per-episode state; agent() re-initialises all of it on step 0.
last_bandit = -1  # bandit pulled on the previous step (-1: nothing pulled yet)
total_reward = 0  # part of observation.reward already credited to a bandit

sums_of_reward = None  # per-bandit sum of the rewards this agent received
numbers_of_selections = None  # per-bandit count of pulls made by this agent
random.seed(42)


def agent(observation, configuration):
    """Choose the next bandit with an epsilon-greedy policy.

    With probability ``epsilon`` a bandit is drawn uniformly at random.
    Otherwise the bandit with the highest average observed reward is chosen,
    never repeating the bandit pulled on the previous step. Bandits that were
    never pulled count as infinitely promising, so they are tried first. If no
    eligible bandit has a positive average, bandit 0 is returned.

    Args:
        observation: Object exposing ``step`` (0 on the first call of an
            episode) and ``reward``, which is treated as the running total
            reward of this agent.
        configuration: Object exposing ``banditCount``.

    Returns:
        Index of the bandit to pull, in ``range(configuration.banditCount)``.
    """
    global sums_of_reward, numbers_of_selections, last_bandit, total_reward

    bandit_count = configuration.banditCount

    if observation.step == 0:
        # Reset *all* episode state, so that a module that is reused for a
        # second episode does not start with a stale arm or stale total.
        numbers_of_selections = [0] * bandit_count
        sums_of_reward = [0] * bandit_count
        last_bandit = -1
        total_reward = 0

    if last_bandit > -1:
        # The increase of the running total is the payoff of the last pull.
        reward = observation.reward - total_reward
        sums_of_reward[last_bandit] += reward
        total_reward += reward

    if random.random() < epsilon:
        bandit = random.randint(0, bandit_count - 1)
    else:
        bandit = 0
        best_estimate = 0
        for i in range(bandit_count):
            # last_bandit is deliberately not modified inside this loop, so
            # the "no immediate repeat" rule applies to every candidate.
            if i == last_bandit:
                continue
            if numbers_of_selections[i] > 0:
                estimate = sums_of_reward[i] / numbers_of_selections[i]
            else:
                estimate = float("inf")
            if estimate > best_estimate:
                best_estimate = estimate
                bandit = i

    # Record the arm that is actually returned (including the bandit-0
    # fallback), so the next reward is credited to the right bandit.
    last_bandit = bandit
    numbers_of_selections[bandit] += 1

    return bandit

## $\epsilon$-greedy with decay

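Same as above, except that each bandit's average reward is multiplied by $0.97^{n_i}$, where $n_i$ is the number of times the agent has selected bandit $i$, so that frequently pulled bandits are progressively discounted.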

In [None]:
%%writefile epsilon_greedy_decay.py

import math
import random

epsilon = 0.1  # probability of pulling a uniformly random bandit (exploration)
decay_rate = 0.97  # per-pull discount applied to a bandit's average reward

# Per-episode state; agent() re-initialises all of it on step 0.
last_bandit = -1  # bandit pulled on the previous step (-1: nothing pulled yet)
total_reward = 0  # part of observation.reward already credited to a bandit

sums_of_reward = None  # per-bandit sum of the rewards this agent received
numbers_of_selections = None  # per-bandit count of pulls made by this agent
random.seed(42)


def agent(observation, configuration):
    """Choose the next bandit with an epsilon-greedy policy and decayed estimates.

    With probability ``epsilon`` a bandit is drawn uniformly at random.
    Otherwise each bandit is scored by its average observed reward multiplied
    by ``decay_rate ** pulls`` (``pulls`` being how often this agent selected
    it), and the best-scoring bandit is chosen, never repeating the bandit
    pulled on the previous step. Bandits that were never pulled count as
    infinitely promising, so they are tried first. If no eligible bandit has
    a positive score, bandit 0 is returned.

    Args:
        observation: Object exposing ``step`` (0 on the first call of an
            episode) and ``reward``, which is treated as the running total
            reward of this agent.
        configuration: Object exposing ``banditCount``.

    Returns:
        Index of the bandit to pull, in ``range(configuration.banditCount)``.
    """
    global sums_of_reward, numbers_of_selections, last_bandit, total_reward

    bandit_count = configuration.banditCount

    if observation.step == 0:
        # Reset *all* episode state, so that a module that is reused for a
        # second episode does not start with a stale arm or stale total.
        numbers_of_selections = [0] * bandit_count
        sums_of_reward = [0] * bandit_count
        last_bandit = -1
        total_reward = 0

    if last_bandit > -1:
        # The increase of the running total is the payoff of the last pull.
        reward = observation.reward - total_reward
        sums_of_reward[last_bandit] += reward
        total_reward += reward

    if random.random() < epsilon:
        bandit = random.randint(0, bandit_count - 1)
    else:
        bandit = 0
        best_estimate = 0
        for i in range(bandit_count):
            # last_bandit is deliberately not modified inside this loop, so
            # the "no immediate repeat" rule applies to every candidate.
            if i == last_bandit:
                continue
            pulls = numbers_of_selections[i]
            if pulls > 0:
                decay = decay_rate ** pulls
                estimate = decay * sums_of_reward[i] / pulls
            else:
                estimate = float("inf")
            if estimate > best_estimate:
                best_estimate = estimate
                bandit = i

    # Record the arm that is actually returned (including the bandit-0
    # fallback), so the next reward is credited to the right bandit.
    last_bandit = bandit
    numbers_of_selections[bandit] += 1

    return bandit

## Simulation: Default vs $\epsilon$-greedy with decay

In [None]:
from kaggle_environments import make
# Shared "mab" environment (created with debug=True) that the simulation cells below reset and re-run.
env = make("mab", debug=True)

In [None]:
# One match: the competition's default submission (first player) against the
# epsilon-greedy agent with decay (second player), then render it in "ipython" mode.
env.reset()
env.run(["../input/santa-2020/submission.py", "epsilon_greedy_decay.py"])
env.render(mode="ipython", width=800, height=500)

## Simulation: Default vs $\epsilon$-greedy without decay

In [None]:
# One match: the competition's default submission (first player) against the
# epsilon-greedy agent without decay (second player), then render it in "ipython" mode.
env.reset()
env.run(["../input/santa-2020/submission.py", "epsilon_greedy.py"])
env.render(mode="ipython", width=800, height=500)

## Simulation: $\epsilon$-greedy with decay vs $\epsilon$-greedy without decay

In [None]:
# One match: epsilon-greedy with decay (first player) against epsilon-greedy
# without decay (second player), then render it in "ipython" mode.
env.reset()
env.run(["epsilon_greedy_decay.py", "epsilon_greedy.py"])
env.render(mode="ipython", width=800, height=500)

## Best of 5

In [None]:
def bo5(file1, file2, rounds=5):
    """Play a series of matches between two agents and print each final score.

    A fresh "mab" environment is created for the series. After every match
    the ``reward`` entries of the last recorded step are printed as
    ``Round <k>: <first player> - <second player>``.

    Args:
        file1: Agent passed to ``env.run`` as the first player (the notebook
            passes paths to agent source files).
        file2: Agent passed to ``env.run`` as the second player.
        rounds: Number of matches to play. Defaults to 5, the series length
            the function is named after.
    """
    env = make("mab", debug=True)

    for round_number in range(1, rounds + 1):
        env.run([file1, file2])
        # The last step holds one state per player, in the order given to run().
        final_step = env.steps[-1]
        p1_score = final_step[0]['reward']
        p2_score = final_step[1]['reward']
        env.reset()
        print(f"Round {round_number}: {p1_score} - {p2_score}")

In [None]:
# Series: default submission (first score) vs. epsilon-greedy without decay (second score).
print('Default vs epsilon-greedy')
bo5("../input/santa-2020/submission.py", "epsilon_greedy.py")

In [None]:
# Series: default submission (first score) vs. epsilon-greedy with decay (second score).
print('Default vs epsilon-greedy+decay')
bo5("../input/santa-2020/submission.py", "epsilon_greedy_decay.py")

In [None]:
# Series: epsilon-greedy without decay (first score) vs. epsilon-greedy with decay (second score).
print('epsilon-greedy vs epsilon-greedy+decay')
bo5("epsilon_greedy.py", "epsilon_greedy_decay.py")