References:
* [Santa 2020 starter](https://www.kaggle.com/xhlulu/santa-2020-epsilon-greedy-starter): all lines identical, except for the decay

The above referenced $\epsilon$-greedy algorithm accounts for the decay in threshold of bandits by applying a $0.97^{n_i}$ upper bound for the reward probability. Here $n_i$ is the number of pulls **by our agent** on the $i^{th}$ bandit. 

In the simulation, both players' selections contribute to the decay; hence, when calculating the upper bound, the total number of selections (ours plus the opponent's) should be used.

However, when playing against each other, the second version significantly underperforms. Naively I would expect the opposite, but maybe I'm missing something?

**please discuss**


In [None]:
!pip install kaggle-environments --upgrade -q

## $\epsilon$-greedy with decay including own steps only (original version)


In [None]:
%%writefile epsilon_greedy_decay_own.py

import math
import random
import numpy as np

# Probability of pulling a uniformly random bandit instead of the greedy pick.
epsilon = 0.1

# Bandit returned on the previous step (-1 before the first pull); the next
# observed reward increment is credited to it.
last_bandit = -1
# Cumulative reward already accounted for, used to turn the cumulative
# observation.reward into a per-step reward.
total_reward = 0

# Per-bandit statistics; allocated on step 0 once banditCount is known.
sums_of_reward = None
numbers_of_selections = None
random.seed(42)


def agent(observation, configuration):
    """Epsilon-greedy agent whose decay term counts only its own pulls.

    With probability ``epsilon`` a uniformly random bandit is pulled.
    Otherwise each bandit is scored as
    ``0.97 ** own_pulls * mean_reward`` (never-pulled bandits score
    infinity) and the best-scoring one is pulled.

    Args:
        observation: object exposing ``step`` and the cumulative ``reward``.
        configuration: object exposing ``banditCount``.

    Returns:
        Index of the bandit to pull on this step.
    """
    global sums_of_reward, numbers_of_selections, last_bandit, total_reward

    bandit_count = configuration.banditCount

    if observation.step == 0:
        # Reset *all* per-game state so that a second game played in the same
        # process does not inherit the previous game's bookkeeping.
        numbers_of_selections = [0] * bandit_count
        sums_of_reward = [0] * bandit_count
        last_bandit = -1
        total_reward = 0

    if last_bandit > -1:
        # observation.reward is cumulative; the increment belongs to the
        # bandit pulled on the previous step.
        reward = observation.reward - total_reward
        sums_of_reward[last_bandit] += reward
        total_reward += reward

    if random.random() < epsilon:
        # Explore.
        bandit = random.randint(0, bandit_count - 1)
    else:
        # Exploit: fall back to bandit 0 when no score is strictly positive.
        bandit = 0
        max_upper_bound = 0

        for i in range(bandit_count):
            if numbers_of_selections[i] > 0:
                decay = 0.97 ** numbers_of_selections[i]
                upper_bound = decay * sums_of_reward[i] / numbers_of_selections[i]
            else:
                # Never pulled by us: force it to be tried.
                upper_bound = math.inf
            # NOTE(review): last_bandit is overwritten as soon as a candidate
            # is accepted, so this test only excludes the previously played
            # bandit until the first acceptance in this loop. Kept unchanged
            # to preserve the original selection behaviour.
            if upper_bound > max_upper_bound and last_bandit != i:
                max_upper_bound = upper_bound
                bandit = i
                last_bandit = bandit

    # Always remember the bandit actually returned; otherwise the fallback
    # choice above would leave a stale index and the next reward would be
    # credited to the wrong bandit.
    last_bandit = bandit
    numbers_of_selections[bandit] += 1

    return bandit

## $\epsilon$-greedy with decay including own and opponent steps



In [None]:
%%writefile epsilon_greedy_decay_both.py

import math
import random
import numpy as np

# Probability of pulling a uniformly random bandit instead of the greedy pick.
epsilon = 0.1

# Bandit returned on the previous step (-1 before the first pull); the next
# observed reward increment is credited to it.
last_bandit = -1
# Cumulative reward already accounted for, used to turn the cumulative
# observation.reward into a per-step reward.
total_reward = 0

# Per-bandit statistics; allocated on step 0 once banditCount is known.
sums_of_reward = None
numbers_of_selections = None        # pulls made by this agent
numbers_of_total_selections = None  # pulls reported in observation.lastActions
random.seed(42)


def agent(observation, configuration):
    """Epsilon-greedy agent whose decay term counts every reported pull.

    With probability ``epsilon`` a uniformly random bandit is pulled.
    Otherwise each bandit is scored as
    ``0.97 ** total_pulls * mean_reward``, where ``total_pulls`` is
    accumulated from ``observation.lastActions`` and ``mean_reward`` uses
    this agent's own pulls only. Bandits this agent has never pulled score
    infinity. The best-scoring bandit is pulled.

    Args:
        observation: object exposing ``step``, the cumulative ``reward`` and
            ``lastActions`` (iterable of bandit indices pulled last step).
        configuration: object exposing ``banditCount``.

    Returns:
        Index of the bandit to pull on this step.
    """
    global sums_of_reward, numbers_of_selections, numbers_of_total_selections, last_bandit, total_reward

    bandit_count = configuration.banditCount

    if observation.step == 0:
        # Reset *all* per-game state so that a second game played in the same
        # process does not inherit the previous game's bookkeeping.
        numbers_of_selections = [0] * bandit_count
        numbers_of_total_selections = [0] * bandit_count
        sums_of_reward = [0] * bandit_count
        last_bandit = -1
        total_reward = 0

    if last_bandit > -1:
        # observation.reward is cumulative; the increment belongs to the
        # bandit pulled on the previous step.
        reward = observation.reward - total_reward
        sums_of_reward[last_bandit] += reward
        total_reward += reward
        # Count every pull reported for the previous step.
        for action in observation.lastActions:
            numbers_of_total_selections[int(action)] += 1

    if random.random() < epsilon:
        # Explore.
        bandit = random.randint(0, bandit_count - 1)
    else:
        # Exploit: fall back to bandit 0 when no score is strictly positive.
        bandit = 0
        max_upper_bound = 0

        for i in range(bandit_count):
            if numbers_of_selections[i] > 0:
                decay = 0.97 ** numbers_of_total_selections[i]
                upper_bound = decay * sums_of_reward[i] / numbers_of_selections[i]
            else:
                # Never pulled by us: force it to be tried.
                upper_bound = math.inf
            # NOTE(review): last_bandit is overwritten as soon as a candidate
            # is accepted, so this test only excludes the previously played
            # bandit until the first acceptance in this loop. Kept unchanged
            # to preserve the original selection behaviour.
            if upper_bound > max_upper_bound and last_bandit != i:
                max_upper_bound = upper_bound
                bandit = i
                last_bandit = bandit

    # Always remember the bandit actually returned; otherwise the fallback
    # choice above would leave a stale index and the next reward would be
    # credited to the wrong bandit.
    last_bandit = bandit
    numbers_of_selections[bandit] += 1

    return bandit

## Best of 10's

In [None]:
from kaggle_environments import make
import matplotlib.pyplot as plt
import numpy as np

def bo5(file1, file2, n_games=10):
    """Play two agents against each other repeatedly and plot mean rewards.

    Runs ``n_games`` games of the ``"mab"`` environment between the two
    agents, records the ``reward`` field of each agent at every step, and
    averages those per-step values across games. The top axes shows both
    averaged curves; the bottom axes shows ``file1`` minus ``file2``.

    Args:
        file1: first agent (passed to ``env.run`` as player 0).
        file2: second agent (passed to ``env.run`` as player 1).
        n_games: number of games to average over. Defaults to 10, the
            count that was previously hard-coded.

    Returns:
        None. The figure is created as a side effect.
    """
    env = make("mab", debug=True)

    rewards_1 = []
    rewards_2 = []
    for _ in range(n_games):
        print('.', end='')  # progress indicator, one dot per game
        env.run([file1, file2])
        # NOTE(review): the reward stored in each step is presumably the
        # cumulative total for that agent -- confirm against the environment.
        rewards_1.append([step[0]['reward'] for step in env.steps])
        rewards_2.append([step[1]['reward'] for step in env.steps])
        env.reset()

    # Average over games (axis 0), keeping one value per step.
    rewards_1 = np.mean(rewards_1, axis=0)
    rewards_2 = np.mean(rewards_2, axis=0)

    fig, ax = plt.subplots(2, 1)
    ax[0].set_title("Rewards")
    ax[0].plot(rewards_2, label=f"{file2}")
    ax[0].plot(rewards_1, label=f"{file1}")
    # Positive values mean file1 is ahead of file2.
    ax[1].plot(rewards_1-rewards_2, label='difference', color='tab:orange')
    ax[1].axhline(0, ls='--', color='xkcd:dark red')
    for i in range(2):
        ax[i].legend()
            

In [None]:
# The label follows the argument order: bo5 plots the difference as first
# argument minus second, so positive values favour the decay-own agent.
print('epsilon-greedy_decay_own vs Default')
bo5("epsilon_greedy_decay_own.py", "../input/santa-2020/submission.py")

In [None]:
# The label follows the argument order: bo5 plots the difference as first
# argument minus second, so positive values favour the decay-both agent.
print('epsilon-greedy_decay_both vs Default')
bo5("epsilon_greedy_decay_both.py", "../input/santa-2020/submission.py")

In [None]:
# Head-to-head comparison of the two variants. bo5 plots the difference as
# first argument minus second, so positive values favour the decay-both agent.
print('epsilon-greedy_both vs epsilon-greedy_decay_own')
bo5("epsilon_greedy_decay_both.py", "epsilon_greedy_decay_own.py")