# <center>Predicting Thresholds by Emulating Bandits</center>

<p> Hello Everyone! <br> I'm just publishing an idea that came to my mind as I was enjoying some nice tea early one morning, but which unfortunately didn't perform as well as I had hoped it would. Regardless, it felt a little unique, and I still think it has a lot of potential that I'm not doing proper justice to. </p>

So Here's the plan:

We assume the bandit's initial threshold to be X. Each time the bandit is pulled, X decreases by 3%, i.e., the threshold becomes \\(X * 0.97\\). We can then generalize the threshold after any number of pulls to
<br> <center> $$ X_{initial} * 0.97 ^ {n_{pulls}}$$ </center> <br> <br>
 
Now this was basic knowledge, but how do we know what X is? We *assume* multiple values for X, then do some trial runs, and find out which one fits best. For a quick example, let's take some bandit which has been pulled 7 times, 4 times by us and 3 times by the opponent, and has given us 2 candies. Let the order of pulls be: <br> 
    <b><center> P, P, O, O, P, O, P (P = Us, O = Opp)</center></b>
<br> Now we run through the pulls step by step: every time we hit an opponent's pull, we decrease X by 3%. When we hit our own pull, we 'pull' from the bandit, and then decrease the threshold by 3%. We simulate each of our pulls 100 times for each X, and try to find the value of X that is most plausible for our scenario. Here are the results: 


In [None]:
import random
# Observed scenario: candies we actually received, and the order in which the
# bandit was pulled ('p' = our pull, 'o' = opponent's pull).
success = 2
pattern = ['p', 'p', 'o', 'o', 'p', 'o', 'p']

# vals[x] = simulated number of candies we would collect on average if the
# bandit's initial threshold were x (in percent).
vals = []

for x in range(101):
    pulls = 0
    avg = 0
    for step in pattern:
        if step == 'p':
            # Win probability after `pulls` earlier pulls, each decaying it by 3%.
            chance = x*(0.97**pulls)/100
            # 100 simulated draws of this single pull; add their mean payout.
            outcomes = random.choices([1, 0], k=100, weights=[chance, 1-chance])
            avg += sum(outcomes)/100
        if step in ('o', 'p'):
            # Every pull, ours or the opponent's, decays the threshold.
            pulls += 1
    vals.append(avg)

import matplotlib.pyplot as plt

# Plot the simulated average candies for every candidate initial threshold
# against the number of candies actually observed.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(vals, label='Avg Candies in 100 trials')
# The reference line is derived from `success` and `vals` instead of repeating
# their values, so it stays correct if the scenario above is edited.
ax.plot([success]*len(vals), linewidth=10, alpha=0.5, label='Actual Candies acquired in the given conditions')

# Best fit: the candidate whose simulated average is closest to the observed
# candy count (the lowest candidate wins ties).
X = min(range(len(vals)), key=lambda x: abs(vals[x] - success))
best_score = abs(vals[X] - success)

ax.scatter([X], [success], color='black', label=f'X = {X}, Avg = {vals[X]}')
ax.set_xlabel('Initial Threshold -->')
ax.set_ylabel('Avg. Candies after 100 trial runs -->')
# Title built from the scenario variables rather than hard-coded counts.
ax.set_title(f'PulledSelf={pattern.count("p")}, Opponent={pattern.count("o")}, Candies={success}')
ax.legend()

As can be seen above, the most probable value of the initial threshold is 54 (the exact value varies a little between runs, since the simulation is random), so the present threshold of the bandit will be: <br> <center>
$$ X_{final} = X_{initial}*0.97 ^ {n_{pulls}}$$<br> 
            $$ = 54 * 0.97 ^ {7}$$ <br>
            $$ = 43.6 $$</center> <br>
 Now let's play a few matches against some other agents :)
 
 Code: 

In [None]:
%%writefile submission.py

import pandas, numpy, random

# Per-bandit counters maintained by `update`: our pulls, our rewarded pulls,
# and the opponent's pulls.
bandits = [dict(pulled=0, success=0, opp=0) for _ in range(100)]
# Cumulative reward seen at the previous step (used to detect a new reward).
last_reward = 0
# Current threshold estimate per bandit, as a fraction; starts at 1.
probabs = [1] * 100
# Per-bandit pull history: 's' for our pull, 'o' for an opponent pull.
patterns = [[] for _ in range(100)]
# Chronological list of bandits chosen by the opponent / by us.
opp_actions, my_actions = [], []
# Estimate recorded for the bandit we pulled at each step (dumped to arr.npy).
arr = []

def update(obs):
    """Record the outcome of the previous step in the module-level trackers.

    Reads ``obs.lastActions``, ``obs.agentIndex`` and ``obs.reward`` and
    updates, in place:

    * ``bandits``     - pull / success / opponent-pull counters,
    * ``my_actions`` and ``opp_actions`` - chronological action histories,
    * ``patterns``    - per-bandit pull order ('s' = our pull, 'o' = opponent),
    * ``arr``         - the estimate held in ``probabs`` for the bandit we
      just pulled (before ``compute`` refreshes it).

    ``last_reward`` is only read here; the caller updates it afterwards.
    """
    global bandits, last_reward, patterns, opp_actions, my_actions, arr

    mine = obs.lastActions[obs.agentIndex]
    theirs = obs.lastActions[1 - obs.agentIndex]

    bandits[mine]['pulled'] += 1
    my_actions.append(mine)

    bandits[theirs]['opp'] += 1
    opp_actions.append(theirs)

    # obs.reward is compared against the previously seen total: an increase
    # means our last pull paid out.
    bandits[mine]['success'] += int(obs.reward > last_reward)

    patterns[mine].append('s')
    # The opponent's pull belongs to the bandit *they* chose. The previous
    # code indexed with ``1 - <our bandit>``, which logged it on an unrelated
    # bandit (via a negative index) and corrupted the replay in `compute`.
    patterns[theirs].append('o')

    arr.append(probabs[mine])
    

    
def compute(obs):
    """Re-estimate the threshold of the bandit we pulled in the previous step.

    For every candidate initial threshold (0, 2, ..., 100 percent) the
    bandit's recorded pull history is replayed: each pull decays the
    threshold by 3%, and each of our own pulls is simulated 100 times to get
    its average payout. The candidate whose simulated candy total is closest
    to the recorded success count wins (the lowest candidate wins ties). The
    winner, decayed by all pulls recorded for the bandit, is stored in
    ``probabs`` as a fraction.
    """
    global probabs, bandits, last_reward, patterns

    ind = obs.lastActions[obs.agentIndex]
    success = bandits[ind]['success']
    history = patterns[ind]

    best_probab, best_score = 100, 100
    for i in range(0, 102, 2):
        pulls = 0
        avg = 0
        for step in history:
            if step == 's':
                # Win probability of this pull under candidate `i`.
                chance = i*(0.97**pulls)/100
                avg += sum(random.choices([1, 0], k=100, weights=[chance, 1-chance]))/100
            if step == 'o' or step == 's':
                pulls += 1

        miss = abs(avg - success)
        if miss < best_score:
            best_score, best_probab = miss, i

    total_pulls = bandits[ind]['pulled'] + bandits[ind]['opp']
    probabs[ind] = (best_probab)*(0.97**total_pulls)/100
                


        
def agent(obs, conf):
    """Choose the bandit to pull this step.

    Strategy, in order:

    1. Step 0: pull a random bandit.
    2. Otherwise record the previous step (`update`) and refresh the estimate
       of the bandit we just pulled (`compute`).
    3. If the previous pull was rewarded, pull the same bandit again.
    4. After step 3, with a probability that shrinks linearly from 0.6 to 0
       over 2000 steps, copy a bandit the opponent (checked first) or we have
       pulled three times in a row.
    5. Otherwise pull the bandit with the highest estimated threshold.

    ``conf`` is accepted for the agent interface but not used.
    """
    global last_reward, bandits, probabs, patterns, opp_actions, my_actions, arr

    if obs.step == 0:
        return random.randint(0, 99)

    update(obs)   # record last step's actions and reward
    compute(obs)  # refresh the estimate of the bandit pulled last step

    # Dump the recorded estimates near the end of the game. This runs before
    # any early return so the file is written even when the final steps are
    # re-pulls; previously those steps skipped the dump.
    if obs.step >= 1995:
        numpy.save('arr.npy', arr)

    rewarded = obs.reward > last_reward
    last_reward = obs.reward
    if rewarded:
        # Re-pull a bandit that just paid out.
        return obs.lastActions[obs.agentIndex]

    if obs.step > 3:
        # Chance of sticking with a repeated bandit; decreases as the game
        # progresses. random.random() is only drawn when a streak exists.
        repeat_chance = 0.6 - 0.6*(obs.step/2000)

        if (opp_actions[-1] == opp_actions[-2] and opp_actions[-1] == opp_actions[-3]
                and random.random() < repeat_chance):
            return opp_actions[-1]

        if (my_actions[-1] == my_actions[-2] and my_actions[-1] == my_actions[-3]
                and random.random() < repeat_chance):
            return my_actions[-1]

    # Pull the bandit with the highest estimated threshold.
    return int(numpy.argmax(probabs))
    

In [None]:
%%writefile thompson.py

import numpy as np
import pandas as pd

# State shared across calls of `agent`.
# post_a / post_b: per-bandit Beta parameters; bandit: index pulled last.
# All three are assigned by `agent` (the arrays on step 0).
post_a = post_b = bandit = None
# Cumulative reward seen at the previous call.
total_reward = 0
# Declared global in `agent` but not used by it.
c = 3
# Logged step numbers and the sampled win probability (in percent) per step.
df_steps, df_probs = [], []

def agent(observation, configuration):
    """Thompson-sampling agent.

    Keeps one Beta(post_a, post_b) posterior per bandit. On step 0 the
    posteriors are initialised to Beta(1, 1); on later steps the posterior of
    the previously pulled bandit is updated with the reward gained since the
    last call. A win probability is then sampled for every bandit and the
    bandit with the largest sample is pulled.

    The step number and the winning sample (in percent) are appended to
    ``df_steps`` / ``df_probs``.
    """
    global total_reward, bandit, post_a, post_b, c
    global df_steps, df_probs

    n_bandits = configuration.banditCount

    if observation.step != 0:
        # Reward earned by the previous pull.
        r = observation.reward - total_reward
        total_reward = observation.reward

        # Update the Beta posterior of the bandit pulled last.
        post_a[bandit] += r
        post_b[bandit] += (1 - r)
    else:
        post_a = np.ones(n_bandits)
        post_b = np.ones(n_bandits)

    draws = np.random.beta(post_a, post_b)
    bandit = int(np.argmax(draws))

    df_steps.append(observation.step)
    df_probs.append(draws[bandit] * 100)

    return bandit


In [None]:
%%writefile vegas_pull.py

import numpy as np
import pandas as pd
import random, os, datetime, math
from collections import defaultdict

# Cumulative reward seen at the previous call, and per-bandit statistics keyed
# by bandit index; both are re-initialised at step 0 by
# `multi_armed_probabilities`.
total_reward, bandit_dict = 0, {}

def set_seed(my_seed=42):
    """Seed Python's and NumPy's global RNGs and set PYTHONHASHSEED.

    ``PYTHONHASHSEED`` is written to ``os.environ`` as the string form of
    ``my_seed``.
    """
    np.random.seed(my_seed)
    random.seed(my_seed)
    os.environ['PYTHONHASHSEED'] = str(my_seed)

def get_next_bandit():
    """Return the key of the bandit in ``bandit_dict`` with the best score.

    A bandit's score is its net tally (wins - losses + opponent pulls, minus
    1.5 if the opponent has pulled it at all, plus the opponent's current
    streak counter) divided by the total number of recorded pulls, and scaled
    by 0.97 raised to that total. Only strictly positive scores can win; if
    none is positive, bandit 0 is returned. The first bandit wins ties.
    """
    best_bandit, best_bandit_expected = 0, 0
    for bnd, stats in bandit_dict.items():
        touches = stats['win'] + stats['loss'] + stats['opp']
        tally = (stats['win'] - stats['loss'] + stats['opp']
                 - (stats['opp']>0)*1.5 + stats['op_continue'])
        expect = tally / touches * math.pow(0.97, touches)
        if expect > best_bandit_expected:
            best_bandit, best_bandit_expected = bnd, expect
    return best_bandit

# Chronological history of the bandits pulled by us and by the opponent,
# appended to by `multi_armed_probabilities`.
my_action_list, op_action_list = [], []

# Integer-default counter dict; not referenced by the functions in this file.
op_continue_cnt_dict = defaultdict(int)

def multi_armed_probabilities(observation, configuration):
    """Vegas-Pull agent: choose the bandit to pull this step.

    Step 0 resets all module-level state, seeds the RNGs and returns a random
    bandit (drawn before the seeding takes effect). On later steps the
    previous step's actions and reward are recorded in ``bandit_dict``; then:

    * a rewarded pull is repeated;
    * from step 4 on, if we pulled the same bandit three times in a row, it is
      pulled again with probability 0.5;
    * otherwise `get_next_bandit` picks the bandit.

    Args:
        observation: mapping with 'step', 'reward', 'agentIndex' and
            'lastActions'.
        configuration: mapping with 'banditCount'.

    Returns:
        Index of the bandit to pull.
    """
    global total_reward, bandit_dict

    # Always drawn first, so the random stream is consumed on every call.
    my_pull = random.randrange(configuration['banditCount'])

    if 0 == observation['step']:
        set_seed()
        total_reward = 0
        bandit_dict = {}
        for i in range(configuration['banditCount']):
            bandit_dict[i] = {'win': 1, 'loss': 0, 'opp': 0, 'my_continue': 0, 'op_continue': 0}
        # Also drop action history left over from a previous episode; it was
        # previously kept, so streak checks could compare against stale pulls.
        my_action_list.clear()
        op_action_list.clear()
        return my_pull

    # Reward earned by the previous pull.
    last_reward = observation['reward'] - total_reward
    total_reward = observation['reward']

    my_idx = observation['agentIndex']
    my_last_action = observation['lastActions'][my_idx]
    op_last_action = observation['lastActions'][1-my_idx]

    my_action_list.append(my_last_action)
    op_action_list.append(op_last_action)

    if 0 < last_reward:
        bandit_dict[my_last_action]['win'] += 1
    else:
        bandit_dict[my_last_action]['loss'] += 1
    bandit_dict[op_last_action]['opp'] += 1

    if observation['step'] >= 3:
        # Streak counters: incremented while the same bandit is repeated,
        # reset to 0 on the bandit that breaks the streak.
        if my_action_list[-1] == my_action_list[-2]:
            bandit_dict[my_last_action]['my_continue'] += 1
        else:
            bandit_dict[my_last_action]['my_continue'] = 0
        if op_action_list[-1] == op_action_list[-2]:
            bandit_dict[op_last_action]['op_continue'] += 1
        else:
            bandit_dict[op_last_action]['op_continue'] = 0

    if last_reward > 0:
        return my_last_action

    on_streak = (
        observation['step'] >= 4
        and my_action_list[-1] == my_action_list[-2]
        and my_action_list[-1] == my_action_list[-3]
    )
    # random.random() is only drawn when we are on a three-pull streak.
    if on_streak and random.random() < 0.5:
        return my_action_list[-1]
    return get_next_bandit()

In [None]:
%%writefile bayesian_ucb.py

import numpy as np
from scipy.stats import beta
import pandas as pd

# State shared across calls of `agent`.
# post_a / post_b: per-bandit Beta parameters; bandit: index pulled last.
# All three are assigned by `agent` (the arrays on step 0).
post_a = post_b = bandit = None
# Cumulative reward seen at the previous call.
total_reward = 0
# Multiplier applied to the posterior standard deviation in the upper bound.
c = 3
# Logged step numbers and posterior mean (in percent) of the chosen bandit.
df_steps, df_probs = [], []

def agent(observation, configuration):
    """Bayesian UCB agent.

    Keeps one Beta(post_a, post_b) posterior per bandit. On step 0 the
    posteriors are initialised to Beta(1, 1); on later steps the posterior of
    the previously pulled bandit is updated with the reward gained since the
    last call. The bandit maximising ``posterior mean + c * posterior std``
    is pulled.

    The chosen bandit's posterior mean (in percent) and the step number are
    appended to ``df_probs`` / ``df_steps``.
    """
    global total_reward, bandit, post_a, post_b, c
    global df_steps, df_probs

    if observation.step != 0:
        # Reward earned by the previous pull.
        r = observation.reward - total_reward
        total_reward = observation.reward
        # Update the Beta posterior of the bandit pulled last.
        post_a[bandit] += r
        post_b[bandit] += 1 - r
    else:
        post_a = np.ones(configuration.banditCount)
        post_b = np.ones(configuration.banditCount)

    posterior_mean = post_a / (post_a + post_b)
    bound = posterior_mean + beta.std(post_a, post_b) * c
    bandit = int(np.argmax(bound))

    df_probs.append(100 * post_a[bandit]/(post_a[bandit] + post_b[bandit]))
    df_steps.append(observation.step)

    return bandit

In [None]:
!pip install kaggle-environments --upgrade -q
from kaggle_environments import make
# Create the "mab" environment that the match cells below run the agents in;
# debug=True is passed straight to kaggle_environments.make.
env = make("mab", debug=True)

# <center>Against Bayesian UCB</center>

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Play one match; submission.py is listed first, so it is player index 0.
game = env.run(['submission.py', 'bayesian_ucb.py'])

steps = range(2000)
step_axis = list(steps)

# Highest threshold among all bandits at every step.
bests = [max(game[k][0]['observation']['thresholds']) for k in steps]

# Threshold of the bandit each player chose at every step.
actual_regr_op = [
    game[i][0]['observation']['thresholds'][game[i][1]['action']] for i in steps
]
actual_regr_me = [
    game[i][0]['observation']['thresholds'][game[i][0]['action']] for i in steps
]

fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 12))

ax1.set_title('Thresholds of the bandits chosen every step')
ax1.scatter(step_axis, actual_regr_op, alpha=0.7,label='Op')
ax1.scatter(step_axis, actual_regr_me, alpha=0.7, label='Me')
ax1.plot(bests, color='black', label='Max Threshold')
ax1.legend()

# Estimates dumped by submission.py (fractions, hence the *100 below).
expect = np.load('arr.npy')
expect_axis = list(range(len(expect)))

ax2.set_title('Expected vs Actual Thresholds of the chosen bandits')
ax2.scatter(expect_axis, [actual_regr_me[i] for i in expect_axis], alpha=0.7, label='Actual')
ax2.scatter(expect_axis, [expect[i]*100 for i in expect_axis], alpha=0.7, label='Expected')
ax2.legend()

# <center>Against Thompson Sampling</center>

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Play one match; submission.py is listed first, so it is player index 0.
game = env.run(['submission.py', 'thompson.py'])

steps = range(2000)
step_axis = list(steps)

# Highest threshold among all bandits at every step.
bests = [max(game[k][0]['observation']['thresholds']) for k in steps]

# Threshold of the bandit each player chose at every step.
actual_regr_op = [
    game[i][0]['observation']['thresholds'][game[i][1]['action']] for i in steps
]
actual_regr_me = [
    game[i][0]['observation']['thresholds'][game[i][0]['action']] for i in steps
]

fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 12))

ax1.set_title('Thresholds of the bandits chosen every step')
ax1.scatter(step_axis, actual_regr_op, alpha=0.7,label='Op')
ax1.scatter(step_axis, actual_regr_me, alpha=0.7, label='Me')
ax1.plot(bests, color='black', label='Max Threshold')
ax1.legend()

# Estimates dumped by submission.py (fractions, hence the *100 below).
expect = np.load('arr.npy')
expect_axis = list(range(len(expect)))

ax2.set_title('Expected vs Actual Thresholds of the chosen bandits')
ax2.scatter(expect_axis, [actual_regr_me[i] for i in expect_axis], alpha=0.7, label='Actual')
ax2.scatter(expect_axis, [expect[i]*100 for i in expect_axis], alpha=0.7, label='Expected')
ax2.legend()

# <center>Against Vegas-Pull v2</center>

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Play one match; submission.py is listed first, so it is player index 0.
game = env.run(['submission.py', 'vegas_pull.py'])

steps = range(2000)
step_axis = list(steps)

# Highest threshold among all bandits at every step.
bests = [max(game[k][0]['observation']['thresholds']) for k in steps]

# Threshold of the bandit each player chose at every step.
actual_regr_op = [
    game[i][0]['observation']['thresholds'][game[i][1]['action']] for i in steps
]
actual_regr_me = [
    game[i][0]['observation']['thresholds'][game[i][0]['action']] for i in steps
]

fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 12))

ax1.set_title('Thresholds of the bandits chosen every step')
ax1.scatter(step_axis, actual_regr_op, alpha=0.7,label='Op')
ax1.scatter(step_axis, actual_regr_me, alpha=0.7, label='Me')
ax1.plot(bests, color='black', label='Max Threshold')
ax1.legend()

# Estimates dumped by submission.py (fractions, hence the *100 below).
expect = np.load('arr.npy')
expect_axis = list(range(len(expect)))

ax2.set_title('Expected vs Actual Thresholds of the chosen bandits')
ax2.scatter(expect_axis, [actual_regr_me[i] for i in expect_axis], alpha=0.7, label='Actual')
ax2.scatter(expect_axis, [expect[i]*100 for i in expect_axis], alpha=0.7, label='Expected')
ax2.legend()

We can all agree that the performances were quite bad, but that has more to do with the bad rule-based agent than with the predictor, or maybe I just like to think so. I tinkered around with it quite a bit, but unfortunately couldn't come up with an agent that has a good win-rate against other famous public agents, so I didn't really use it in the contest. Still, I think this is worth looking into, and I hope that, if not me, someone else can come up with the proper way to use it.

Have a nice day!