## In the hope of learning some new tricks, I am sharing my current thinking and code. 

The code is commented, but let me know if you have any questions! :-)

In particular, I am keen to understand:
- why there is a discrepancy between the way I compute the UCB bound and the way the Bayesian UCB starter notebook computes it
- ideas on how to better incorporate the opponent's actions (i.e. improving the opp_quality metric)
- tuning of all parameters

**Feedback and ideas for improvement welcome.**

In [None]:
!pip install kaggle-environments --upgrade -q

In [None]:
from kaggle_environments import make
# Create the "mab" environment in debug mode; `env` is used again by the "Quick run" cell.
env = make("mab", debug=True)

In [None]:
%%writefile skanderovitch.py

# IMPORT

import numpy as np
from scipy.stats import beta

# AGENT CONFIG


# How do we evaluate the bandit performance?
est_method = ['mean','thompson','ucb'][2]      # choosing UCB here
est_ucb_percentile = 0.75                      # percentile passed to beta.ppf for UCB : higher is more optimistic

# Having evaluated the bandits, how do we select a candidate (exploration / exploitation)?
pick_method = ['epsilon','greedy','weighted','random','stupid'][0]    # choosing epsilon greedy here
pick_weighted_alpha = 1                                               # exponent applied to the estimates for the weighted sampling, higher is greedier
pick_epsilon = 0.5                                                    # for epsilon-greedy, higher is more exploration (multiplied by pick_epsilon_decay on every call to logic)
pick_epsilon_decay = 0.997                                            # lower means we shift to exploitation faster

# How can we use the information provided by the actions of the other / rival bot?
min_opp_quality = 0.2     # floor applied to opp_quality : higher means we believe the other bot knows what they are doing, even if they seem to play poorly
opp_retry_factor = 1.     # higher means we care about exploiting a bandit found by the other bot (stealing) more than a bandit we identified

# GAME CONFIG

decay = 0.97              # each estimate is multiplied by decay**all_pulls in logic
n_levers = None           # number of bandits, read from configuration.banditCount at step 0

# Global variables, (re)initialised by logic at step 0 and updated on every later step
my_score = 0      # keep track of my score (last cumulative reward seen, used by compute_reward)
beta_a = None     # per-bandit Beta parameter : starts at 1, incremented by the reward we observed
beta_b = None     # per-bandit Beta parameter : starts at 1, incremented by (1 - reward) we observed
my_pulls = None   # per-bandit count of our own pulls
opp_pulls = None  # per-bandit count of the rival's pulls
all_pulls = None  # per-bandit count of pulls by either bot
opp_quality = 0   # this will reflect how well we believe the other bot is playing (set by compute_opp_quality)


# EXECUTION

def logic(observation, configuration):
    """Choose the bandit to pull for the current step.

    On step 0 all per-episode state is (re)initialised. On later steps the
    Beta parameters and the pull counters are updated from the previous round.
    The result is then merged with what the rival's pulls suggest, turned into
    one estimate per bandit, discounted by ``decay**all_pulls`` and handed to
    ``pick_bandit``.

    Args:
        observation: object exposing ``step``, ``reward``, ``agentIndex`` and
            ``lastActions``.
        configuration: object exposing ``banditCount``.

    Returns:
        int: index of the bandit to pull.
    """
    global n_levers, my_score, beta_a, beta_b, my_pulls, opp_pulls, all_pulls, pick_epsilon

    # FIRST ROUND ?

    if observation.step == 0:

        # We initialise the global vars

        n_levers = configuration.banditCount   # Number of bandits

        beta_a = np.ones(n_levers)             # Beta distribution with parameters (1,1),
        beta_b = np.ones(n_levers)             # means we have a uniform prior on the probability of each bandit

        my_pulls = np.zeros(n_levers)          # We keep track of how many times we pull each bandit
        opp_pulls = np.zeros(n_levers)         # Same for the rival bot
        all_pulls = np.zeros(n_levers)         # Same across both bots

        # The running score and the exploration rate are per-episode state too.
        # Without resetting them, reusing this module for a second episode would
        # start with a wrong first reward delta and an already-decayed epsilon.
        my_score = 0
        # The first episode records the configured epsilon; later episodes restore it.
        pick_epsilon = globals().setdefault('_initial_pick_epsilon', pick_epsilon)

    else:

        # We update our knowledge

        my_choice, opp_choice = get_actions(observation)  # what did we each play at the previous round ?
        my_reward = compute_reward(observation)           # did I get a reward ?

        beta_a[my_choice] += my_reward                    # we compute the posterior distribution,
        beta_b[my_choice] += 1 - my_reward                # ignoring the decay for now (dealt with later)

        my_pulls[my_choice] += 1                          # Update how many times the bandits were pulled
        opp_pulls[opp_choice] += 1
        all_pulls[my_choice] += 1
        all_pulls[opp_choice] += 1

    a, b = merge_all_info()                               # Best Beta parameters we can get for each bandit, using both
                                                          # what we learnt from the rewards and the rival bot's actions
    my_est = compute_est(a, b)                            # One estimate per bandit from these distributions
    decayed_est = my_est * decay**all_pulls               # We decay the estimates based on how many times the bandits were used
    my_choice = pick_bandit(decayed_est)                  # We pick one using the chosen strategy

    pick_epsilon *= pick_epsilon_decay                    # We progressively favour exploitation vs exploration

    return int(my_choice)


# MECHANICS    


def compute_est(a,b):
    """Turn per-bandit Beta parameters (a, b) into one estimate per bandit.

    The module-level ``est_method`` selects the estimator:
    - 'mean'     : the Beta mean a / (a + b), which ignores the uncertainty
    - 'thompson' : one random draw from each Beta(a, b)
    - 'ucb'      : the value of each Beta(a, b) at percentile ``est_ucb_percentile``

    Any other ``est_method`` yields None.
    """
    if est_method == 'mean':
        return a / (a + b)

    if est_method == 'thompson':
        return np.random.beta(a, b)

    if est_method == 'ucb':
        # Note : the Bayesian UCB sampler template in the competition is written as
        # post_a / (post_a + post_b) + beta.std(post_a, post_b) * c
        # which I don't understand (eg could give values >> 1 for c large enough)
        # if anyone can explain, please let me know
        # I am using the PPF (percentile point function) here, as it makes more sense to me
        return beta.ppf(est_ucb_percentile, a, b)

    return None


def pick_bandit(est_prob,pick_method=pick_method):
    """Select a bandit index from the per-bandit estimates.

    Strategies:
    - 'epsilon'  : with probability ``pick_epsilon`` explore via 'weighted', otherwise 'greedy'
    - 'greedy'   : always the highest estimate
    - 'weighted' : sample proportionally to ``est_prob**pick_weighted_alpha``
    - 'random'   : uniform over all bandits
    - 'stupid'   : always the lowest / worst estimate

    Any other ``pick_method`` yields None.
    """
    if pick_method == 'epsilon':
        explore = np.random.random() < pick_epsilon
        return pick_bandit(est_prob, pick_method='weighted' if explore else 'greedy')

    if pick_method == 'greedy':
        return int(np.argmax(est_prob))

    if pick_method == 'stupid':
        return int(np.argmin(est_prob))

    if pick_method == 'weighted':
        # high estimates are picked more often than low ones
        weights = est_prob**pick_weighted_alpha
        weights = weights / weights.sum()
        return np.random.choice(np.arange(len(est_prob)), p=weights)

    if pick_method == 'random':
        return np.random.choice(np.arange(len(est_prob)))

    return None



    

# INCORPORATE OPPONENT INFORMATION

def compute_opp_quality():
    """Update the global ``opp_quality`` : how well does the opponent seem to play ?

    The score is the correlation between the rival's pull counts and our own
    independent estimate of each bandit (built from ``beta_a`` / ``beta_b`` only).
    When the correlation is undefined, i.e. one of the two vectors is constant
    (always the case before the rival's first pull is recorded), the score is 0
    rather than NaN.
    """
    global opp_quality
    # What do we independently believe about the bandits, based on what we observed ?
    indep_est = compute_est(beta_a, beta_b)
    # How well do the rival's actions correlate with our knowledge ?
    # Ie did they pull the right bandits ?
    # A constant input makes corrcoef divide 0 by 0 : silence that warning and
    # handle the resulting NaN explicitly, instead of relying on how max() treats NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        corr = np.corrcoef(opp_pulls, indep_est)[0, 1]
    opp_quality = 0.0 if np.isnan(corr) else float(corr)
    # Note : this can be improved, as it ignores
    # - what they can not know, ie the bandits they never pulled
    # - the decay of the bandits, ie they may have pulled lots from a bandit that is now very low probability

def merge_all_info():
    """Combine our own observations with what the rival's actions suggest.

    Returns the pair (a, b) of per-bandit Beta parameters : our own
    ``beta_a`` / ``beta_b`` plus weighted pseudo-observations derived from the
    rival's pull counts.
    """
    # Refresh the global opp_quality first : how believable is the opponent ?
    compute_opp_quality()

    # Weight given to the rival's information, based on :
    # - our estimate of the opponent quality
    # - a minimum value (to give them the benefit of the doubt, esp early in the game)
    # - how much we prefer to steal / ruin their bandits vs exploiting the ones we found
    trust = max(min_opp_quality, opp_quality) * opp_retry_factor

    # A bandit the rival came back to is treated as a good one :
    # every pull after the first counts as a win, the remainder as a loss
    rival_wins = np.maximum(opp_pulls - 1, 0)
    rival_losses = opp_pulls - rival_wins

    merged_a = beta_a + rival_wins * trust
    merged_b = beta_b + rival_losses * trust
    return merged_a, merged_b




####### BORING


def compute_reward(observation):
    """Return the reward gained since the previous call.

    ``observation.reward`` is compared with the global ``my_score``, which is
    then updated to the new value.
    """
    global my_score
    previous_total = my_score
    my_score = observation.reward
    return my_score - previous_total

def get_actions(obs):
    """Return (my last action, opponent's last action) from ``obs.lastActions``.

    Our slot is ``obs.agentIndex``; the opponent's slot is ``1 - obs.agentIndex``.
    """
    mine = obs.agentIndex
    theirs = 1 - mine
    last_actions = obs.lastActions
    return last_actions[mine], last_actions[theirs]


def agent(observation, configuration):
    """Agent entry point : forwards both arguments to ``logic`` and returns its result."""
    # just because this needs to be last
    # NOTE(review): presumably the environment uses the last function defined in the file as the agent - confirm
    return logic(observation, configuration)





# Quick run

In [None]:
# Play the agent file against a second copy of itself, then render the episode inline
env.run(["skanderovitch.py", "skanderovitch.py"])
env.render(mode="ipython", width=800, height=500)

# Tuning of parameters

In [None]:
import numpy as np
import pandas as pd
import sys
from scipy.stats import beta
from tqdm import tqdm_notebook as tqdm

####### GAME MECHANICS : this implements the same as the above

def pull(lever,game):
    """Pull one lever : return whether it paid out, and decay its probability.

    The payout is drawn against the probability the lever had *before* this
    pull; ``game['true_prob'][lever]`` is then multiplied by ``game['decay']``.
    """
    probabilities = game['true_prob']
    chance = probabilities[lever]
    probabilities[lever] = chance * game['decay']
    return np.random.random() < chance
    
def pick_bandit(est_prob,pick_method,alpha,epsilon):
    """Select a bandit index from the per-bandit estimates.

    Args:
        est_prob: one non-negative estimate per bandit (array or sequence).
        pick_method: 'greedy' (highest estimate), 'stupid' (lowest estimate),
            'random' (uniform), 'weighted' (sample proportionally to
            ``est_prob**alpha``) or 'epsilon' (with probability ``epsilon``
            use 'weighted', otherwise 'greedy').
        alpha: exponent for the weighted sampling; only read by 'weighted' / 'epsilon'.
        epsilon: exploration probability; only read by 'epsilon'.

    Returns:
        int: index of the selected bandit.

    Raises:
        ValueError: if ``pick_method`` is not one of the names above.
    """
    est_prob = np.asarray(est_prob, dtype=float)

    if pick_method == 'epsilon':
        # explore via the weighted method, otherwise default to greedy
        pick_method = 'weighted' if np.random.random() < epsilon else 'greedy'

    if pick_method == 'greedy':
        return int(np.argmax(est_prob))
    if pick_method == 'stupid':
        return int(np.argmin(est_prob))
    if pick_method == 'random':
        return int(np.random.choice(len(est_prob)))
    if pick_method == 'weighted':
        weights = est_prob**alpha
        total = weights.sum()
        if not np.isfinite(total) or total <= 0:
            # The weights cannot be normalised (e.g. every estimate is 0) :
            # fall back to a uniform draw instead of feeding NaN to np.random.choice
            return int(np.random.choice(len(est_prob)))
        return int(np.random.choice(len(est_prob), p=weights / total))

    # An unknown name used to return None silently and fail later, far from the cause
    raise ValueError('unknown pick_method: %r' % (pick_method,))

def opp_choose(game):
    """Pick the simulated opponent's bandit.

    The opponent applies its own pick strategy directly to the true
    probabilities stored in ``game['true_prob']``.
    """
    return pick_bandit(
        est_prob=game['true_prob'],
        pick_method=game['opp_pick_method'],
        alpha=game['opp_alpha'],
        epsilon=game['opp_epsilon'],
    )

####### ESTIMATION

def compute_opp_quality(game):
    """Return how well the opponent's pulls correlate with our own estimates.

    The estimates are built from ``game['beta_a']`` / ``game['beta_b']`` only.
    When the correlation is undefined, i.e. one of the two vectors is constant
    (always the case while ``game['opp_pulls']`` is still all zeros), the
    result is 0.0 rather than NaN.
    """
    indep_est = compute_est(game['beta_a'],game['beta_b'],game)
    # A constant input makes corrcoef divide 0 by 0 : silence that warning and
    # handle the resulting NaN explicitly, instead of relying on how max() treats NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        corr = np.corrcoef(game['opp_pulls'],indep_est)[0,1]
    return 0.0 if np.isnan(corr) else float(corr)

def compute_est(a,b,game):
    """Turn per-bandit Beta parameters (a, b) into one estimate per bandit.

    ``game['est_method']`` selects the estimator:
    - 'thompson' (the legacy spelling 'thomson' is accepted too) : one random
      draw from each Beta(a, b)
    - 'ucb'  : the value of each Beta(a, b) at percentile ``game['est_ucb_percentile']``
    - 'mean' : the Beta mean a / (a + b)

    Raises:
        ValueError: if ``game['est_method']`` is none of the above.
    """
    method = game['est_method']
    # Only the misspelled 'thomson' used to be recognised, so the correct
    # spelling silently produced None. Both are accepted now.
    if method in ('thompson', 'thomson'):
        return np.random.beta(a, b)
    if method == 'ucb':
        return beta.ppf(game['est_ucb_percentile'],a,b)
    if method == 'mean':
        return a / (a + b)
    raise ValueError('unknown est_method: %r' % (method,))
        
def decay_est(est,game):
    """Discount each estimate by ``game['decay']`` raised to that bandit's total pull count."""
    discount = game['decay'] ** game['all_pulls']
    return est * discount


def merge_all_info(game):
    """Combine our own Beta parameters with pseudo-observations from the rival's pulls.

    Stores the freshly computed opponent quality in ``game['opp_quality']`` and
    returns the merged per-bandit parameters (a, b).
    """
    quality = compute_opp_quality(game)
    game['opp_quality'] = quality

    # Weight of the rival's information : quality (floored) times the retry factor
    trust = max(game['min_opp_quality'], quality) * game['opp_retry_factor']

    # Every rival pull after the first on a bandit counts as a win, the remainder as a loss
    rival_pulls = game['opp_pulls']
    rival_wins = np.maximum(rival_pulls - 1, 0)
    rival_losses = rival_pulls - rival_wins

    merged_a = game['beta_a'] + rival_wins * trust
    merged_b = game['beta_b'] + rival_losses * trust
    return merged_a, merged_b


#### SIMULATION

def simulate(game):
    """Play ``game['n_rounds']`` rounds of our agent against the simulated opponent.

    Each round the opponent pulls first, then our agent estimates, picks and
    pulls. Scores, Beta parameters, pull counters and ``pick_epsilon`` are
    updated in place in ``game``. The starting epsilon is kept in
    ``game['original_epsilon']``.

    The simulation is best effort : if a round fails, the error and the game
    state are printed and the partially played game is returned.

    Returns:
        dict: the same ``game`` object, mutated.
    """
    try:
        game['original_epsilon'] = game['pick_epsilon']

        for r in range(game['n_rounds']):
            opp_choice = opp_choose(game)

            opp_reward = pull(opp_choice,game)
            game['opp_score'] += opp_reward


            a,b = merge_all_info(game)
            game['my_est'] = compute_est(a,b,game)
            game['decayed_est'] = decay_est(game['my_est'],game)
            my_choice = pick_bandit(game['decayed_est'],game['pick_method'],game['pick_weighted_alpha'],game['pick_epsilon'])

            my_reward = pull(my_choice,game)
            game['my_score'] += my_reward
            if my_reward:
                game['beta_a'][my_choice] += 1
            else:
                game['beta_b'][my_choice] += 1

            game['my_pulls'][my_choice] += 1
            game['opp_pulls'][opp_choice] += 1
            game['all_pulls'][my_choice] += 1
            game['all_pulls'][opp_choice] += 1

            game['pick_epsilon'] *= game['pick_epsilon_decay']

    # Only ordinary errors are swallowed : a bare `except:` would also catch
    # KeyboardInterrupt / SystemExit and make a long batch of games impossible to stop.
    except Exception:
        print('\nError',sys.exc_info())
        print(game)

    return game

In [None]:
def run_sim(n_levers=80, n_rounds=2000, decay=0.97):
    """Simulate one game with a randomly drawn configuration for both players.

    Args:
        n_levers: number of bandits in the game.
        n_rounds: number of rounds to play.
        decay: multiplier applied to a bandit's probability after each pull.

    Returns:
        dict: the finished ``game``, including the drawn configuration and
        ``game['score']`` (our score minus the opponent's score).
    """

    ### CREATE A RANDOM CONFIGURATION FOR OUR AGENT AND THE OPPONENT

    game = dict()

    game['n_levers'] = n_levers
    game['n_rounds'] = n_rounds
    game['decay'] = decay
    game['my_score'] = 0
    game['opp_score'] = 0
    game['beta_a'] = np.ones(game['n_levers'])
    game['beta_b'] = np.ones(game['n_levers'])
    game['my_pulls'] = np.zeros(game['n_levers'])
    game['opp_pulls'] = np.zeros(game['n_levers'])
    game['all_pulls'] = np.zeros(game['n_levers'])
    game['true_prob'] = np.random.random(size=game['n_levers'])

    # Parameters that the drawn method does not use are set to None (or -1 for
    # the epsilon settings, which analyse() filters out of its regression plots).
    # NOTE: 'thomson' is the spelling compute_est recognises in this cell - keep them in sync.
    est_method = np.random.choice(['ucb','thomson','mean'])
    est_ucb_percentile = (0.25 + np.random.random()*0.75) if est_method == 'ucb' else None
    pick_method = np.random.choice(['weighted','greedy','epsilon','random','stupid'])
    pick_weighted_alpha = (1 + np.random.random()*5) if pick_method in ['weighted','epsilon'] else None
    pick_epsilon = np.random.random() if pick_method in ['weighted','epsilon'] else -1
    pick_epsilon_decay = np.random.random() if pick_method in ['weighted','epsilon'] else -1
    min_opp_quality = np.random.random()
    opp_retry_factor = np.random.random()*2
    opp_pick_method = np.random.choice(['weighted','greedy','epsilon','random','stupid'])
    opp_alpha = (1 + np.random.random()*3) if opp_pick_method in ['weighted','epsilon'] else None
    opp_epsilon = np.random.random() if opp_pick_method in ['weighted','epsilon'] else None


    config = dict(zip('est_method,est_ucb_percentile,pick_method,pick_weighted_alpha,pick_epsilon,pick_epsilon_decay,min_opp_quality,opp_retry_factor,opp_pick_method,opp_alpha,opp_epsilon'.split(','),
                      (est_method,est_ucb_percentile,pick_method,pick_weighted_alpha,pick_epsilon,pick_epsilon_decay,min_opp_quality,opp_retry_factor,opp_pick_method,opp_alpha,opp_epsilon)))
    game.update(config)


    game = simulate(game)


    # Positive means we beat the opponent
    game['score'] = -game['opp_score']+game['my_score']

    return game
    

n_samples = 2000   # number of simulated games, each with its own randomly drawn configuration
samples = []       # one finished `game` dict per simulation

# Run the simulations with a progress bar
for _ in tqdm(range(n_samples)):
    samples.append(run_sim())
    


In [None]:
# One row per simulated game, best score first, restricted to the scores and the configuration columns
data = pd.DataFrame(samples).sort_values('score',ascending=False)['score,my_score,opp_score,est_method,est_ucb_percentile,pick_method,pick_weighted_alpha,original_epsilon,pick_epsilon_decay,min_opp_quality,opp_pick_method,opp_alpha,opp_epsilon'.split(',')]

# Keep only the games where neither side used the 'stupid' or 'random' pick method
to_study = data[~data['opp_pick_method'].isin(['stupid','random']) & ~data['pick_method'].isin(['stupid','random'])]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def analyse(df,cat='box'):
    """Plot "score" against every column of ``df`` from the fourth one onwards.

    Columns with fewer than 10 distinct values get a box plot (``cat='box'``)
    or a violin plot (any other ``cat``). The remaining columns get a
    regression plot, with rows equal to the -1 sentinel left out.
    """
    categorical_plot = sns.boxplot if cat == 'box' else sns.violinplot

    for column in df.columns[3:]:
        if df[column].nunique() >= 10:
            # many distinct values : regression plot, skipping the -1 sentinel rows
            without_sentinel = df[df[column] != -1]
            sns.lmplot(x=column, y="score", data=without_sentinel, height=5, aspect=2)
            plt.title(column)
        else:
            # few distinct values : one distribution per value
            plt.figure(figsize=(16,5))
            axes = categorical_plot(x=column, y="score", data=df)
            axes.set_title(column)
        plt.show()


- Hard to tell which sampling method performs best (will dig deeper)
- A higher UCB percentile seems better
- Stupid and random perform worst as expected ;-)
- A lower alpha (less greedy) seems better
- Slower epsilon decay seems best

In [None]:
# Violin plots over every simulated game
analyse(data,cat='violin')

# Restricting to non-random/stupid


In [None]:
# Box plots over the games where neither side used the 'stupid' or 'random' pick method
analyse(to_study)