References:
* [Santa 2020 starter](https://www.kaggle.com/isaienkov/santa-2020-starter/): Re-used writefile magic command and make_env function for creating a simulation.  

In [None]:
!pip install kaggle-environments --upgrade -q

## Thompson Sampling

Based on Lilian Weng's blog post "The Multi-Armed Bandit Problem and Its Solutions": https://lilianweng.github.io/lil-log/2018/01/23/the-multi-armed-bandit-problem-and-its-solutions.html

In [None]:
%%writefile thompson.py

import numpy as np

# Module-level state shared between successive calls to `agent`.
post_a = None  # per-arm Beta "a" parameters; allocated by `agent` on step 0
post_b = None  # per-arm Beta "b" parameters; allocated by `agent` on step 0
bandit = None  # arm returned by the previous call
total_reward = 0  # cumulative reward seen so far; used to derive the last step's reward
c = 3  # not used by the sampling logic of `agent` in this file


def agent(observation, configuration):
    """Thompson-sampling agent for the multi-armed bandit environment.

    One Beta(a, b) posterior is kept per arm. On every call the reward earned
    by the previous pull is folded into that arm's posterior, one sample is
    drawn from every posterior, and the arm with the largest sample is pulled.

    Args:
        observation: exposes `step` (0 on the first call) and `reward`
            (cumulative reward so far).
        configuration: exposes `banditCount`, the number of arms.

    Returns:
        int: index of the arm to pull.
    """
    global total_reward, bandit, post_a, post_b

    n_bandits = configuration.banditCount

    if observation.step == 0:
        # Fresh episode: uniform Beta(1, 1) prior for every arm.
        post_a = np.ones(n_bandits)
        post_b = np.ones(n_bandits)
        # Reset the reward bookkeeping too, so a stale total left over from an
        # earlier episode in the same process cannot produce a negative delta.
        total_reward = 0
    else:
        # The observation carries the cumulative reward; recover the last one.
        r = observation.reward - total_reward
        total_reward = observation.reward

        # Update the Beta posterior of the arm pulled on the previous step.
        post_a[bandit] += r
        post_b[bandit] += (1 - r)

    samples = np.random.beta(post_a, post_b)
    bandit = int(np.argmax(samples))

    return bandit

## Simulations

In [None]:
from kaggle_environments import make

# Create the "mab" environment that the match in the next cell is run on.
env = make("mab", debug=True)

In [None]:
# Play one match between the submission.py found in the santa-2020 input
# directory and the thompson.py agent written above, then render it inline.
env.run(["../input/santa-2020/submission.py", "thompson.py"])
env.render(mode="ipython", width=800, height=500)

In [None]:
%%writefile bayesian_ucb.py
import numpy as np
from scipy.stats import beta

# Module-level state shared between successive calls to `agent`:
# per-arm Beta parameters (allocated on step 0) and the arm pulled last.
post_a, post_b, bandit = [None] * 3
total_reward = 0  # cumulative reward seen so far
c = 3  # multiplier applied to the posterior standard deviation in the bound

def agent(observation, configuration):
    """Bayesian-UCB agent for the multi-armed bandit environment.

    Keeps one Beta(a, b) posterior per arm and pulls the arm that maximises
    `posterior mean + c * posterior standard deviation`.

    Args:
        observation: exposes `step` (0 on the first call) and `reward`
            (cumulative reward so far).
        configuration: exposes `banditCount`, the number of arms.

    Returns:
        int: index of the arm to pull.
    """
    global total_reward, bandit, post_a, post_b

    if observation.step == 0:
        # Uniform Beta(1, 1) prior; the two rows unpack into `a` and `b`.
        post_a, post_b = np.ones((2, configuration.banditCount))
        # Reset the reward bookkeeping so a stale total from an earlier
        # episode in the same process cannot corrupt the first update.
        total_reward = 0
    else:
        r = observation.reward - total_reward
        total_reward = observation.reward
        # Update the Beta posterior of the arm pulled on the previous step.
        post_a[bandit] += r
        post_b[bandit] += 1 - r

    # Upper confidence bound: posterior mean plus c posterior std deviations.
    bound = post_a / (post_a + post_b) + beta.std(post_a, post_b) * c
    bandit = int(np.argmax(bound))

    return bandit

In [None]:
%%writefile ucb_decay.py
import numpy as np

decay = 0.97  # factor every credited reward is multiplied by in `agent`
total_reward = 0  # cumulative reward seen so far
bandit = None  # arm returned by the previous call

def agent(observation, configuration):
    """UCB agent whose credited rewards are scaled by `decay`.

    Each arm is scored as `average reward + sqrt(2 * ln(step + 1) / pulls)`
    and the best-scoring arm is pulled. Counters start at 1e-32 rather than 0,
    so untried arms receive an enormous exploration bonus without dividing by
    zero.

    Args:
        observation: exposes `step` and the cumulative `reward`.
        configuration: exposes `banditCount`.

    Returns:
        int: index of the arm to pull.
    """
    global reward_sums, n_selections, total_reward, bandit

    arm_count = configuration.banditCount
    step = observation.step

    if step != 0:
        # Credit the (scaled) reward of the previous pull to its arm.
        gained = observation.reward - total_reward
        reward_sums[bandit] += decay * gained
        total_reward = observation.reward
    else:
        # Row 0 -> pull counts, row 1 -> reward sums.
        n_selections, reward_sums = np.full((2, arm_count), 1e-32)

    exploration_bonus = np.sqrt(2 * np.log(step + 1) / n_selections)
    scores = reward_sums / n_selections + exploration_bonus
    bandit = int(np.argmax(scores))

    # The pull is counted immediately, before its outcome is known.
    n_selections[bandit] += 1
    return bandit

In [None]:
%%writefile epsilon_greedy_decay.py
import math
import random

epsilon = 0.1  # probability of pulling a uniformly random arm

last_bandit = -1  # arm pulled on the previous step; -1 means "no pull yet"
total_reward = 0  # cumulative reward seen so far

sums_of_reward = None  # per-arm reward totals; allocated by `agent` on step 0
numbers_of_selections = None  # per-arm pull counts; allocated by `agent` on step 0
random.seed(42)  # seeds the `random` module once, when this file is loaded

def agent(observation, configuration):
    """Epsilon-greedy agent with a per-arm decay on the estimated payout.

    With probability `epsilon` a uniformly random arm is pulled. Otherwise the
    arm with the highest `0.94 ** pulls * average reward` is pulled; arms that
    were never pulled score 1e300 so they are tried first.

    Args:
        observation: exposes `step` and the cumulative `reward`.
        configuration: exposes `banditCount`.

    Returns:
        int: index of the arm to pull.
    """
    global sums_of_reward, numbers_of_selections, last_bandit, total_reward

    n_bandits = configuration.banditCount

    if observation.step == 0:
        numbers_of_selections = [0] * n_bandits
        sums_of_reward = [0] * n_bandits
        # Start the episode with clean bookkeeping.
        last_bandit = -1
        total_reward = 0

    if last_bandit > -1:
        # Credit the reward of the previous pull to the arm that earned it.
        reward = observation.reward - total_reward
        sums_of_reward[last_bandit] += reward
        total_reward += reward

    if random.random() < epsilon:
        bandit = random.randint(0, n_bandits - 1)
    else:
        bandit = 0
        max_upper_bound = 0

        for i in range(n_bandits):
            if numbers_of_selections[i] > 0:
                decay = 0.94 ** numbers_of_selections[i]
                upper_bound = decay * sums_of_reward[i] / numbers_of_selections[i]
            else:
                upper_bound = 1e300  # untried arm: always preferred
            if upper_bound > max_upper_bound and last_bandit != i:
                max_upper_bound = upper_bound
                bandit = i
                # NOTE(review): updating last_bandit inside the scan means the
                # "skip the previously pulled arm" filter only holds until the
                # first candidate is accepted. Kept as-is so the greedy choice
                # is unchanged.
                last_bandit = bandit

    # Always remember the arm actually returned. Previously, when no candidate
    # was accepted and the scan fell back to arm 0, last_bandit kept its old
    # value and the next reward was credited to the wrong arm.
    last_bandit = bandit
    numbers_of_selections[bandit] += 1

    return bandit

In [None]:
%%writefile multi_armed.py

import json
import numpy as np
import pandas as pd

bandit_state = None  # per-arm [a, b] Beta parameters; built on step 0
total_reward = 0  # cumulative reward seen so far
last_step = None  # arm returned by the previous call
    
def multi_armed_bandit_agent(observation, configuration):
    """Thompson-style sampler that also decays arms after every pull.

    Each arm keeps an `[a, b]` pair used as Beta parameters. A win adds the
    reward to `a`, a miss adds `no_reward_step` to `b`. The `a` value of both
    arms pulled on the previous step (ours and the opponent's) is then shrunk
    towards 1 by `decay_rate`. Finally one Beta sample is drawn per arm and
    the arm with the largest sample is returned.

    Args:
        observation: read through both attribute access (`step`,
            `lastActions`) and item access (`"reward"`).
        configuration: read through item access (`"banditCount"`).

    Returns:
        Index of the chosen arm.
    """
    global bandit_state, total_reward, last_step

    no_reward_step = 0.1
    decay_rate = 0.97  # applied to the win count of every arm pulled last step

    if observation.step != 0:
        gained = observation["reward"] - total_reward
        total_reward = observation["reward"]

        # Work out which slot of lastActions is ours by matching our last move.
        player = int(last_step == observation.lastActions[1])
        own_arm = bandit_state[observation.lastActions[player]]
        if gained > 0:
            own_arm[0] += gained
        else:
            own_arm[1] += no_reward_step

        # Decay both pulled arms (twice when both players chose the same arm).
        for slot in (0, 1):
            pulled = bandit_state[observation.lastActions[slot]]
            pulled[0] = (pulled[0] - 1) * decay_rate + 1
    else:
        bandit_state = [[1, 1] for _ in range(configuration["banditCount"])]

    # One draw per arm, in arm order; the first largest draw wins.
    draws = [
        np.random.beta(bandit_state[k][0], bandit_state[k][1])
        for k in range(configuration["banditCount"])
    ]
    best_agent = max(range(len(draws)), key=draws.__getitem__, default=None)

    last_step = best_agent
    return best_agent

In [None]:
%%writefile agent2.py
import math, random


# Per-arm bookkeeping shared between calls to `agent2` (100 arms hard-coded).
history = {
    "turn": 0,  # not read or written by `agent2`
    "cnts": [0] * 100,  # how many times we pulled each arm
    "ocnts": [0] * 100,  # how many times the opponent pulled each arm
    "hits": [0] * 100,  # decay-compensated reward credit per arm
    "osteps": [0] * 100,  # step at which the opponent's pull of each arm was last recorded
    "la": -1,  # our last action
}

def agent2(observation, configuration):
    """Softmax (Boltzmann) agent that also scores arms by opponent activity.

    Every arm gets a weight `exp(value / tau)`, where `value` blends our own
    decay-compensated hit rate with a rank derived from how often, and how
    recently, the opponent pulled the arm. The temperature `tau` shrinks as
    the game progresses. The returned arm is sampled in proportion to the
    weights.

    Args:
        observation: mapping with "step", "reward" (cumulative) and
            "lastActions" (both players' previous arms).
        configuration: unused.

    Returns:
        int: index of the arm to pull (0..99).
    """
    global history

    N = 100
    p = [4.60575, 0.000629018, 1.82229]
    ti = observation["step"]
    if ti > 0:
        la = history["la"]
        # With two players, the sum of both actions minus ours is the opponent's.
        ola = sum(observation['lastActions']) - la
        history["osteps"][ola] = ti
        # A win shows up as an increase of the cumulative reward. The previous
        # test `sum(hits) < reward` missed wins, because each hit is stored
        # with a weight >= 1 and the sum therefore overtakes the reward count.
        if observation['reward'] > history.get("reward", 0):
            history["hits"][la] += 1 / pow(0.97, history["cnts"][la] + history["ocnts"][la])
        history["cnts"][la] += 1
        history["ocnts"][ola] += 1
    history["reward"] = observation['reward']

    tau = p[0] / (ti + 1) + p[1]
    ea = [0] * N
    hits = history["hits"]
    cnts = history["cnts"]
    ocnts = history["ocnts"]
    osteps = history["osteps"]

    # Rank arms by opponent usage (most pulled first, ties by older last pull);
    # ot[i] is 99 for the top-ranked arm and 0 for the last one.
    tv = sorted([(-ocnts[i], osteps[i], i) for i in range(N)])
    ot = [0] * N
    for i in range(N):
        ot[tv[i][2]] = 99 - i

    for i in range(N):
        if cnts[i] == 0:
            # Never pulled by us: rely on the opponent rank, or a fixed 0.99.
            if ocnts[i] > 1:
                ea[i] = math.exp(ot[i] / 100 * pow(0.97, ocnts[i]) / tau)
            else:
                ea[i] = math.exp(0.99 * pow(0.97, ocnts[i]) / tau)
        else:
            # Blend our own hit rate (weight w) with the opponent rank (weight wo).
            w = pow(cnts[i], p[2])
            wo = ocnts[i]
            if ocnts[i] < 2:
                wo = 0
            r = hits[i] / cnts[i]
            ro = ot[i] / 100
            ea[i] = math.exp((r * w + ro * wo) / (w + wo) * pow(0.97, cnts[i] + ocnts[i]) / tau)

    # Roulette-wheel sampling proportional to ea.
    se = sum(ea)
    r = random.random() * se
    t = 0
    la = 99
    for i in range(N):
        t += ea[i]
        if t >= r:
            la = i
            break

    history["la"] = la
    return la

In [None]:
%%writefile agent3.py
import math, random


# Per-arm bookkeeping shared between calls to `agent3` (100 arms hard-coded).
history = {
    "turn": 0,  # not read or written by `agent3`
    "cnts": [0] * 100,  # how many times we pulled each arm
    "ocnts": [0] * 100,  # how many times the opponent pulled each arm
    "hits": [0] * 100,  # decay-compensated reward credit per arm
    "osteps": [0] * 100,  # step at which the opponent's pull of each arm was last recorded
    "la": -1,  # our last action
}

NU=500  # cap applied to every exponent passed to math.exp in `agent3`
epson=0.97  # per-pull decay base used in `agent3`
def agent3(observation, configuration):
    """Softmax (Boltzmann) agent with clamped exponents.

    Every arm gets a weight `exp(min(NU, value / tau))`, where `value` blends
    our own decay-compensated hit rate with a rank derived from how often, and
    how recently, the opponent pulled the arm. The returned arm is sampled in
    proportion to the weights.

    Args:
        observation: mapping with "step", "reward" (cumulative) and
            "lastActions" (both players' previous arms).
        configuration: unused.

    Returns:
        int: index of the arm to pull (0..99).
    """
    global history

    N = 100
    p = [0.39918, 0.000138129, 1.23946]
    ti = observation["step"]
    if ti > 0:
        la = history["la"]
        # With two players, the sum of both actions minus ours is the opponent's.
        ola = sum(observation['lastActions']) - la
        history["osteps"][ola] = ti
        # A win shows up as an increase of the cumulative reward. The previous
        # test `sum(hits) < reward` missed wins, because each hit is stored
        # with a weight >= 1 and the sum therefore overtakes the reward count.
        if observation['reward'] > history.get("reward", 0):
            history["hits"][la] += 1 / pow(epson, history["cnts"][la] + history["ocnts"][la])
        history["cnts"][la] += 1
        history["ocnts"][ola] += 1
    history["reward"] = observation['reward']

    tau = p[0] / (ti + 1) + p[1]
    ea = [0] * N
    hits = history["hits"]
    cnts = history["cnts"]
    ocnts = history["ocnts"]
    osteps = history["osteps"]

    # Rank arms by opponent usage (most pulled first, ties by older last pull);
    # ot[i] is 99 for the top-ranked arm and 0 for the last one.
    tv = sorted([(-ocnts[i], osteps[i], i) for i in range(N)])
    ot = [0] * N
    for i in range(N):
        ot[tv[i][2]] = 99 - i

    for i in range(N):
        if cnts[i] == 0:
            # Never pulled by us: rely on the opponent rank, or a fixed 0.99.
            if ocnts[i] > 1:
                ea[i] = math.exp(min(NU, ot[i] / 100 * pow(epson, ocnts[i]) / tau))
            else:
                ea[i] = math.exp(min(NU, 0.99 * pow(epson, ocnts[i]) / tau))
        else:
            # Blend our own hit rate (weight w) with the opponent rank (weight wo).
            w = pow(cnts[i], p[2])
            wo = ocnts[i]
            if ocnts[i] < 1:
                wo = 0
            r = hits[i] / cnts[i]
            ro = ot[i] / 100
            # min(NU, ...) keeps math.exp from overflowing when tau is tiny.
            ea[i] = math.exp(min(NU, (r * w + ro * wo) / (w + wo) * pow(epson, cnts[i] + ocnts[i]) / tau))

    # Roulette-wheel sampling proportional to ea.
    se = sum(ea)
    r = random.random() * se
    t = 0
    la = 99
    for i in range(N):
        t += ea[i]
        if t >= r:
            la = i
            break

    history["la"] = la
    return la

In [None]:
%%writefile fewa.py
import random
import numpy as np

# Seed both RNGs once, when this file is loaded.
random.seed(2020)
np.random.seed(2020)

# Module-level state shared between calls to `agent`.
decay = .97  # not referenced by `filter_step` or `agent` in this file
n_ag = None  # per-arm count of our own pulls; allocated on step 0
history = None  # stack of one-hot action rows; only written by `agent`
last_a_ag = None  # our last action
last_a_op = None  # the opponent's last action
rewards = None  # stack of per-step reward rows, one column per arm
total_reward = 0  # cumulative reward seen so far
alpha = 0.1  # exponent of the confidence schedule: delta_t = delta0 / (t + 1) ** alpha
delta0 = 1  # numerator of the confidence schedule

# filter
def filter_step(k, h, t, delta_t, rewards, sigma2=1):
    """Drop the arms whose windowed mean reward is too far below the best one.

    Args:
        k: list of candidate arm indices (columns of `rewards`).
        h: window parameter; rows `t - h - 1` up to, but excluding, `t` are averaged.
        t: current step.
        delta_t: confidence level; a smaller value widens the tolerance.
        rewards: 2-D array with one row per step and one column per arm.
        sigma2: variance scale used in the confidence radius.

    Returns:
        list: the members of `k` whose gap to the best windowed mean is at
        most twice the confidence radius.
    """
    radius = np.sqrt((2 * sigma2 / (h + 1)) * np.log(1 / delta_t))

    window = rewards[(t - h - 1):t, k]
    window_means = np.mean(window, axis=0)
    gaps = np.max(window_means) - window_means.reshape(-1,)

    survivors = []
    for arm, gap in zip(k, gaps):
        if gap <= 2 * radius:
            survivors.append(arm)
    return survivors

# agent
def agent(obs, conf):
    """FEWA-style agent: filter arms on expanding windows, pull a survivor.

    The first `banditCount` steps pull every arm once, in order. Afterwards
    the candidate set is repeatedly narrowed by `filter_step` with a growing
    window `h` until a surviving arm has been pulled exactly `h` times; that
    arm is chosen, with ties broken at random.

    Args:
        obs: exposes `step`, `reward` (cumulative), `agentIndex` and
            `lastActions`.
        conf: exposes `banditCount`.

    Returns:
        int: index of the arm to pull.
    """
    global n_ag, last_a_ag, last_a_op, rewards, total_reward, history

    t = obs.step
    arm_ids = np.arange(conf.banditCount)
    if t == 0:
        # `np.int` was removed in NumPy 1.24; the builtin int is equivalent.
        n_ag = np.zeros(conf.banditCount, dtype=int)
        history = np.zeros(conf.banditCount)
        rewards = np.zeros(conf.banditCount)
        total_reward = 0  # start every episode with clean reward bookkeeping
        action = int(obs.step)
        # `history` is only written, never read, by this agent.
        hist_vector = 1 * (arm_ids == action)
        history = np.vstack((history, hist_vector))
    else:
        # Opponent's last action.
        op_ix = (obs.agentIndex + 1) % len(obs.lastActions)
        last_a_op = obs.lastActions[op_ix]
        # Count our own previous pull.
        n_ag[last_a_ag] += 1
        hist_vector = 1 * (arm_ids == last_a_op)
        history = np.vstack((history, hist_vector))
        # The reward was earned by the arm WE pulled, so it belongs in that
        # arm's column (it used to be written into the opponent's column).
        r = obs.reward - total_reward
        r_vector = r * (arm_ids == last_a_ag)
        rewards = np.vstack((rewards, r_vector))
        total_reward = obs.reward

        if t < conf.banditCount:
            # Warm-up: pull every arm once, in order.
            action = int(obs.step)
        else:
            # Confidence level shrinks slowly with time.
            delta_t = delta0 / ((t + 1) ** alpha)
            h = 0
            k = list(range(conf.banditCount))
            it = None
            while it is None:
                k = filter_step(k, h, t, delta_t, rewards)
                h += 1
                # Is there a surviving arm that we pulled exactly h times?
                pulled_h_times = n_ag[k] == h
                if pulled_h_times.any():
                    if pulled_h_times.sum() > 1:
                        # Several such arms: break the tie at random.
                        it = int(np.random.choice(np.array(k)[pulled_h_times]))
                    else:
                        ix = np.argmin(n_ag[k])
                        it = int(k[ix])
            action = it

    last_a_ag = action
    return action

In [None]:
%%writefile agent4.py
import random

# Module-level state shared between calls to `agent` (100 arms hard-coded).
moves_stack = [x for x in range(100)]  # queue of arms to pull next; consumed from the front
oppo = []  # opponent actions recorded by `agent` once the step exceeds 5
prev_reward = 0  # cumulative reward at the previous call
prev_action = 0  # arm pulled by the previous call
total_bnd = [0 for x in range(100)]  # number of our pulls per arm
won_bnd = [0 for x in range(100)]  # number of our rewarded pulls per arm


def get_bandit():
    """Pick the next arm once the move queue has run dry.

    Arms are scanned in index order. The first arm with at most 3 pulls is
    returned immediately. If every arm has more than 3 pulls, the first arm
    with the highest win ratio is returned, or arm 0 when no ratio is positive.

    Returns:
        int: index of the arm to pull.
    """
    ratios = []
    for idx in range(100):
        pulls = total_bnd[idx]
        if pulls <= 3:
            # Under-explored arm: try it before trusting any ratio.
            return idx
        ratios.append(won_bnd[idx] / pulls)

    top_ratio = max(ratios)
    if top_ratio > 0:
        return ratios.index(top_ratio)
    return 0


def agent(obs, conf):
    """Queue-driven agent: replay winning arms and copy a persistent opponent.

    Arms are pulled from the front of `moves_stack`. A rewarded arm is pushed
    back to the front so it is pulled again; an arm the opponent pulled three
    times in a row is pushed to the front as well. When the queue is empty,
    `get_bandit` chooses the next arm.

    Args:
        obs: exposes `step`, `reward` (cumulative) and, through item access,
            'agentIndex' and 'lastActions'.
        conf: unused.

    Returns:
        int: index of the arm to pull.
    """
    global moves_stack, oppo, prev_reward, prev_action, total_bnd, won_bnd

    # Episodes start at step 0. The first-move shortcut used to test for
    # step == 1, which skipped the reward bookkeeping of the step-0 pull and
    # later credited that reward to the arm pulled at step 1.
    if obs.step == 0:
        # Start every episode from a clean slate.
        moves_stack = list(range(100))
        oppo = []
        prev_reward = 0
        total_bnd = [0] * 100
        won_bnd = [0] * 100

        prev_action = moves_stack.pop(0)
        total_bnd[prev_action] += 1
        return prev_action

    my_idx = obs['agentIndex']

    if obs.step > 5:
        oppo.append(obs['lastActions'][1 - my_idx])

    reward_this_time = obs.reward - prev_reward
    prev_reward = obs.reward

    if reward_this_time > 0:
        # The last arm paid out: pull it again next.
        moves_stack.insert(0, prev_action)
        won_bnd[prev_action] += 1

    if len(oppo) >= 3:
        # The opponent stuck to one arm for three steps: try it as well.
        if oppo[-1] == oppo[-2] and oppo[-1] == oppo[-3]:
            moves_stack.insert(0, oppo[-1])

    if len(moves_stack) == 0:
        moves_stack.insert(0, get_bandit())

    prev_action = moves_stack.pop(0)

    total_bnd[prev_action] += 1
    return prev_action

In [None]:
%%writefile sirish.py
import numpy as np
import pandas as pd
import random, os, datetime

total_reward = 0  # cumulative reward seen so far
bandit_dict = {}  # arm index -> {'win', 'loss', 'opp'} counters; rebuilt on step 0
# Penalty subtracted in `get_next_bandit` from the score of any arm the opponent has pulled.
NUM=1.2 #1.5  #1.6_0.66  #1_0.46  #0.5_0.66   #0.25_0.73  #1.4_0.4  1.3_0.26

def set_seed(my_seed=44):
    """Seed `random` and NumPy, and export the seed as PYTHONHASHSEED.

    Args:
        my_seed: seed value shared by every generator.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(my_seed)
    os.environ['PYTHONHASHSEED'] = str(my_seed)

def get_next_bandit():
    """Return the arm with the highest heuristic score.

    The score is `(win - loss + opp - penalty) / (win + loss + opp)`, where
    the penalty is `NUM` for arms the opponent has pulled at least once and 0
    otherwise. Arm 0 is returned when no score is strictly positive.

    Returns:
        Key of the best arm in `bandit_dict`.
    """
    chosen, top_score = 0, 0
    for arm, stats in bandit_dict.items():
        wins, losses, opp_pulls = stats['win'], stats['loss'], stats['opp']
        penalty = (opp_pulls > 0) * NUM
        score = (wins - losses + opp_pulls - penalty) / (wins + losses + opp_pulls)
        if score > top_score:
            chosen, top_score = arm, score
    return chosen

def multi_armed_probabilities(observation, configuration):
    """Agent entry point: track win/loss/opponent counts and pick the best arm.

    On step 0 the RNGs are seeded, the counters are rebuilt, and the random
    arm drawn at the top of the call is returned. On later steps the outcome
    of the previous pull and the opponent's pull are recorded, and
    `get_next_bandit` chooses the arm.

    Args:
        observation: mapping with 'step', 'reward' (cumulative), 'agentIndex'
            and 'lastActions'.
        configuration: mapping with 'banditCount'.

    Returns:
        Index of the arm to pull.
    """
    global total_reward, bandit_dict

    # Drawn on every call, before any seeding; only used on step 0.
    random_pull = random.randrange(configuration['banditCount'])

    if observation['step'] == 0:
        set_seed()
        total_reward = 0
        bandit_dict = {
            arm: {'win': 1, 'loss': 0, 'opp': 0}
            for arm in range(configuration['banditCount'])
        }
        return random_pull

    last_reward = observation['reward'] - total_reward
    total_reward = observation['reward']

    my_idx = observation['agentIndex']
    own_arm = observation['lastActions'][my_idx]
    outcome = 'win' if 0 < last_reward else 'loss'
    bandit_dict[own_arm][outcome] += 1

    rival_arm = observation['lastActions'][1 - my_idx]
    bandit_dict[rival_arm]['opp'] += 1

    return get_next_bandit()

In [None]:
%%writefile greedy_desion_tree.py
import pickle
import random

import numpy as np
import pandas as pd
import sklearn.tree as skt

# Parameters
FUDGE_FACTOR = 0.99  # arms predicted within this fraction of the best prediction count as ties
VERBOSE = False  # when True, the strategy prints its choices and rewards
DATA_FILE = '/kaggle/input/sample-training-data/training_data_201223.parquet'  # training table read by make_model
TRAIN_FEATS = ['round_num', 'n_pulls_self', 'n_success_self', 'n_pulls_opp']  # feature columns, in model input order
TARGET_COL = 'payout'  # regression target column


def make_model():
    """Build a decision-tree regressor from the stored training data.

    Reads `DATA_FILE` and fits `TARGET_COL` on the `TRAIN_FEATS` columns.

    Returns:
        The fitted `DecisionTreeRegressor`.
    """
    training = pd.read_parquet(DATA_FILE)
    features, target = training[TRAIN_FEATS], training[TARGET_COL]
    regressor = skt.DecisionTreeRegressor(min_samples_leaf=40)
    regressor.fit(features, target)
    return regressor


class GreedyStrategy:
    """Greedy strategy that pulls the machine with the best predicted payout.

    - Tracks, per machine, our pulls, our successes and the opponent's pulls
    - Feeds those counts (plus the round number) to a decision-tree model
    - Chooses at random among the machines whose prediction is within
      FUDGE_FACTOR of the best prediction
    """

    def __init__(self, name, agent_num, n_machines):
        """Set up the counters, train the model and predict every machine.

        Args:
           name (str):   Name for the agent
           agent_num (int):   Assigned player number
           n_machines (int):   number of machines in the game
        """
        self.name = name
        self.agent_num = agent_num
        self.n_machines = n_machines

        # Per-machine counters.
        self.n_pulls_self = np.array([0] * n_machines)
        self.n_success_self = np.array([0.] * n_machines)
        self.n_pulls_opp = np.array([0] * n_machines)

        # Opponent's moves so far; its length doubles as the round number.
        self.opp_moves = []

        # Cumulative reward at the previous update.
        self.last_reward_count = 0

        self.model = make_model()

        # Initial prediction for every machine, one feature row per machine.
        initial = np.zeros((self.n_machines, 4))
        columns = (len(self.opp_moves), self.n_pulls_self,
                   self.n_success_self, self.n_pulls_opp)
        for col, values in enumerate(columns):
            initial[:, col] = values
        self.predicts = self.model.predict(initial)

    def __call__(self):
        """Choose a machine among those with (near-)maximal predicted payout.

        Returns:
           <result> (int):  index of machine to pull
        """
        scores = self.predicts
        threshold = FUDGE_FACTOR * np.max(scores)
        candidates = np.flatnonzero(scores >= threshold)
        result = np.random.choice(candidates)

        if VERBOSE:
            print('  - Chose machine %i with expected return of %3.2f' % (
                int(result), scores[result]))

        return int(result)

    def _feature_row(self, index):
        """Model input for one machine: round number and its three counters."""
        return [
            len(self.opp_moves),
            self.n_pulls_self[index],
            self.n_success_self[index],
            self.n_pulls_opp[index],
        ]

    def updateDist(self, curr_total_reward, last_m_indices):
        """Record the outcome of the last round and refresh the predictions.

        Args:
           curr_total_reward:   cumulative reward after the last round
           last_m_indices:   machines pulled last round, indexed by player
               number; anything that does not hold exactly two entries only
               updates the reward bookkeeping
        """
        last_reward = curr_total_reward - self.last_reward_count
        self.last_reward_count = curr_total_reward
        if VERBOSE:
            print('Last reward: %i' % last_reward)

        if len(last_m_indices) != 2:
            return

        m_index = last_m_indices[self.agent_num]
        opp_index = last_m_indices[(self.agent_num + 1) % 2]

        self.n_pulls_self[m_index] += 1
        self.n_pulls_opp[opp_index] += 1
        self.n_success_self[m_index] += last_reward
        self.opp_moves.append(opp_index)

        # Only the two machines that were just pulled need a new prediction.
        touched = [opp_index, m_index]
        self.predicts[touched] = self.model.predict(
            [self._feature_row(index) for index in touched])
            

def agent(observation, configuration):
    """Agent entry point: build the strategy on step 0, then update and act.

    Args:
        observation: exposes `step` and, through item access, 'agentIndex',
            'reward' (cumulative) and 'lastActions'.
        configuration: mapping with 'banditCount'.

    Returns:
        int: index of the machine to pull.
    """
    global curr_agent

    if observation.step == 0:
        player = observation['agentIndex']
        curr_agent = GreedyStrategy(
            'Mr. Agent %i' % player, player, configuration['banditCount'])

    # Fold in last round's result before choosing.
    curr_agent.updateDist(observation['reward'], observation['lastActions'])
    choice = curr_agent()
    return choice

In [None]:
%%writefile santa3.py

import numpy as np
from scipy.stats import beta

# AGENT CONFIG


# How do we evaluate the bandit performance ? 
est_method = ['mean','thompson','ucb'][2]      # choosing UCB here
est_ucb_percentile = 0.75                      # percentile for UCB : higher is more optimistic

# Having evaluated the bandits, how do we select a candidate (exploration / exploitation) ?
pick_method = ['epsilon','greedy','weighted','random','stupid'][0]    # choosing epsilon greedy here
pick_weighted_alpha = 1                                               # for the weighted sampling, higher is greedier
pick_epsilon = 0.5                                                    # for epsilon-greedy, higher is more exploration; multiplied by pick_epsilon_decay on every call of logic()
pick_epsilon_decay = 0.997                                            # lower means we shift to exploitation faster

# How can we use the information provided by the actions of the other / rival bot?
min_opp_quality = 0.2     # higher means we believe the other bot knows what they are doing, even if they seem to play poorly
opp_retry_factor = 1.     # higher means we care about exploiting a bandit found by the other bot (stealing) more than a bandit we identified

# GAME CONFIG

decay = 0.97      # per-pull multiplier applied to the estimates in logic()
n_levers = None   # number of bandits, read from the configuration on step 0

# Global variables, initialised on step 0 by logic()
my_score = 0      # keep track of my score
beta_a = None     # per-bandit Beta "a" parameters
beta_b = None     # per-bandit Beta "b" parameters
my_pulls = None   # how many times we pulled each bandit
opp_pulls = None  # how many times the rival bot pulled each bandit
all_pulls = None  # pulls by both bots combined
opp_quality = 0   # this will reflect how well we believe the other bot is playing


# EXECUTION

def logic(observation, configuration):
    """One decision step: update the beliefs, estimate every bandit, pick one.

    On step 0 the per-bandit Beta parameters and pull counters are created.
    On later steps the previous round is folded in. The estimates from
    `merge_all_info` / `compute_est` are then decayed by the total number of
    pulls each bandit received, and `pick_bandit` chooses. `pick_epsilon`
    shrinks by `pick_epsilon_decay` on every call.

    Args:
        observation: exposes `step`; passed on to `get_actions` and
            `compute_reward`.
        configuration: exposes `banditCount`.

    Returns:
        int: index of the bandit to pull.
    """
    global n_levers, beta_a, beta_b, my_pulls, opp_pulls, all_pulls, pick_epsilon

    if observation.step != 0:
        # Learn from the previous round.
        mine, theirs = get_actions(observation)
        gained = compute_reward(observation)

        # Posterior update for the bandit we pulled; the decay is applied below.
        beta_a[mine] += gained
        beta_b[mine] += 1 - gained

        my_pulls[mine] += 1
        opp_pulls[theirs] += 1
        for pulled in (mine, theirs):
            all_pulls[pulled] += 1
    else:
        # First round: uniform Beta(1, 1) priors and zeroed counters.
        n_levers = configuration.banditCount
        beta_a, beta_b = np.ones(n_levers), np.ones(n_levers)
        my_pulls, opp_pulls, all_pulls = (np.zeros(n_levers) for _ in range(3))

    merged_a, merged_b = merge_all_info()
    estimates = compute_est(merged_a, merged_b)
    # Every pull, by either bot, lowers a bandit's payout by `decay`.
    decayed = estimates * decay**all_pulls
    chosen = pick_bandit(decayed)

    # Progressively favour exploitation over exploration.
    pick_epsilon *= pick_epsilon_decay

    return int(chosen)


# MECHANICS    


def compute_est(a,b):
    """Turn per-bandit Beta(a, b) distributions into one estimate per bandit.

    The module-level `est_method` selects the estimator:
      - 'thompson': one random sample from each distribution
      - 'ucb': the `est_ucb_percentile` quantile of each distribution
        (the PPF is used here rather than a mean + c * std bound)
      - 'mean': the distribution mean, which ignores the uncertainty

    Returns:
        Array of estimates, or None for an unrecognised `est_method`.
    """
    estimators = {
        'thompson': lambda: np.random.beta(a, b),
        'ucb': lambda: beta.ppf(est_ucb_percentile,a,b),
        'mean': lambda: a / (a + b),
    }
    estimator = estimators.get(est_method)
    if estimator is None:
        return None
    return estimator()


def pick_bandit(est_prob,pick_method=pick_method):
    """Choose a bandit from the per-bandit estimates.

    Args:
        est_prob: array with one estimate per bandit.
        pick_method: one of
            'greedy'   - always the highest estimate
            'epsilon'  - with probability `pick_epsilon` explore through the
                         'weighted' method, otherwise act greedily
            'weighted' - sample in proportion to est_prob ** pick_weighted_alpha
            'random'   - uniform over all bandits
            'stupid'   - always the lowest estimate
            The default is the module-level `pick_method` at definition time.

    Returns:
        Index of the chosen bandit, or None for an unrecognised method.
    """
    method = pick_method
    if method == 'epsilon':
        # A single uniform draw decides between exploring and exploiting.
        method = 'weighted' if np.random.random() < pick_epsilon else 'greedy'

    if method == 'greedy':
        return int(np.argmax(est_prob))
    if method == 'stupid':
        return int(np.argmin(est_prob))
    if method == 'random':
        return np.random.choice(range(len(est_prob)))
    if method == 'weighted':
        weights = est_prob**pick_weighted_alpha
        weights = weights / weights.sum()
        return np.random.choice(range(len(est_prob)),p=weights)
    return None



    

# INCORPORATE OPPONENT INFORMATION

def compute_opp_quality():
    """Estimate how well the opponent plays and store it in `opp_quality`.

    The quality is the correlation between the opponent's pull counts and our
    own, independent estimate of each bandit: a high value means they pull
    the bandits we believe are good.

    When either vector is constant (always the case on the first step, before
    the opponent has pulled anything) the correlation is undefined; the
    quality is then set to 0.0 instead of NaN, which also avoids NumPy's
    RuntimeWarnings. `merge_all_info` floors the value at `min_opp_quality`.

    Known limitations: this ignores what the opponent cannot know (bandits
    they never pulled) and the decay of the bandits (they may have pulled a
    lot from a bandit that now has a very low probability).
    """
    global opp_quality
    # What do we independently believe about the bandits?
    indep_est = compute_est(beta_a,beta_b)

    # A constant vector has zero variance, so the correlation is undefined.
    if np.ptp(opp_pulls) == 0 or np.ptp(indep_est) == 0:
        opp_quality = 0.0
        return

    # Did they pull the right bandits?
    corr = np.corrcoef(opp_pulls,indep_est)[0,1]
    opp_quality = corr if np.isfinite(corr) else 0.0

def merge_all_info():
    """Combine our own posterior with what the opponent's pulls suggest.

    The opponent's first pull of a bandit is counted as a loss and every
    repeat pull as a win (they came back, so it presumably paid out). Both
    are weighted by how much we trust the opponent: the larger of
    `opp_quality` and `min_opp_quality`, times `opp_retry_factor`.

    Returns:
        tuple: the merged Beta parameters `(a, b)`, one entry per bandit.
    """
    # Refresh opp_quality before using it.
    compute_opp_quality()
    trust = max(min_opp_quality,opp_quality) * opp_retry_factor

    repeat_pulls = np.maximum(opp_pulls-1,0)
    first_pulls = opp_pulls - repeat_pulls

    merged_a = beta_a + repeat_pulls*trust
    merged_b = beta_b + first_pulls*trust
    return merged_a,merged_b




####### BORING


def compute_reward(observation):
    """Return the reward earned last round and remember the new total.

    `observation.reward` is cumulative, so the last reward is its increase
    since the previous call (tracked in the module-level `my_score`).
    """
    global my_score
    previous_score, my_score = my_score, observation.reward
    return my_score - previous_score

def get_actions(obs):
    """Return `(my_action, opponent_action)` from the previous round."""
    me = obs.agentIndex
    moves = obs.lastActions
    return moves[me], moves[1 - me]


def agent(observation, configuration):
    """Entry point; defined last in the file and delegates to `logic`."""
    chosen = logic(observation, configuration)
    return chosen

In [None]:
%%writefile oppon.py
import numpy as np
import pandas as pd
import random, os, datetime, math

total_reward = 0  # cumulative reward seen so far
bandit_dict = {}  # arm index -> {'win', 'loss', 'opp'} counters; rebuilt on step 0

def set_seed(my_seed=42):
    """Seed `random` and NumPy, and export the seed as PYTHONHASHSEED.

    Args:
        my_seed: seed value shared by every generator.
    """
    np.random.seed(my_seed)
    random.seed(my_seed)
    os.environ['PYTHONHASHSEED'] = str(my_seed)

def get_next_bandit():
    """Return the arm with the highest decayed heuristic score.

    The score is `(win - loss + opp - penalty) / pulls * 0.97 ** pulls` with
    `pulls = win + loss + opp`; the penalty is 1.5 for arms the opponent has
    pulled at least once and 0 otherwise. Arm 0 is returned when no score is
    strictly positive.

    Returns:
        Key of the best arm in `bandit_dict`.
    """
    chosen, top_score = 0, 0
    for arm, stats in bandit_dict.items():
        wins, losses, opp_pulls = stats['win'], stats['loss'], stats['opp']
        pulls = wins + losses + opp_pulls
        balance = wins - losses + opp_pulls - (opp_pulls > 0) * 1.5
        score = balance / pulls * math.pow(0.97, pulls)
        if score > top_score:
            chosen, top_score = arm, score
    return chosen

# Action logs appended to by `multi_armed_probabilities` on every step after
# the first; unlike `bandit_dict` they are not cleared on step 0.
my_action_list = []
op_action_list = []



def multi_armed_probabilities(observation, configuration):
    """Agent entry point: repeat winners, occasionally persist, else re-score.

    On step 0 the RNGs are seeded, the counters are rebuilt, and the random
    arm drawn at the top of the call is returned. On later steps:
      - a rewarded arm is pulled again;
      - after a miss, from step 4 on, if our last three pulls were the same
        arm, that arm is pulled once more with probability 0.5;
      - otherwise `get_next_bandit` chooses.

    Args:
        observation: mapping with 'step', 'reward' (cumulative), 'agentIndex'
            and 'lastActions'.
        configuration: mapping with 'banditCount'.

    Returns:
        Index of the arm to pull.
    """
    global total_reward, bandit_dict

    # Drawn on every call, before any seeding; only used on step 0.
    random_pull = random.randrange(configuration['banditCount'])

    if observation['step'] == 0:
        set_seed()
        total_reward = 0
        bandit_dict = {
            arm: {'win': 1, 'loss': 0, 'opp': 0}
            for arm in range(configuration['banditCount'])
        }
        return random_pull

    last_reward = observation['reward'] - total_reward
    total_reward = observation['reward']

    my_idx = observation['agentIndex']
    my_last_action = observation['lastActions'][my_idx]
    op_last_action = observation['lastActions'][1 - my_idx]
    my_action_list.append(my_last_action)
    op_action_list.append(op_last_action)

    outcome = 'win' if 0 < last_reward else 'loss'
    bandit_dict[my_last_action][outcome] += 1
    bandit_dict[op_last_action]['opp'] += 1

    if last_reward > 0:
        return my_last_action

    on_a_streak = (
        observation['step'] >= 4
        and my_action_list[-1] == my_action_list[-2]
        and my_action_list[-1] == my_action_list[-3]
    )
    # The coin is only tossed while on a streak.
    if on_a_streak and random.random() < 0.5:
        return my_action_list[-1]
    return get_next_bandit()

In [None]:
%%writefile oppon1.py
import numpy as np
import pandas as pd
import random, os, datetime, math

total_reward = 0  # cumulative reward seen so far
bandit_dict = {}  # arm index -> {'win', 'loss', 'opp'} counters; rebuilt on step 0

# Penalty subtracted in `get_next_bandit` from the score of any arm the opponent has pulled.
fac1=1.44#1.44,1.45,1.5

def set_seed(my_seed=42):
    """Make runs repeatable: seed `random` and NumPy, export PYTHONHASHSEED.

    Args:
        my_seed: seed value shared by every generator.
    """
    seed_text = str(my_seed)
    os.environ['PYTHONHASHSEED'] = seed_text
    np.random.seed(my_seed)
    random.seed(my_seed)

def get_next_bandit():
    """Return the arm with the highest decayed heuristic score.

    The score is `(win - loss + opp - penalty) / pulls * 0.98 ** pulls` with
    `pulls = win + loss + opp`; the penalty is `fac1` for arms the opponent
    has pulled at least once and 0 otherwise. Arm 0 is returned when no score
    is strictly positive.

    Returns:
        Key of the best arm in `bandit_dict`.
    """
    chosen, top_score = 0, 0
    for arm, stats in bandit_dict.items():
        wins, losses, opp_pulls = stats['win'], stats['loss'], stats['opp']
        pulls = wins + losses + opp_pulls
        balance = wins - losses + opp_pulls - (opp_pulls > 0) * fac1
        score = balance / pulls * math.pow(0.98, pulls)
        if score > top_score:
            chosen, top_score = arm, score
    return chosen

# Action logs appended to by `multi_armed_probabilities` on every step after
# the first; unlike `bandit_dict` they are not cleared on step 0.
my_action_list = []
op_action_list = []



def multi_armed_probabilities(observation, configuration):
    """Agent entry point: stay on a winning bandit, otherwise usually switch.

    Step 0 seeds the RNGs, resets the counters and returns a random bandit.
    Later steps record the last outcome in ``bandit_dict`` and then:
      * after a win, pull the same bandit again;
      * after a loss on a bandit pulled three times in a row (step >= 4),
        repeat it with probability 0.54;
      * otherwise pull ``get_next_bandit()``.
    """
    global total_reward, bandit_dict

    # Drawn on every call (this keeps the RNG stream unchanged), although only
    # step 0 actually returns it.
    my_pull = random.randrange(configuration['banditCount'])
    step = observation['step']

    if step == 0:
        set_seed()
        total_reward = 0
        bandit_dict = {
            arm: {'win': 1, 'loss': 0, 'opp': 0}
            for arm in range(configuration['banditCount'])
        }
        return my_pull

    # The observation carries the cumulative reward; diff it to get the last pull's reward.
    last_reward = observation['reward'] - total_reward
    total_reward = observation['reward']

    my_idx = observation['agentIndex']
    mine = observation['lastActions'][my_idx]
    theirs = observation['lastActions'][1 - my_idx]
    my_action_list.append(mine)
    op_action_list.append(theirs)

    won = last_reward > 0
    bandit_dict[mine]['win' if won else 'loss'] += 1
    bandit_dict[theirs]['opp'] += 1

    if won:
        return mine

    three_in_a_row = (
        step >= 4
        and my_action_list[-1] == my_action_list[-2]
        and my_action_list[-1] == my_action_list[-3]
    )
    # other probabilities tried: 0.54, 0.53, 0.5
    if three_in_a_row and random.random() < 0.54:
        return my_action_list[-1]
    return get_next_bandit()

In [None]:
%%writefile pull1.py
import numpy as np
import pandas as pd
import random, os, datetime, math
from collections import defaultdict

# Cumulative reward reported at the previous step; used to recover the last pull's reward.
total_reward = 0
# Per-bandit counters keyed by bandit index
# ('win', 'loss', 'opp', 'my_continue', 'op_continue'); rebuilt at step 0.
bandit_dict = {}

def set_seed(my_seed=42):
    """Seed the global ``random`` and ``numpy.random`` generators with ``my_seed``.

    The seed is also written to the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    setting it here probably only affects child processes -- confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(my_seed)
    for seeder in (random.seed, np.random.seed):
        seeder(my_seed)

def get_next_bandit():
    """Return the key of the bandit in ``bandit_dict`` with the highest score.

    score = (win - loss + opp - 1.5 * [opp > 0] + op_continue) / pulls * 0.97 ** pulls,
    where pulls = win + loss + opp. Only strictly positive scores can win;
    when no bandit has one, bandit 0 is returned. Ties keep the earlier key.
    """
    best_bandit, best_bandit_expected = 0, 0
    for bnd, stats in bandit_dict.items():
        pulls = stats['win'] + stats['loss'] + stats['opp']
        # Opponent pulls and the opponent's repeat counter raise the score; a flat
        # 1.5 is deducted once the opponent has touched the arm at all.
        signal = (stats['win'] - stats['loss'] + stats['opp']
                  - (stats['opp'] > 0) * 1.5 + stats['op_continue'])
        expect = signal / pulls * math.pow(0.97, pulls)
        if expect > best_bandit_expected:
            best_bandit, best_bandit_expected = bnd, expect
    return best_bandit

# Histories of our own and the opponent's actions; one entry is appended on every
# step after step 0.
# NOTE(review): unlike bandit_dict these lists are not cleared at step 0.
my_action_list = []
op_action_list = []

# NOTE(review): not referenced anywhere else in this cell; looks like a leftover.
op_continue_cnt_dict = defaultdict(int)

def multi_armed_probabilities(observation, configuration):
    """Agent entry point: stay on a winning bandit, otherwise usually switch.

    Step 0 seeds the RNGs, resets the counters and returns a random bandit.
    Later steps record the last outcome in ``bandit_dict`` (including the
    'my_continue' / 'op_continue' repeat counters from step 3 on) and then:
      * after a win, pull the same bandit again;
      * after a loss on a bandit pulled three times in a row (step >= 4),
        repeat it with probability 0.5;
      * otherwise pull ``get_next_bandit()``.
    """
    global total_reward, bandit_dict

    # Drawn on every call (this keeps the RNG stream unchanged), although only
    # step 0 actually returns it.
    my_pull = random.randrange(configuration['banditCount'])
    step = observation['step']

    if step == 0:
        set_seed()
        total_reward = 0
        bandit_dict = {
            arm: {'win': 1, 'loss': 0, 'opp': 0, 'my_continue': 0, 'op_continue': 0}
            for arm in range(configuration['banditCount'])
        }
        return my_pull

    # The observation carries the cumulative reward; diff it to get the last pull's reward.
    last_reward = observation['reward'] - total_reward
    total_reward = observation['reward']

    my_idx = observation['agentIndex']
    mine = observation['lastActions'][my_idx]
    theirs = observation['lastActions'][1 - my_idx]
    my_action_list.append(mine)
    op_action_list.append(theirs)

    won = last_reward > 0
    bandit_dict[mine]['win' if won else 'loss'] += 1
    bandit_dict[theirs]['opp'] += 1

    if step >= 3:
        # A repeated action bumps the counter on the arm just pulled; a change of
        # arm zeroes the counter on the newly chosen arm.
        for history, key in ((my_action_list, 'my_continue'), (op_action_list, 'op_continue')):
            repeated = history[-1] == history[-2]
            stats = bandit_dict[history[-1]]
            stats[key] = stats[key] + 1 if repeated else 0

    if won:
        return mine

    three_in_a_row = (
        step >= 4
        and my_action_list[-1] == my_action_list[-2]
        and my_action_list[-1] == my_action_list[-3]
    )
    if three_in_a_row and random.random() < 0.5:
        return my_action_list[-1]
    return get_next_bandit()

In [None]:
%%writefile pull2.py
import numpy as np
import pandas as pd
import random, os, datetime, math
from collections import defaultdict

# Cumulative reward reported at the previous step; used to recover the last pull's reward.
total_reward = 0
# Per-bandit counters keyed by bandit index
# ('win', 'loss', 'opp', 'my_continue', 'op_continue'); rebuilt at step 0.
bandit_dict = {}

def set_seed(my_seed=40):
    """Seed the global ``random`` and ``numpy.random`` generators with ``my_seed``.

    The seed is also written to the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    setting it here probably only affects child processes -- confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(my_seed)
    for seeder in (random.seed, np.random.seed):
        seeder(my_seed)

def get_next_bandit():
    """Return the key of the bandit in ``bandit_dict`` with the highest score.

    score = (win - loss + opp - 1.5 * [opp > 0] + op_continue) / pulls * 0.97 ** pulls,
    where pulls = win + loss + opp. Only strictly positive scores can win;
    when no bandit has one, bandit 0 is returned. Ties keep the earlier key.
    """
    best_bandit, best_bandit_expected = 0, 0
    for bnd, stats in bandit_dict.items():
        pulls = stats['win'] + stats['loss'] + stats['opp']
        # Opponent pulls and the opponent's repeat counter raise the score; a flat
        # 1.5 is deducted once the opponent has touched the arm at all.
        signal = (stats['win'] - stats['loss'] + stats['opp']
                  - (stats['opp'] > 0) * 1.5 + stats['op_continue'])
        expect = signal / pulls * math.pow(0.97, pulls)
        if expect > best_bandit_expected:
            best_bandit, best_bandit_expected = bnd, expect
    return best_bandit

# Histories of our own and the opponent's actions; one entry is appended on every
# step after step 0.
# NOTE(review): unlike bandit_dict these lists are not cleared at step 0.
my_action_list = []
op_action_list = []

# NOTE(review): not referenced anywhere else in this cell; looks like a leftover.
op_continue_cnt_dict = defaultdict(int)

def multi_armed_probabilities(observation, configuration):
    """Agent entry point: stay on a winning bandit, otherwise usually switch.

    Step 0 seeds the RNGs, resets the counters and returns a random bandit.
    Later steps record the last outcome in ``bandit_dict`` (including the
    'my_continue' / 'op_continue' repeat counters from step 3 on) and then:
      * after a win, pull the same bandit again;
      * after a loss on a bandit pulled three times in a row (step >= 4),
        repeat it with probability 0.45;
      * otherwise pull ``get_next_bandit()``.
    """
    global total_reward, bandit_dict

    # Drawn on every call (this keeps the RNG stream unchanged), although only
    # step 0 actually returns it.
    my_pull = random.randrange(configuration['banditCount'])
    step = observation['step']

    if step == 0:
        set_seed()
        total_reward = 0
        bandit_dict = {
            arm: {'win': 1, 'loss': 0, 'opp': 0, 'my_continue': 0, 'op_continue': 0}
            for arm in range(configuration['banditCount'])
        }
        return my_pull

    # The observation carries the cumulative reward; diff it to get the last pull's reward.
    last_reward = observation['reward'] - total_reward
    total_reward = observation['reward']

    my_idx = observation['agentIndex']
    mine = observation['lastActions'][my_idx]
    theirs = observation['lastActions'][1 - my_idx]
    my_action_list.append(mine)
    op_action_list.append(theirs)

    won = last_reward > 0
    bandit_dict[mine]['win' if won else 'loss'] += 1
    bandit_dict[theirs]['opp'] += 1

    if step >= 3:  # threshold tried: 3
        # A repeated action bumps the counter on the arm just pulled; a change of
        # arm zeroes the counter on the newly chosen arm.
        for history, key in ((my_action_list, 'my_continue'), (op_action_list, 'op_continue')):
            repeated = history[-1] == history[-2]
            stats = bandit_dict[history[-1]]
            stats[key] = stats[key] + 1 if repeated else 0

    if won:
        return mine

    three_in_a_row = (
        step >= 4  # threshold tried: 4
        and my_action_list[-1] == my_action_list[-2]
        and my_action_list[-1] == my_action_list[-3]
    )
    # other probability tried: 0.5
    if three_in_a_row and random.random() < 0.45:
        return my_action_list[-1]
    return get_next_bandit()

In [None]:
%%writefile out2.py
import random
import numpy as np

# Scales the random warm-up: the agent plays int(BANDITS * EXPLORE_STEPS / 3)
# randomly chosen bandits before the main algorithm takes over.
EXPLORE_STEPS = 23 # count of repeats of random selection before start of main algorithm
# Passed as `best_of` to get_sample_probs(): how many of the most probable
# thresholds are sampled from for each bandit.
FIRST_SELECTION = 2 

# Initial softmax temperature and the factor it is multiplied by on every
# get_sample_softmax() call.
START_TAU = 30.92623082874138 
TAU_MULT = 0.9743893312107584


# Length of the decay table below.
ROUNDS = 2000


# c_arr[k] is 0.97 multiplied k times; get_floor_x() indexes it by a bandit's pull count.
c_arr = np.empty(ROUNDS) # array of coefs 1, 0.97, 0.97^2, ...
c_arr[0] = 1
for i in range(1, c_arr.size):
    c_arr[i] = c_arr[i-1]*0.97

# Candidate threshold values 0, 1, ..., 100 (101 grid points).
x_arr = np.linspace(0, 100, 101) # net of predicted thresholds


def fast_choice(options, probs):
    """Sample one element of ``options`` according to the weights ``probs``.

    A single uniform draw from ``random.random()`` is compared with the running
    sum of ``probs``; the option at the first position whose running sum exceeds
    the draw is returned. If no running sum exceeds it, ``options[-1]`` is returned.
    """
    draw = random.random()
    running = 0
    position = 0
    for weight in probs:
        running += weight
        if draw < running:
            return options[position]
        position += 1
    return options[-1]

def probsnorm(x):
    """Return ``x`` divided by its own ``x.sum()`` so that the entries sum to one."""
    total = x.sum()
    return x / total

def softmax(x, tau):
    """Softmax of ``x / tau``.

    The maximum is subtracted before exponentiating, which leaves the result
    unchanged mathematically and keeps ``np.exp`` from overflowing.
    """
    scaled = x / tau
    weights = np.exp(scaled - scaled.max())
    return weights / weights.sum()




# The rest of this cell sits inside an always-true block.
# NOTE(review): ROUNDS, c_arr and x_arr below repeat the module-level definitions
# that precede this block with identical values.
if True:   


    # Length of the decay table below.
    ROUNDS = 2000


    # c_arr[k] is 0.97 multiplied k times; get_floor_x() indexes it by a bandit's pull count.
    c_arr = np.empty(ROUNDS) # array of coefs 1, 0.97, 0.97^2, ...
    c_arr[0] = 1
    for i in range(1, c_arr.size):
        c_arr[i] = c_arr[i-1]*0.97

    # Candidate threshold values 0, 1, ..., 100 (101 grid points).
    x_arr = np.linspace(0, 100, 101) # net of predicted thresholds
    # Current softmax temperature; get_sample_softmax() multiplies it by TAU_MULT on every call.
    tau = START_TAU
 
    #@profile
    def get_sample_probs(array, probs, best_of):
        """Sample one entry of ``array`` among the ``best_of`` most probable positions.

        ``probs`` is normalised, the ``best_of`` positions with the largest
        weight are kept, and ``fast_choice`` picks one of them with
        probability proportional to its (renormalised) weight.
        """
        weights = probs / probs.sum()
        top = np.argsort(weights)[-best_of:]  # positions of the largest weights
        top_weights = weights[top]
        chosen = fast_choice(top, top_weights / top_weights.sum())
        return array[chosen]
    
    def get_sample_softmax(array, probs):
        """Anneal the global temperature, then sample from ``array`` by softmax weight.

        Each call first multiplies the module-level ``tau`` by ``TAU_MULT`` and
        then draws one element of ``array`` with probabilities
        ``softmax(probs, tau)``.
        """
        global tau
        tau = tau * TAU_MULT
        return fast_choice(array, softmax(probs, tau))
    

    # Memo for get_floor_x(): pull count -> floored, decayed threshold grid.
    cached_x = {}
    def get_floor_x(c):
        """Return ``np.floor(x_arr * c_arr[c])``, computed once per distinct ``c``.

        The same array object is handed back on every later call with that ``c``.
        """
        floored = cached_x.get(c)
        if floored is None:
            floored = np.floor(x_arr * c_arr[c])
            cached_x[c] = floored
        return floored



    # Import-time defaults for the agent state; pasa_agent() rebuilds all of them
    # at step 0 from configuration.banditCount.
    BANDITS = 100 # count of bandits

    # bandits_counts[b]: pulls recorded for bandit b; also the index into c_arr.
    bandits_counts = np.zeros(BANDITS, dtype = np.int16) # choices count for each bandit

    # probs[b, k]: unnormalised weight that bandit b started at threshold x_arr[k].
    probs = np.ones((BANDITS, x_arr.size)) # matrix bandit*threshold probs

    bandits_indexes = np.arange(BANDITS)

    # Random warm-up schedule of int(BANDITS*EXPLORE_STEPS/3) pulls, drawn with replacement.
    start_bandits = np.random.choice(bandits_indexes, int(BANDITS*EXPLORE_STEPS/3), replace = True) # just start random sequence of bandits selection before start of main algorithm



    # The action this agent returned on its previous call.
    my_last_action = 0
    #@profile
    def update_counts(act1, act2, my_reward):
        """Fold the outcome of the previous step into ``probs`` and ``bandits_counts``.

        Args:
            act1, act2: the two agents' last actions, in either order; one of
                them is expected to equal the module-level ``my_last_action``.
            my_reward: reward earned by our last pull (1 on a win).

        The threshold weights of our arm are multiplied by the success
        likelihood ``floor(x * decay) / 100`` on a win, or by its complement
        otherwise. Then one pull is recorded for our arm and one for the
        opponent's arm.
        """
        # The opponent's arm is whichever reported action is not ours. If both
        # equal ours, the opponent pulled the same arm.
        # (Fix: the old code indexed the action pair with a boolean and always
        # got our own action back, so the opponent's pull was never counted.)
        others = [act for act in (act1, act2) if act != my_last_action]
        opp = others[0] if len(others) > 0 else my_last_action

        # Success probability of our arm under each candidate starting threshold,
        # given how many pulls the arm had absorbed before this one.
        mlt = get_floor_x(bandits_counts[my_last_action])/100

        if my_reward == 1:
            probs[my_last_action, :] *= mlt
        else:
            probs[my_last_action, :] *= 1 - mlt

        bandits_counts[my_last_action] += 1
        bandits_counts[opp] += 1
    #@profile
    def get_best_action():
        """Choose the next bandit from sampled per-bandit payout estimates.

        For every bandit one decayed threshold value is sampled with
        ``get_sample_probs`` (restricted to the ``FIRST_SELECTION`` most
        probable thresholds); ``get_sample_softmax`` then draws a bandit index
        using those values as scores.
        """
        estimates = []
        for bandit, pulls in enumerate(bandits_counts):
            decayed_grid = get_floor_x(pulls)
            estimates.append(get_sample_probs(decayed_grid, probs[bandit, :], FIRST_SELECTION))
        likeh = np.array(estimates)

        return get_sample_softmax(bandits_indexes, likeh)



    # Cumulative reward seen on the previous call; diffed to get the last pull's reward.
    last_reward = 0
    #@profile
    def pasa_agent(observation, configuration):
        """Agent entry point: random warm-up first, then softmax over sampled estimates.

        Step 0 rebuilds all per-episode state for ``configuration.banditCount``
        bandits and plays the first entry of a fresh random schedule. The
        softmax temperature ``tau`` is now reset as well; it used to be the one
        global that kept its value from a previous episode. Every later step
        first feeds the previous outcome to ``update_counts``. It then follows
        the schedule while ``observation.step`` is inside it, and calls
        ``get_best_action()`` afterwards.

        Returns:
            int: index of the bandit to pull.
        """
        global BANDITS, start_bandits, bandits_counts, probs, last_reward, bandits_indexes, my_last_action, tau

        if observation.step == 0:
            BANDITS = configuration.banditCount
            bandits_indexes = np.arange(BANDITS, dtype = np.int16)
            start_bandits = np.random.choice(bandits_indexes, int(BANDITS*EXPLORE_STEPS/3), replace = True)
            bandits_counts = np.zeros(BANDITS, dtype = np.int16)
            probs = np.ones((BANDITS, x_arr.size))
            # Restart the annealing schedule together with the rest of the state.
            tau = START_TAU

            my_last_action = start_bandits[0]
        else:
            # observation.reward is cumulative, so the difference is the last pull's reward.
            update_counts(int(observation.lastActions[0]), int(observation.lastActions[1]), observation.reward - last_reward)

            if observation.step < start_bandits.size:
                my_last_action = start_bandits[observation.step]
            else:
                my_last_action = get_best_action()

        last_reward = observation.reward
        # Cast to a plain Python int before storing and returning.
        my_last_action = int(my_last_action)

        return my_last_action

## Multi-round comparison (15 rounds by default)

In [None]:
def print_rounds(file1, file2, N=15):
    """Play ``N`` "mab" games of ``file1`` against ``file2`` and print the results.

    One ``Round i: <score1> - <score2>`` line is printed per game, followed by
    the share of games in which ``file1`` scored strictly more than ``file2``
    (ties count as non-wins), formatted as a percentage.

    Args:
        file1, file2: the two agents, passed straight to ``env.run``.
        N: number of games to play; must be positive.

    Raises:
        ValueError: if ``N`` is not positive (previously N=0 ended in a
            ZeroDivisionError after the environment had been created).
    """
    if N <= 0:
        raise ValueError(f"N must be a positive number of rounds, got {N}")

    env = make("mab", debug=True)
    wins = 0

    for i in range(N):
        env.run([file1, file2])
        # The last step holds each agent's final cumulative reward.
        final_step = env.steps[-1]
        p1_score = final_step[0]['reward']
        p2_score = final_step[1]['reward']
        if p1_score > p2_score:
            wins += 1
        env.reset()
        print(f"Round {i+1}: {p1_score} - {p2_score}")
    # Scale to 0-100 so the value matches its "Percentage" label.
    print(f"Percentage={100 * wins / N:.1f}%")

In [None]:
# The matchup call is commented out, so this cell only prints its heading.
print('Default vs Thompson Sampling')
#print_rounds("../input/santa-2020/submission.py", "thompson.py")

In [None]:
# The matchup call is commented out, so this cell only prints its heading.
print('Bayesian UCB vs Thompson Sampling')
#print_rounds("../input/santa-2020-ucb-and-bayesian-ucb-starter/bayesian_ucb.py", "thompson.py")

In [None]:
# Every matchup call is commented out, so this cell only prints its heading.
print('epsilon-greedy+decay vs Thompson Sampling')
#print_rounds("../input/santa-2020-epsilon-greedy-starter/epsilon_greedy_decay.py", "thompson.py")
#print_rounds("epsilon_greedy_decay.py", "thompson.py")
#print_rounds("epsilon_greedy_decay.py", "../input/santa-2020/submission.py")

In [None]:
# Every matchup call is commented out, so this cell only prints its heading.
# NOTE(review): the heading is copied from the epsilon-greedy cell, while the
# disabled calls below compare multi_armed.py -- update it before re-enabling them.
print('epsilon-greedy+decay vs Thompson Sampling')
#print_rounds("../input/santa-2020-epsilon-greedy-starter/epsilon_greedy_decay.py", "thompson.py")
#print_rounds("multi_armed.py", "thompson.py")
#print_rounds("multi_armed.py", "../input/santa-2020/submission.py")

In [None]:
# Every matchup call is commented out, so this cell only prints its heading.
# NOTE(review): the heading is copied from the epsilon-greedy cell, while the
# disabled calls below compare agent2.py and multi_armed.py -- update it before
# re-enabling them.
print('epsilon-greedy+decay vs Thompson Sampling')
#print_rounds("../input/santa-2020-epsilon-greedy-starter/epsilon_greedy_decay.py", "thompson.py")
#print_rounds("agent2.py", "thompson.py")
#print_rounds("multi_armed.py", "../input/santa-2020/submission.py")

In [None]:
# Every matchup call is commented out, so this cell only prints its heading.
# NOTE(review): the heading is copied from the epsilon-greedy cell, while the
# disabled calls below compare agent3.py, oppon1.py, oppon.py and pull1.py --
# update it before re-enabling them.
print('epsilon-greedy+decay vs Thompson Sampling')
#print_rounds("../input/santa-2020-epsilon-greedy-starter/epsilon_greedy_decay.py", "thompson.py")
#print_rounds("multi_armed.py", "agent3.py")
#print_rounds("agent3.py", "../input/santa-2020/submission.py")
#print_rounds("agent3.py", "thompson.py")
#print_rounds("oppon1.py", "oppon.py")
#print_rounds("oppon1.py", "pull1.py")