In [None]:
!pip install kaggle-environments --upgrade
from kaggle_environments import make
import numpy as np
import tqdm as tqdm

In [None]:
%%writefile submission.py

#Simple agent that tries to estimate the a priori probability of success and applies decay

import random
import numpy as np
import math

#Agent state shared between calls; agent() (re)creates all four arrays when observation.step == 0
baseprobs = None  #prior payout probability, one entry per machine
modifiers = None  #decay multiplier, one row per machine and one column per step
success = None    #reward credited to each machine from this agent's own pulls
tries = None      #1 where this agent pulled a machine at a step, one row per machine and one column per step

#Function to update the a priori probability of candy per machine
def update_probs(probs,mods,trs,suc,nsamples=3):
    """Blend prior payout probabilities with estimates derived from observed rewards.

    Parameters
    ----------
    probs : 1-D array, one entry per machine
        Prior probability, used while a machine has been pulled fewer than
        ``nsamples`` times.
    mods : 2-D array, one row per machine and one column per past step
        Decay modifier in force for each machine at each step.
    trs : 2-D array, same shape as ``mods``
        1 where this agent pulled the machine at that step, 0 elsewhere.
    suc : 1-D array, one entry per machine
        Reward this agent has collected from each machine.
    nsamples : number, default 3
        Number of pulls at which the prior is fully replaced by the estimate.

    Returns
    -------
    1-D array, one entry per machine
        ``probs * (1 - w) + estimate * w`` with ``w = min(1, pulls / nsamples)``.
        ``probs`` itself is not modified.
    """
    #Each pull succeeds with probability p * modifier, so the expected number of
    #successes is p times the sum of the modifiers over the steps we pulled.
    modsum = np.sum(mods*trs,axis=1)
    #Use the `suc` argument (not the module-level `success`) so the function depends
    #only on its inputs. The floor on the denominator avoids 0/0 for machines that
    #were never pulled, and the estimate is capped at 1.
    est_probs = np.minimum(1.0,suc / np.maximum(0.000001,modsum))
    #Trust in the estimate grows linearly with the number of pulls and saturates at nsamples
    trsum = np.sum(trs,axis=1)
    weights = np.minimum(1.0,trsum / nsamples)
    probs = (probs * (1 - weights)) + (est_probs * weights)
    return probs

def agent(observation, configuration,lambd=10):
    """Pick the next machine to pull.

    On step 0 the module-level state (baseprobs, modifiers, success, tries) is
    created from configuration.banditCount and configuration.episodeSteps. On
    every later step the previous round is recorded first: our own pull, any
    reward it produced, and one decay per entry 0 and 1 of
    observation.lastActions.

    The machines are then ranked by estimated probability times current decay
    modifier, and a rank is drawn from an exponential distribution with scale
    1/lambd, so the top-ranked machines are chosen most often.

    Parameters
    ----------
    observation : object with step, lastActions, agentIndex and reward attributes
    configuration : object with banditCount, episodeSteps and decayRate attributes
    lambd : number, default 10
        Rate of the exponential rank draw; larger values concentrate the
        choice on the best-ranked machines.

    Returns
    -------
    int
        Index of the machine to pull.
    """
    global tries, success, baseprobs, modifiers
    step = observation.step
    n_machines = configuration.banditCount
    if step != 0:
        own_pick = observation.lastActions[observation.agentIndex]
        #Record our own pull from the previous step
        tries[own_pick,step-1] = 1
        #observation.reward is handled as a running total: whatever exceeds the
        #reward already booked is credited to the machine we just pulled
        gained = observation.reward - np.sum(success)
        success[own_pick] += gained
        #Carry the modifiers forward, then decay once per pull. Indexing one player
        #at a time means a machine picked by both players is decayed twice.
        #Only entries 0 and 1 of lastActions are read (two players assumed).
        modifiers[:,step] = modifiers[:,step-1]
        for player in (0,1):
            modifiers[observation.lastActions[player],step] *= configuration.decayRate
        #Re-estimate the undecayed probabilities from everything seen so far
        probs = update_probs(baseprobs,modifiers[:,:step],tries[:,:step],success)
    else:
        n_steps = configuration.episodeSteps
        #Every machine starts at the same prior of 0.5
        baseprobs = np.full(n_machines,0.5)
        #One column of modifiers per step; nothing has decayed yet
        modifiers = np.ones((n_machines,n_steps))
        #Reward credited per machine
        success = np.zeros((n_machines,))
        #Own pulls per machine and step
        tries = np.zeros((n_machines,n_steps))
        #No observations yet, so the estimate is just the prior
        probs = baseprobs.copy()
    #Scale by the decay in force at the current step
    probs = probs * modifiers[:,step]
    #Machines ordered from most to least promising
    ranking = np.argsort(probs)[::-1]
    #Exponentially distributed rank: mostly the leaders, occasionally the rest;
    #clamped to the last valid position
    draw = np.random.exponential(1/lambd)
    rank = min(n_machines-1,math.floor(draw*n_machines))
    return int(ranking[rank])

In [None]:
#Evaluate against the provided baseline
N_EPISODES = 50
#Player 0 is the agent written to submission.py; player 1 is the provided baseline
AGENT_FILES = ["submission.py","../input/santa-2020/submission.py"]


def _mean_or_nan(values):
    """Return the mean of `values`, or NaN when the list is empty.

    np.mean([]) also evaluates to NaN but emits RuntimeWarnings; the empty case
    occurs here whenever player 0 wins every episode or wins none of them.
    """
    return float(np.mean(values)) if values else float('nan')


env = make("mab", debug=True)
scores = []
for k in tqdm.tqdm(range(N_EPISODES)):
    env.reset()
    state = env.run(AGENT_FILES)
    #Reward of each player as reported in the last step of the episode
    final = state[-1]
    scores.append((final[0]['observation']['reward'],final[1]['observation']['reward']))

#Score margins, split by outcome; tied episodes fall in neither list
win_margins = [(s[0]-s[1]) for s in scores if (s[0]>s[1])]
loss_margins = [(s[1]-s[0]) for s in scores if (s[1]>s[0])]
print('Victories: {0:d} of {1:d}'.format(len(win_margins),len(scores)))
print('Average points: {0:.0f}'.format(_mean_or_nan([s[0] for s in scores])))
print('Average winning difference: {0:.0f}'.format(_mean_or_nan(win_margins)))
print('Average losing difference: {0:.0f}'.format(_mean_or_nan(loss_margins)))