**A script evaluating several standard multi-armed bandit algorithms.**

I'm completely new to the multi-armed bandit problem, so I coded up a bunch of the standard algorithms and compared each of them against a baseline agent that picks an arm uniformly at random.

In [None]:
import numpy as np

# elf pulling at random 
class basic_elf():
    """Agent that pulls a bandit uniformly at random.

    Also the base class of the learning elves: it owns the bookkeeping they
    share (per-bandit ``win``/``loss`` pseudo-counts, the agent's index and
    the cumulative reward seen at the previous step).
    """

    def __init__(self):
        self.mu = None         # not assigned anywhere else in this class
        self.win = None        # per-bandit win pseudo-counts, set by set_pars
        self.loss = None       # per-bandit loss pseudo-counts, set by set_pars
        self.id = None         # this agent's index, set by set_pars
        self.n_bandits = None  # number of bandits, set by set_pars
        self.tot_reward = 0    # cumulative reward recorded at the previous step

    def set_pars(self, obs, conf):
        """Reset all per-episode state from the first observation of an episode.

        Args:
            obs: observation exposing ``agentIndex``.
            conf: configuration exposing ``banditCount``.
        """
        self.n_bandits = conf.banditCount
        # Every bandit starts at one win and one loss, so the estimate
        # win / (win + loss) is defined (0.5) before the first pull.
        self.win = np.ones(self.n_bandits)
        self.loss = np.ones(self.n_bandits)
        self.id = obs.agentIndex
        # Subclasses derive the last payoff as obs.reward - tot_reward. When
        # the same elf plays several episodes in a row, a stale total would
        # make the first delta of the new episode hugely negative.
        self.tot_reward = 0

    def pull(self, obs, config):
        """Return the index of a bandit chosen uniformly at random."""
        return int(np.random.choice(config.banditCount))

# elf employing epsilon greedy algorithm
class epsilon_elf(basic_elf):
    """Epsilon-greedy agent.

    A probability mass of ``epsilon`` is spread evenly over all bandits and
    the remaining ``1 - epsilon`` is added to the bandit with the highest
    estimated win rate.
    """

    def __init__(self, epsilon):
        super().__init__()
        self.epsilon = epsilon

    def pull(self, obs, conf):
        """Update the estimates from the previous step and pick a bandit.

        Args:
            obs: observation; ``step``, ``reward`` and ``lastActions`` are read.
            conf: configuration, forwarded to ``set_pars`` on step 0.

        Returns:
            int: index of the bandit to pull.
        """
        if obs.step == 0:
            self.set_pars(obs, conf)
        else:
            # obs.reward is handled as a running total: the payoff of the
            # previous pull is its increase since the last call.
            r = obs.reward - self.tot_reward
            self.tot_reward = obs.reward

            # r goes to win and 1 - r to loss, so each pull adds exactly one
            # pseudo-count to the bandit that was played.
            last_arm = obs['lastActions'][self.id]
            self.win[last_arm] += r
            self.loss[last_arm] += 1 - r

        mu = self.win / (self.win + self.loss)
        w = np.full(self.n_bandits, self.epsilon / self.n_bandits)
        w[np.argmax(mu)] += 1 - self.epsilon

        # Draw a scalar (no size argument): calling int() on a size-1 array
        # is deprecated since NumPy 1.25.
        return int(np.random.choice(self.n_bandits, p=w))
    

    
# elf employing Boltzmann algorithm
class boltzmann_elf(basic_elf):
    """Boltzmann (softmax) exploration agent.

    Bandits are sampled with probability proportional to
    ``exp(invT * mu)``, where ``mu`` is the estimated win rate and ``invT``
    the inverse temperature (larger values favour the best estimate more).
    """

    def __init__(self, invT):
        super().__init__()
        self.invT = invT

    def pull(self, obs, conf):
        """Update the estimates from the previous step and sample a bandit.

        Args:
            obs: observation; ``step``, ``reward`` and ``lastActions`` are read.
            conf: configuration, forwarded to ``set_pars`` on step 0.

        Returns:
            int: index of the bandit to pull.
        """
        if obs.step == 0:
            self.set_pars(obs, conf)
        else:
            # obs.reward is handled as a running total: the payoff of the
            # previous pull is its increase since the last call.
            r = obs.reward - self.tot_reward
            self.tot_reward = obs.reward

            last_arm = obs['lastActions'][self.id]
            self.win[last_arm] += r
            self.loss[last_arm] += 1 - r

        mu = self.win / (self.win + self.loss)
        logits = self.invT * mu
        # Subtracting the maximum leaves the normalised probabilities
        # unchanged but keeps np.exp from overflowing for large invT.
        w = np.exp(logits - logits.max())

        # Draw a scalar (no size argument): calling int() on a size-1 array
        # is deprecated since NumPy 1.25.
        return int(np.random.choice(self.n_bandits, p=w / w.sum()))
    
# elf employing pursuit algorithm    
class pursuit_elf(basic_elf):
    """Pursuit agent.

    Keeps an explicit selection distribution ``pi`` over the bandits and, on
    every step after the first, moves it towards the bandit with the highest
    estimated win rate at rate ``beta``.
    """

    def __init__(self, beta):
        super().__init__()
        self.beta = beta
        self.pi = None  # selection probabilities, initialised on step 0

    def pull(self, obs, conf):
        """Update the selection distribution and sample a bandit from it.

        Args:
            obs: observation; ``step``, ``reward`` and ``lastActions`` are read.
            conf: configuration, forwarded to ``set_pars`` on step 0.

        Returns:
            int: index of the bandit to pull.
        """
        if obs.step == 0:
            self.set_pars(obs, conf)
            # Start from the uniform distribution.
            self.pi = np.ones(self.n_bandits) / self.n_bandits
        else:
            # obs.reward is handled as a running total: the payoff of the
            # previous pull is its increase since the last call.
            r = obs.reward - self.tot_reward
            self.tot_reward = obs.reward

            last_arm = obs['lastActions'][self.id]
            self.win[last_arm] += r
            self.loss[last_arm] += 1 - r

            mu = self.win / (self.win + self.loss)
            best = int(np.argmax(mu))
            # Shrink every probability by a factor (1 - beta) and hand the
            # freed mass beta to the current best bandit; the total stays 1.
            self.pi = self.pi - self.beta * self.pi
            self.pi[best] += self.beta

        # Draw a scalar (no size argument): calling int() on a size-1 array
        # is deprecated since NumPy 1.25.
        return int(np.random.choice(self.n_bandits, p=self.pi))


    
# elf employing reinforcement comparison algorithm
class reinforcement_elf(basic_elf):
    """Reinforcement-comparison agent.

    Keeps one preference per bandit (``pi``) and a reference reward
    (``rbar``). After each pull the preference of the played bandit moves by
    ``beta * (r - rbar)`` and the reference reward moves towards ``r`` at
    rate ``alpha``. Bandits are sampled from the softmax of the preferences.
    """

    # Starting value of the reference reward at the beginning of an episode.
    _INITIAL_RBAR = 0.5

    def __init__(self, alpha, beta):
        super().__init__()
        self.alpha = alpha
        self.beta = beta
        self.rbar = self._INITIAL_RBAR
        self.pi = None  # per-bandit preferences, initialised on step 0

    def pull(self, obs, conf):
        """Update preferences from the previous step and sample a bandit.

        Args:
            obs: observation; ``step``, ``reward`` and ``lastActions`` are read.
            conf: configuration, forwarded to ``set_pars`` on step 0.

        Returns:
            int: index of the bandit to pull.
        """
        if obs.step == 0:
            self.set_pars(obs, conf)
            self.pi = (1 / self.n_bandits) * np.ones(self.n_bandits)
            # The preferences are reset for every episode, so the reference
            # reward they are compared against is reset with them instead of
            # being carried over from the previous episode.
            self.rbar = self._INITIAL_RBAR
        else:
            # obs.reward is handled as a running total: the payoff of the
            # previous pull is its increase since the last call.
            r = obs.reward - self.tot_reward
            self.tot_reward = obs.reward

            # The preference update must use the reference reward from
            # before this step, so rbar is updated second.
            self.pi[obs['lastActions'][self.id]] += self.beta * (r - self.rbar)
            self.rbar += self.alpha * (r - self.rbar)

        # Subtracting the maximum leaves the normalised probabilities
        # unchanged but keeps np.exp from overflowing for large preferences.
        w = np.exp(self.pi - self.pi.max())

        # Draw a scalar (no size argument): calling int() on a size-1 array
        # is deprecated since NumPy 1.25.
        return int(np.random.choice(self.n_bandits, p=w / w.sum()))

# elf employing UCB algorithm
class UCB_elf(basic_elf):
    """Upper-confidence-bound agent.

    Deterministically pulls the bandit that maximises
    ``mu + sqrt(2 * log(1 + step) / pulls)``, where ``mu`` is the estimated
    win rate and ``pulls`` the bandit's total pseudo-count.
    """

    def __init__(self):
        super().__init__()

    def pull(self, obs, conf):
        """Update the estimates from the previous step and pick a bandit.

        Args:
            obs: observation; ``step``, ``reward`` and ``lastActions`` are read.
            conf: configuration, forwarded to ``set_pars`` on step 0.

        Returns:
            int: index of the bandit with the highest upper confidence bound.
        """
        if obs.step == 0:
            self.set_pars(obs, conf)
        else:
            # obs.reward is handled as a running total: the payoff of the
            # previous pull is its increase since the last call.
            r = obs.reward - self.tot_reward
            self.tot_reward = obs.reward

            last_arm = obs['lastActions'][self.id]
            self.win[last_arm] += r
            self.loss[last_arm] += 1 - r

        # win and loss both start at 1, so pulls is never zero.
        pulls = self.win + self.loss
        mu = self.win / pulls
        # On step 0 the bonus is sqrt(2 * log(1) / pulls) = 0.
        bonus = np.sqrt(2 * np.log(1 + obs.step) / pulls)

        return int(np.argmax(mu + bonus))
                   
                   

In [None]:
! pip install kaggle-environments --upgrade -q
from kaggle_environments import evaluate, make, utils

# Shared "mab" environment instance; baseline() below resets and re-runs this
# same object for every episode.
env = make("mab", debug=True)

In [None]:
import matplotlib.pyplot as plt

def baseline(basic,name,agents,names,reps):
    """Play every agent against a baseline agent and plot the reward spread.

    Each entry of ``agents`` plays ``reps`` episodes against ``basic`` on the
    module-level ``env``. The minimum, mean and maximum of the rewards
    reported for both players are collected per agent and drawn as error
    bars.

    Args:
        basic: baseline agent callable, passed to ``env.run`` as first player.
        name: label of the baseline. Currently unused (the legend hard-codes
            'Random'); kept so existing calls keep working.
        agents: agent callables to evaluate, each passed as second player.
        names: tick labels, one per entry of ``agents``.
        reps: number of episodes played per agent (at least 1).

    Returns:
        tuple: ``(mins, maxes, means)``, each of shape ``(len(names), 2)``.
        Column 0 holds ``rewards[0]`` (plotted as 'Random'), column 1 holds
        ``rewards[1]`` (plotted as 'Algorithm').

    Raises:
        ValueError: if ``agents`` and ``names`` differ in length, or if
            ``reps`` is smaller than 1.
    """
    # Continuing with mismatched lengths would either index past the result
    # arrays or plot all-zero rows under the wrong labels.
    if len(agents) != len(names):
        raise ValueError('Number of agents do not correspond to number of names.')
    if reps < 1:
        raise ValueError('reps must be at least 1.')

    n_agents = len(names)
    means = np.zeros([n_agents, 2])
    mins = np.zeros([n_agents, 2])
    maxes = np.zeros([n_agents, 2])

    for i, agenti in enumerate(agents):
        results = np.zeros([reps, 2])
        for j in range(reps):
            env.reset()
            env.run([basic, agenti])
            # Named `episode` rather than `json` so the stdlib module name
            # is not shadowed.
            episode = env.toJSON()
            rewards = episode['rewards']
            results[j, 0] = rewards[0]
            results[j, 1] = rewards[1]
        means[i] = np.mean(results, axis=0)
        mins[i] = np.min(results, axis=0)
        maxes[i] = np.max(results, axis=0)

    # errorbar expects distances from the mean: row 0 below, row 1 above.
    plus = maxes - means
    minus = means - mins
    x = np.arange(n_agents)

    for col, label in enumerate(('Random', 'Algorithm')):
        bars = np.vstack((minus[:, col], plus[:, col]))
        plt.errorbar(x, means[:, col], bars, marker='o', ls='', label=label)

    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.xticks(x, names, rotation='vertical')
    plt.title(f'Random selection vs each algorithm over {reps} runs')
    plt.ylabel('Min/Mean/Max Reward')

    plt.show()

    return mins, maxes, means

In [None]:
# Baseline opponent. NOTE: this name shadows the stdlib module `random` if
# that module is imported in the same session.
random = basic_elf()

# Candidate elves; the numeric suffix encodes the hyper-parameters.
epsilon001 = epsilon_elf(0.01)
epsilon01 = epsilon_elf(0.1)
epsilon02 = epsilon_elf(0.2)
boltzmann01 = boltzmann_elf(.1)
boltzmann1 = boltzmann_elf(1)
boltzmann2 = boltzmann_elf(2)
boltzmann10 = boltzmann_elf(10)
pursuit001 = pursuit_elf(0.01)
pursuit01 = pursuit_elf(0.1)
pursuit02 = pursuit_elf(0.2)
# Only reinforcement001001 is evaluated below; the other reinforcement
# variants are instantiated but not part of the comparison.
reinforcement001001 = reinforcement_elf(0.01,0.01)
reinforcement01001 = reinforcement_elf(0.1,0.01)
reinforcement00101 = reinforcement_elf(0.01,0.1)
reinforcement0101 = reinforcement_elf(0.1,0.1)
reinforcement0201 = reinforcement_elf(0.2,0.1)
reinforcement0102 = reinforcement_elf(0.1,0.2)
reinforcement0202 = reinforcement_elf(0.2,0.2)
UCB = UCB_elf()

# Each label sits next to the elf it describes, so a label cannot drift away
# from the hyper-parameters actually used.
contenders = [
    ('greedy, (e = 0.01)', epsilon001),
    ('greedy, (e = 0.1)', epsilon01),
    ('greedy, (e = 0.2)', epsilon02),
    ('boltzmann, (invT = 0.1)', boltzmann01),
    ('boltzmann, (invT = 1)', boltzmann1),
    ('boltzmann, (invT = 2)', boltzmann2),
    ('boltzmann, (invT = 10)', boltzmann10),
    ('pursuit, (beta = 0.01)', pursuit001),
    ('pursuit, (beta = 0.1)', pursuit01),
    ('pursuit, (beta = 0.2)', pursuit02),
    # This elf is built with alpha = beta = 0.01, so the label says 0.01.
    ('reinforcement, (a=b=0.01)', reinforcement001001),
    ('UCB', UCB),
]
agents = [elf.pull for _, elf in contenders]
names = [label for label, _ in contenders]


mins,maxes,means = baseline(random.pull,'random',agents,names,20)