# [Santa 2020 - The Candy Cane Contest:](https://www.kaggle.com/c/santa-2020/) Epsilon-Greedy Solution

[This competition](https://www.kaggle.com/c/santa-2020/) is a twist on the [multi-armed bandit problem](https://en.wikipedia.org/wiki/Multi-armed_bandit) where we compete with another agent to get the most candy canes from 100 vending machines, each of which produces a candy cane according to some unknown distribution. To add icing to the cake, a vending machine's likelihood of producing a candy cane is reduced every time an elf (agent) tests that machine.

# Epsilon-Greedy ($\epsilon$-greedy)

There are some interesting strategies for this problem, but here we'll try the simple epsilon-greedy solution: a fraction $\epsilon$ of the time we try out a randomly chosen vending machine (explore), and the rest of the time we pick the best machine tried so far (exploit).

## Setup

In [None]:
!pip install kaggle-environments --upgrade

## Agent

In [None]:
%%writefile agent-epsilon_greedy.py

import random
# Seed the module-level RNG with a fixed value so the agent's random draws
# repeat from one run of this file to the next.
random.seed(72)

class Agent:
    """Mutable state for the epsilon-greedy bandit agent.

    One instance is shared across calls of the module-level ``agent``
    callback, which reads and updates these attributes on every step.

    Args:
        eps: Initial exploration probability, i.e. the chance that a step
            pulls a randomly chosen machine.
        eps_decay: Factor ``epsilon`` is multiplied by on each
            ``decay_eps`` call; ``1.0`` keeps ``epsilon`` constant.
    """

    def __init__(self, eps=7e-1, eps_decay=0.99999):
        # Machine currently believed to be best; None until a reward is seen.
        self.best_machine = None
        # Cumulative reward already attributed to individual machines.
        self.total_reward = 0
        # Machine pulled on the previous step; None before the first pull.
        self.bandit_chosen = None

        # Per-machine bookkeeping. The machine count is only known once the
        # environment configuration is available, so these start as None.
        self.rewards = None
        self.selections = None
        self.possible_machines = None

        self.epsilon = eps
        self.decay = eps_decay

    def decay_eps(self):
        """Apply one multiplicative decay step to ``epsilon``."""
        self.epsilon = self.epsilon * self.decay
        

# Shared state object that `agent` reads and updates on every call; starts
# with a 0.9 exploration rate instead of the class default.
myagent = Agent(eps=9e-1)

def agent(observation, configuration):
    """Choose the next vending machine with an epsilon-greedy policy.

    All state lives on the module-level ``myagent`` object so that it
    survives between calls.

    Args:
        observation: Step data; ``observation.step`` and the cumulative
            ``observation.reward`` are read.
        configuration: Environment settings; ``configuration.banditCount``
            gives the number of machines.

    Returns:
        Index of the machine to pull on this step.
    """

    def get_reward_score(bandit):
        """Return the average reward per pull of ``bandit`` (0 if untried)."""
        if bandit is None or not myagent.selections[bandit]:
            return 0
        return myagent.rewards[bandit] / myagent.selections[bandit]

    if observation.step == 0:
        # Start of an episode: rebuild the per-machine tallies and drop
        # anything left over from an earlier episode. Without this a reused
        # `myagent` would keep a stale best machine whose pull count is zero.
        # (epsilon is left as-is; its initial value is not stored.)
        machine_count = configuration.banditCount
        myagent.selections = [0] * machine_count
        myagent.possible_machines = list(range(machine_count))
        myagent.rewards = [0] * machine_count
        myagent.best_machine = None
        myagent.bandit_chosen = None
        myagent.total_reward = 0
    elif myagent.bandit_chosen is not None:
        # The observation carries the running total, so the reward earned by
        # the previous pull is the difference to what has been booked so far.
        last_reward = observation.reward - myagent.total_reward
        myagent.rewards[myagent.bandit_chosen] += last_reward
        myagent.total_reward += last_reward

        # Re-rank every machine tried so far instead of only comparing the
        # last pull with the current best: the best machine's average can
        # fall below that of a machine explored earlier. On ties the machine
        # just pulled wins, as in the original `>=` comparison.
        tried = [m for m in myagent.possible_machines if myagent.selections[m]]
        myagent.best_machine = max(
            tried,
            key=lambda m: (get_reward_score(m), m == myagent.bandit_chosen),
        )

    # Draw unconditionally so the RNG sequence does not depend on the branch.
    beta = random.random()
    # Explore with probability epsilon, and always while nothing is known yet.
    explore = (beta < myagent.epsilon) or (myagent.best_machine is None)

    if explore:
        myagent.bandit_chosen = random.choice(myagent.possible_machines)
    else:
        myagent.bandit_chosen = myagent.best_machine

    # Record the pull now; its reward is booked on the next call.
    myagent.selections[myagent.bandit_chosen] += 1

    # Decay (if any)
    myagent.decay_eps()

    return myagent.bandit_chosen

## Evaluation

In [None]:
from kaggle_environments import make

In [None]:
# Create the "mab" (multi-armed bandit) environment with debug=True.
env = make("mab", debug=True)

# Play the agent file written above against a second copy of itself,
# then render the finished episode in the notebook.
env.run(["agent-epsilon_greedy.py", "agent-epsilon_greedy.py"])
env.render(mode="ipython", width=800, height=300)

# Try Different Exploring Options

Here we're including the visualizer I made (see this notebook: https://www.kaggle.com/mrgeislinger/visualizing-reward-outcomes) 

In [None]:
# import module we'll need to import our custom module
from shutil import copyfile

# Copy the visualizer module from the attached notebook's input directory
# into the working directory so it can be imported below (the destination
# name must keep the .py suffix).
copyfile(src="../input/visualizing-reward-outcomes/SimulationExplorer.py", 
         dst= "../working/SimulationExplorer.py")

# import all our functions
import SimulationExplorer as Explorer

In [None]:
import random
# Seed the module-level RNG with a fixed value so the random draws in the
# experiments below repeat between runs.
random.seed(27)

class Agent:
    """Mutable state for the epsilon-greedy bandit agent.

    One instance is shared across calls of the module-level ``agent``
    callback, which reads and updates these attributes on every step.

    Args:
        eps: Initial exploration probability, i.e. the chance that a step
            pulls a randomly chosen machine.
        eps_decay: Factor ``epsilon`` is multiplied by on each
            ``decay_eps`` call; the default ``1.0`` keeps it constant.
    """

    def __init__(self, eps=5e-1, eps_decay=1.0):
        # Machine currently believed to be best; None until a reward is seen.
        self.best_machine = None
        # Cumulative reward already attributed to individual machines.
        self.total_reward = 0
        # Machine pulled on the previous step; None before the first pull.
        self.bandit_chosen = None

        # Per-machine bookkeeping. The machine count is only known once the
        # environment configuration is available, so these start as None.
        self.rewards = None
        self.selections = None
        self.possible_machines = None

        self.epsilon = eps
        self.decay = eps_decay

    def decay_eps(self):
        """Apply one multiplicative decay step to ``epsilon``."""
        self.epsilon = self.epsilon * self.decay

In [None]:
def agent(observation, configuration):
    """Choose the next vending machine with an epsilon-greedy policy.

    All state lives on the module-level ``myagent`` object so that it
    survives between calls. The current epsilon is printed at step 0.

    Args:
        observation: Step data; ``observation.step`` and the cumulative
            ``observation.reward`` are read.
        configuration: Environment settings; ``configuration.banditCount``
            gives the number of machines.

    Returns:
        Index of the machine to pull on this step.
    """

    def get_reward_score(bandit):
        """Return the average reward per pull of ``bandit`` (0 if untried)."""
        if bandit is None or not myagent.selections[bandit]:
            return 0
        return myagent.rewards[bandit] / myagent.selections[bandit]

    if observation.step == 0:
        print(f'Eps: {myagent.epsilon}')
        # Start of an episode: rebuild the per-machine tallies and drop
        # anything left over from an earlier episode. Without this a reused
        # `myagent` would keep a stale best machine whose pull count is zero.
        # (epsilon is left as-is; its initial value is not stored.)
        machine_count = configuration.banditCount
        myagent.selections = [0] * machine_count
        myagent.possible_machines = list(range(machine_count))
        myagent.rewards = [0] * machine_count
        myagent.best_machine = None
        myagent.bandit_chosen = None
        myagent.total_reward = 0
    elif myagent.bandit_chosen is not None:
        # The observation carries the running total, so the reward earned by
        # the previous pull is the difference to what has been booked so far.
        last_reward = observation.reward - myagent.total_reward
        myagent.rewards[myagent.bandit_chosen] += last_reward
        myagent.total_reward += last_reward

        # Re-rank every machine tried so far instead of only comparing the
        # last pull with the current best: the best machine's average can
        # fall below that of a machine explored earlier. On ties the machine
        # just pulled wins, as in the original `>=` comparison.
        tried = [m for m in myagent.possible_machines if myagent.selections[m]]
        myagent.best_machine = max(
            tried,
            key=lambda m: (get_reward_score(m), m == myagent.bandit_chosen),
        )

    # Draw unconditionally so the RNG sequence does not depend on the branch.
    beta = random.random()
    # Explore with probability epsilon, and always while nothing is known yet.
    explore = (beta < myagent.epsilon) or (myagent.best_machine is None)

    if explore:
        myagent.bandit_chosen = random.choice(myagent.possible_machines)
    else:
        myagent.bandit_chosen = myagent.best_machine

    # Record the pull now; its reward is booked on the next call.
    myagent.selections[myagent.bandit_chosen] += 1

    # Decay (if any)
    myagent.decay_eps()

    return myagent.bandit_chosen

In [None]:
def rand_agent(obs,conf):
    """Baseline opponent that pulls a uniformly random machine.

    Args:
        obs: Step observation (unused).
        conf: Environment settings; ``conf.banditCount`` is the number of
            machines to choose from.

    Returns:
        A machine index in ``range(conf.banditCount)`` as a plain ``int``.
    """
    import numpy as np

    machine_ids = np.arange(conf.banditCount)
    pick = np.random.choice(machine_ids)
    # Convert the NumPy integer to a built-in int before returning it.
    return int(pick)

In [None]:
# Candidate exploration rates (epsilon values) for the experiments below.
all_eps = [1e-1, 5e-1, 7e-1, 9e-1, 9.5e-1]

In [None]:
import time


# Finished environments, keyed by a descriptive run name.
sims = {}
# The [1:] slice skips the first entry of all_eps (1e-1).
for e in all_eps[1:]:

    # Two trials per epsilon: trial 0 keeps epsilon constant (decay 1.0),
    # trial 1 decays it by 0.99999 per step.
    for trial in range(2):
        start_time = time.time()
        decay = 0.99999 if trial % 2 else 1.0
        # `agent` reads this module-level object, so rebind it to a fresh
        # Agent before every run.
        myagent = Agent(e,decay)
        env = make("mab", debug=True)
        env.run([agent, rand_agent])

        # Store the finished environment under a name that encodes the
        # epsilon, the decay and the trial number.
        name = f'egreed_{e}d{decay} v rand: #{trial}'
        sims[name] = env
        # Elapsed wall-clock seconds for this trial.
        print(f'\t{time.time()-start_time}')
    
# Hand all runs to the visualizer; presumably plots each run's total reward
# (SimulationExplorer is not shown here - confirm against that module).
test = Explorer.SimViz(sims)
test.plot_total_reward()