# Santa 2020 Candy Cane - Optimized UCB

The Multi-Armed Bandit Problem is described here: https://en.wikipedia.org/wiki/Multi-armed_bandit

In this notebook we pick a bandit either greedily (highest score) or by weighted random sampling, using the UCB (Upper Confidence Bound) score:
```
score = (our_reward / our_visits) + sqrt( exploration * log(total_visits) / our_visits ) 
```

We also implement a greedy warmup phase, repeatedly pulling each bandit until we reach N losses, as well as rewarding opponent moves on the assumption they are also looking for high payout bandits.

In [None]:
!pip install kaggle-environments --upgrade 

In [None]:
%%writefile submission.py
import random
import numpy as np

class UCBAgent:
    """Upper Confidence Bound (UCB) agent for a two-player multi-armed bandit game.

    Every arm is scored as::

        (our_rewards + opp_rewards) / our_visits
            + sqrt(exploration * log(total_pulls) / our_visits)

    where ``total_pulls`` is the number of pulls we have made so far plus one,
    and arms we have never pulled use a visit count of 1.  The score is then
    multiplied by ``decayRate ** total_visits[arm]`` (our pulls plus the
    opponent's pulls of that arm).

    Before scores are consulted, an optional warm-up phase keeps re-pulling
    arms that are still "winning" and otherwise tries arms never pulled before.

    Parameters
    ----------
    exploration : float
        Weight of the exploration term inside the square root.
    opp_reward : float
        Pseudo-reward credited to an arm each time the opponent pulls it.
    warmup : int or bool
        Falsy disables the warm-up phase.  Otherwise an arm counts as a winner
        while ``our_visits <= our_rewards + (warmup - 1)``.
    winrate : float
        An already-tried arm also counts as a winner while its observed
        reward/visit ratio is at least
        ``winrate * decayRate ** (step / banditCount)``.
    choose : {'max', 'random'}
        'max' takes the arg-max of the scores; 'random' samples an arm with
        probability proportional to its score.
    verbose : bool
        Print the observation, scores, state and chosen action on every call.
    """

    # Settings reported as best by the parameter sweeps in this notebook:
    # exploration=12,  opp_reward=0.6, warmup=1, choose='max'
    # exploration=0.1, opp_reward=0.1, warmup=3, choose='random'
    def __init__(self, exploration=0.2,  opp_reward=0.2, warmup=2, winrate=0.8, choose='max', verbose=True):
        self.exploration = exploration
        self.choose      = choose
        self.opp_reward  = opp_reward
        self.warmup      = warmup
        self.winrate     = winrate
        self.verbose     = verbose
        self.history     = None  # created lazily by init_state()
        self.state       = None  # created lazily by init_state()


    def init_state(self, observation, configuration, force=False):
        """Create empty history and per-arm statistics.

        Does nothing if state already exists, unless ``force`` is true.
        ``observation`` is accepted for signature symmetry but not read.
        """
        if self.state is None or force:
            arm_count = configuration.banditCount
            self.history = {
                "actions":  [],   # arms we pulled, one per step
                "opponent": [],   # arms the opponent pulled, one per step
                "reward":   [],   # cumulative reward reported at each step
            }
            # Builtin ``float`` is used because the ``np.float`` alias was
            # removed in NumPy 1.24.
            self.state = {
                "our_rewards":  np.zeros(arm_count, dtype=float),
                "opp_rewards":  np.zeros(arm_count, dtype=float),
                "our_visits":   np.zeros(arm_count, dtype=float),
                "opp_visits":   np.zeros(arm_count, dtype=float),
                "total_visits": np.zeros(arm_count, dtype=float),
            }


    def update_state(self, observation, configuration):
        """Fold the outcome of the previous step into the per-arm statistics."""
        if self.state is None:
            self.init_state(observation, configuration)

        self.history['reward'].append( observation.reward )
        if len(self.history['actions']):
            # observation.reward is cumulative, so a strict increase means the last pull paid out
            our_reward      = int(self.history['reward'][-1] > self.history['reward'][-2])
            our_last_action = self.history['actions'][-1]
            if len( set(observation.lastActions) ) == 1:
                # both players pulled the same arm
                opp_last_action = our_last_action
            else:
                opp_last_action = list( set(observation.lastActions) - {our_last_action} )[0]
            self.history['opponent'].append(opp_last_action)

            self.state['our_rewards'][  our_last_action ] += our_reward
            self.state['opp_rewards'][  opp_last_action ] += self.opp_reward
            self.state['our_visits'][   our_last_action ] += 1
            self.state['opp_visits'][   opp_last_action ] += 1
            self.state['total_visits'][ our_last_action ] += 1
            self.state['total_visits'][ opp_last_action ] += 1


    def scores(self, observation, configuration):
        """Return the decayed UCB score of every arm as a 1-D float array."""
        total_pulls = np.sum(self.state['our_visits']) + 1
        # Element-wise floor of 1 so untried arms do not divide by zero.
        # (np.max over the stacked arrays would collapse to a single global
        # scalar and give every arm the same denominator.)
        our_visits = np.maximum(self.state['our_visits'], 1)
        scores = (
            (self.state['our_rewards'] + self.state['opp_rewards']) / our_visits
            + np.sqrt( self.exploration * np.log(total_pulls) / our_visits )
        )
        scores *= configuration.decayRate ** self.state['total_visits']
        return scores


    # observation   {'remainingOverageTime': 60, 'step': 1, 'reward': 1, 'lastActions': [54, 94]}
    # configuration {'episodeSteps': 2000, 'actTimeout': 0.25, 'runTimeout': 1200, 'banditCount': 100, 'decayRate': 0.97, 'sampleResolution': 100}
    def agent(self, observation, configuration):
        """Update statistics from ``observation`` and return the arm to pull as an ``int``.

        Raises
        ------
        ValueError
            If the score-based branch is reached and ``self.choose`` is
            neither 'max' nor 'random'.
        """
        self.update_state(observation, configuration)

        scores = self.scores(observation, configuration)

        visits  = self.state['our_visits']
        rewards = self.state['our_rewards']
        tried   = visits != 0

        # Arm has lost fewer than `warmup` times so far.
        few_losses = visits <= rewards + (self.warmup - 1)
        # Observed win rate still above a threshold that shrinks as the game progresses.
        # Untried arms get a rate of 0 (floor of 1 on the denominator avoids 0/0 warnings);
        # they are excluded by `tried` anyway.
        threshold = self.winrate * configuration.decayRate ** (observation.step / configuration.banditCount)
        high_rate = (rewards / np.maximum(visits, 1)) >= threshold

        winners = np.flatnonzero(tried & (few_losses | high_rate))
        untried = np.flatnonzero(~tried)

        if self.warmup and len(winners):
            action = np.random.choice(winners)  # keep trying winners until we lose
        elif self.warmup and len(untried):
            action = np.random.choice(untried)
        elif self.choose == 'random':
            if float(np.sum(scores)) > 0:
                action = random.choices( population=range(len(scores)), weights=scores.tolist(), k=1 )[0]
            else:
                # All-zero weights (e.g. the very first step with warmup disabled)
                # make random.choices raise on Python 3.9+; fall back to uniform.
                action = random.randrange(len(scores))
        elif self.choose == 'max':
            action = np.argmax(scores)
        else:
            raise ValueError(f"choose must be 'max' or 'random', got {self.choose!r}")

        action = int(action)

        if self.verbose:
            print()
            print('observation = ', observation)
            print(f'scores = {list(scores.round(2))}')
            for key, values in self.state.items():
                print(f'self.state["{key}"] = {list(values)}')
            print(f'action = {action}')

        self.history['actions'].append(action)
        return action


    def __call__(self, observation, configuration):
        """Allow the instance itself to be used as the agent callable."""
        return self.agent(observation, configuration)
    
# One shared agent object, so the per-arm statistics persist from call to call.
ucb_instance = UCBAgent()


def ucb_agent(observation, configuration):
    """Function-style entry point: forward the turn to the shared ``ucb_instance``."""
    return ucb_instance(observation, configuration)

In [None]:
# Execute the freshly written submission.py in this kernel; later cells use the UCBAgent class it defines.
%run submission.py

In [None]:
from kaggle_environments import evaluate, make, utils
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

Agent successfully plays against itself

In [None]:
# Smoke test: run the written submission against itself, then against the random agent.
submission_path   = "submission.py"
random_agent_path = "../input/candy-cane-random-agent/submission.py"

env = make("mab", debug=False)
env.reset()
env.run([submission_path, submission_path])

# Trailing expression, so the notebook displays the evaluate() result.
evaluate("mab", [submission_path, random_agent_path])

UCB Agent can consistently beat random agent by a small margin 

In [None]:
%%time

env = make("mab", debug=False)
env.reset()

# Ten independent games, each with a fresh UCBAgent, against the random agent.
games = [
    delayed(evaluate)("mab", [UCBAgent().agent, "../input/candy-cane-random-agent/submission.py"])
    for _ in range(10)
]
# One row per game, one column per player.
results = np.array(Parallel(n_jobs=-1)(games)).reshape(-1, 2)
print('results:\n', results)
print('mean: ', results.mean(axis=0).round(1))
print('std:  ', results.std(axis=0).round(1))

# Warmup

- choose=max | warmup=1 is best
- choose=random | warmup=3 is best  

In [None]:
%%time
# DOCS: https://www.kaggle.com/jamesmcguigan/rock-paper-scissors-agents-comparison/
# Round-robin sweep over the `warmup` setting, repeated for each selection mode.
for choose in [ 'max', 'random' ]:
    warmups = [ 0, 1, 2, 3 ] if choose == 'max' else [ 0, 1, 2, 3, 4, 5, 6 ]
    # Builtin int: the np.int alias was removed in NumPy 1.24.
    scores  = np.zeros(( len(warmups), len(warmups) ), dtype=int)

    def evaluate_warmups(i1, e1, i2, e2):
        """Play warmup=e1 against warmup=e2 once; return the indices, settings and evaluate() result."""
        exploration = 8 if choose == 'max' else 0.1
        result = evaluate("mab", [
            UCBAgent(exploration=exploration, opp_reward=0, choose=choose, warmup=e1),
            UCBAgent(exploration=exploration, opp_reward=0, choose=choose, warmup=e2)
        ])
        return (i1, e1, i2, e2, result)

    results = Parallel(n_jobs=-1)(
        delayed(evaluate_warmups)(i1, e1, i2, e2)
        for i1, e1 in enumerate(warmups)
        for i2, e2 in enumerate(warmups)
        # for n in range(2)
    )
    # Each ordered pairing adds its reward margin to [i1, i2] and the negated margin to [i2, i1].
    # result[0] is presumably the [first agent, second agent] rewards of the single episode;
    # a missing (None) reward is counted as 0.
    for (i1, e1, i2, e2, result) in results:
        scores[i1, i2] += (result[0][0] or 0) - (result[0][1] or 0)
        scores[i2, i1] += (result[0][1] or 0) - (result[0][0] or 0)

    df_scores = pd.DataFrame(
        scores,
        index   = list(map(lambda n: f'{n:.0f}', warmups)),
        columns = list(map(lambda n: f'{n:.0f}', warmups)),
    )
    plt.figure(figsize=(scores.shape[0]//1.5, scores.shape[1]//1.5))
    plt.title(f'warmup | choose={choose}')
    sns.heatmap(
        df_scores, annot=True, cbar=False,
        cmap='coolwarm', linewidths=1,
        linecolor='black', fmt="d",
    )
    plt.tick_params(labeltop=True, labelright=True)
    plt.xticks(rotation=90, fontsize=11)
    plt.yticks(rotation=0,  fontsize=11)
    print(f'warmup | choose={choose}')
    # Row mean = average margin of that warmup value over all opponents.
    print(df_scores.mean(axis=1))

# Choose

choose='random' is better than choose='max'

In [None]:
%%time

env = make("mab", debug=False)
env.reset()

# Ten head-to-head games: greedy selection (first column) vs weighted-random selection (second column).
matches = [
    delayed(evaluate)("mab", [
        UCBAgent(choose='max',    exploration=12),
        UCBAgent(choose='random', exploration=0.1),
    ])
    for _ in range(10)
]
results = np.array(Parallel(n_jobs=-1)(matches)).reshape(-1, 2)
print('results:\n', results)
print('mean: ', results.mean(axis=0).round(1))
print('std:  ', results.std(axis=0).round(1))

# Exploration
- choose=random + warmup=True | Smaller exploration is better, 0.2 is optimal
- choose=max    + warmup=True | Higher  exploration is better, 12 is optimal

In [None]:
%%time
# DOCS: https://www.kaggle.com/jamesmcguigan/rock-paper-scissors-agents-comparison/

# Round-robin sweep over `exploration` for choose='max'.
explorations = [ 0, 1, 2, 4, 6, 8, 10, 12, 14, 16, 32 ]
# Alternative fine-grained grid:
# explorations = list(np.arange(0.0, 1.0, 0.1)) + [ 1, 1.41, 2 ]
# Builtin int: the np.int alias was removed in NumPy 1.24.
scores       = np.zeros(( len(explorations), len(explorations) ), dtype=int)


def evaluate_explorations(i1, e1, i2, e2):
    """Play exploration=e1 against exploration=e2 once; return the indices, settings and evaluate() result."""
    result = evaluate("mab", [
        UCBAgent(exploration=e1, opp_reward=0, choose='max', warmup=True),
        UCBAgent(exploration=e2, opp_reward=0, choose='max', warmup=True)
    ])
    return (i1, e1, i2, e2, result)

results = Parallel(n_jobs=-1)(
    delayed(evaluate_explorations)(i1, e1, i2, e2)
    for i1, e1 in enumerate(explorations)
    for i2, e2 in enumerate(explorations)
    # for n in range(2)
)
# Each ordered pairing adds its reward margin to [i1, i2] and the negated margin to [i2, i1];
# a missing (None) reward is counted as 0.
for (i1, e1, i2, e2, result) in results:
    scores[i1, i2] += (result[0][0] or 0) - (result[0][1] or 0)
    scores[i2, i1] += (result[0][1] or 0) - (result[0][0] or 0)

df_scores = pd.DataFrame(
    scores,
    index   = list(map(lambda n: f'{n:.1f}', explorations)),
    columns = list(map(lambda n: f'{n:.1f}', explorations)),
)
plt.figure(figsize=(scores.shape[0]//1.5, scores.shape[1]//1.5))
plt.title('exploration | choose=max')
sns.heatmap(
    df_scores, annot=True, cbar=False,
    cmap='coolwarm', linewidths=1,
    linecolor='black', fmt="d",
)
plt.tick_params(labeltop=True, labelright=True)
plt.xticks(rotation=90, fontsize=11)
plt.yticks(rotation=0,  fontsize=11)
# Row mean = average margin of that exploration value over all opponents.
print(df_scores.mean(axis=1))

In [None]:
%%time
# DOCS: https://www.kaggle.com/jamesmcguigan/rock-paper-scissors-agents-comparison/

# Round-robin sweep over `exploration` for choose='random'.
# Alternative coarse grid:
# explorations = [ 0.1, 0.25, 0.5, 1, 1.4, 2, 4, 8, 16, 32 ]
explorations = list(np.arange(0.0, 1.0, 0.05))
# Builtin int: the np.int alias was removed in NumPy 1.24.
scores       = np.zeros(( len(explorations), len(explorations) ), dtype=int)


def evaluate_explorations(i1, e1, i2, e2):
    """Play exploration=e1 against exploration=e2 once; return the indices, settings and evaluate() result."""
    result = evaluate("mab", [
        UCBAgent(exploration=e1, opp_reward=0, choose='random', warmup=3),
        UCBAgent(exploration=e2, opp_reward=0, choose='random', warmup=3)
    ])
    return (i1, e1, i2, e2, result)

results = Parallel(n_jobs=-1)(
    delayed(evaluate_explorations)(i1, e1, i2, e2)
    for i1, e1 in enumerate(explorations)
    for i2, e2 in enumerate(explorations)
    # for n in range(2)
)
# Each ordered pairing adds its reward margin to [i1, i2] and the negated margin to [i2, i1];
# a missing (None) reward is counted as 0.
for (i1, e1, i2, e2, result) in results:
    scores[i1, i2] += (result[0][0] or 0) - (result[0][1] or 0)
    scores[i2, i1] += (result[0][1] or 0) - (result[0][0] or 0)

df_scores = pd.DataFrame(
    scores,
    index   = list(map(lambda n: f'{n:.2f}', explorations)),
    columns = list(map(lambda n: f'{n:.2f}', explorations)),
)
plt.figure(figsize=(scores.shape[0]//1.5, scores.shape[1]//1.5))
plt.title('exploration | choose=random')
sns.heatmap(
    df_scores, annot=True, cbar=False,
    cmap='coolwarm', linewidths=1,
    linecolor='black', fmt="d",
)
plt.tick_params(labeltop=True, labelright=True)
plt.xticks(rotation=90, fontsize=11)
plt.yticks(rotation=0,  fontsize=11)
# Row mean = average margin of that exploration value over all opponents.
print(df_scores.mean(axis=1))

# opp_reward
- choose=max    | opp_reward prefers 0.6+
- choose=random | opp_reward prefers 0.1 if the opponent is not copying, but 0.3 or 0.8 if they are

In [None]:
%%time

# Round-robin sweep over `opp_reward` for choose='max'.
opp_rewards = list(np.arange(0.0, 1.0, 0.1))
# Builtin int: the np.int alias was removed in NumPy 1.24.
scores      = np.zeros(( len(opp_rewards), len(opp_rewards) ), dtype=int)

def evaluate_opp_rewards(i1, e1, i2, e2):
    """Play opp_reward=e1 against opp_reward=e2 once; return the indices, settings and evaluate() result."""
    result = evaluate("mab", [
        UCBAgent(opp_reward=e1, choose='max', exploration=8, warmup=1),
        UCBAgent(opp_reward=e2, choose='max', exploration=8, warmup=1),
    ])
    return (i1, e1, i2, e2, result)

results = Parallel(n_jobs=-1)(
    delayed(evaluate_opp_rewards)(i1, e1, i2, e2)
    for i1, e1 in enumerate(opp_rewards)
    for i2, e2 in enumerate(opp_rewards)
    # for n in range(2)
)
# Each ordered pairing adds its reward margin to [i1, i2] and the negated margin to [i2, i1];
# a missing (None) reward is counted as 0.
for (i1, e1, i2, e2, result) in results:
    scores[i1, i2] += (result[0][0] or 0) - (result[0][1] or 0)
    scores[i2, i1] += (result[0][1] or 0) - (result[0][0] or 0)

df_scores = pd.DataFrame(
    scores,
    index   = list(map(lambda n: f'{n:.1f}', opp_rewards)),
    columns = list(map(lambda n: f'{n:.1f}', opp_rewards)),
)
plt.figure(figsize=(scores.shape[0]//1.5, scores.shape[1]//1.5))
plt.title('opp_reward | choose=max')
sns.heatmap(
    df_scores, annot=True, cbar=False,
    cmap='coolwarm', linewidths=1,
    linecolor='black', fmt="d",
)
plt.tick_params(labeltop=True, labelright=True)
plt.xticks(rotation=90, fontsize=11)
plt.yticks(rotation=0,  fontsize=11)
# Row mean = average margin of that opp_reward value over all opponents.
print(df_scores.mean(axis=1))

In [None]:
%%time

# Round-robin sweep over `opp_reward` for choose='random'.
opp_rewards = list(np.arange(0.0, 1.0, 0.1))
# Builtin int: the np.int alias was removed in NumPy 1.24.
scores      = np.zeros(( len(opp_rewards), len(opp_rewards) ), dtype=int)

def evaluate_opp_rewards(i1, e1, i2, e2):
    """Play opp_reward=e1 against opp_reward=e2 once; return the indices, settings and evaluate() result."""
    result = evaluate("mab", [
        UCBAgent(opp_reward=e1, choose='random', exploration=0.1, warmup=3),
        UCBAgent(opp_reward=e2, choose='random', exploration=0.1, warmup=3),
    ])
    return (i1, e1, i2, e2, result)

results = Parallel(n_jobs=-1)(
    delayed(evaluate_opp_rewards)(i1, e1, i2, e2)
    for i1, e1 in enumerate(opp_rewards)
    for i2, e2 in enumerate(opp_rewards)
    # for n in range(2)
)
# Each ordered pairing adds its reward margin to [i1, i2] and the negated margin to [i2, i1];
# a missing (None) reward is counted as 0.
for (i1, e1, i2, e2, result) in results:
    scores[i1, i2] += (result[0][0] or 0) - (result[0][1] or 0)
    scores[i2, i1] += (result[0][1] or 0) - (result[0][0] or 0)

df_scores = pd.DataFrame(
    scores,
    index   = list(map(lambda n: f'{n:.1f}', opp_rewards)),
    columns = list(map(lambda n: f'{n:.1f}', opp_rewards)),
)
plt.figure(figsize=(scores.shape[0]//1.5, scores.shape[1]//1.5))
plt.title('opp_reward | choose=random')
sns.heatmap(
    df_scores, annot=True, cbar=False,
    cmap='coolwarm', linewidths=1,
    linecolor='black', fmt="d",
)
plt.tick_params(labeltop=True, labelright=True)
plt.xticks(rotation=90, fontsize=11)
plt.yticks(rotation=0,  fontsize=11)
# Row mean = average margin of that opp_reward value over all opponents.
print(df_scores.mean(axis=1))

# winrate


High variance result
- choose=max | winrate=0.85
- choose=random | winrate=0.8

In [None]:
%%time
# DOCS: https://www.kaggle.com/jamesmcguigan/rock-paper-scissors-agents-comparison/
# Round-robin sweep over the `winrate` setting, repeated for each selection mode.
for choose in [ 'max', 'random' ]:
    winrates = [ 0.95, 0.90, 0.85, 0.80, 0.75, 0.70, 0.65, 0.6, 0.5 ]
    # Builtin int: the np.int alias was removed in NumPy 1.24.
    scores   = np.zeros(( len(winrates), len(winrates) ), dtype=int)

    # NOTE: despite its name this helper varies `winrate`; the name is kept unchanged
    # so any other cell referring to it keeps working.
    def evaluate_warmups(i1, e1, i2, e2):
        """Play winrate=e1 against winrate=e2 once; return the indices, settings and evaluate() result."""
        exploration = 12 if choose == 'max' else 0.2
        warmup      =  1 if choose == 'max' else 2
        result = evaluate("mab", [
            UCBAgent(exploration=exploration, opp_reward=0, choose=choose, warmup=warmup, winrate=e1),
            UCBAgent(exploration=exploration, opp_reward=0, choose=choose, warmup=warmup, winrate=e2)
        ])
        return (i1, e1, i2, e2, result)

    results = Parallel(n_jobs=-1)(
        delayed(evaluate_warmups)(i1, e1, i2, e2)
        for i1, e1 in enumerate(winrates)
        for i2, e2 in enumerate(winrates)
        # for n in range(2)
    )
    # Each ordered pairing adds its reward margin to [i1, i2] and the negated margin to [i2, i1];
    # a missing (None) reward is counted as 0.
    for (i1, e1, i2, e2, result) in results:
        scores[i1, i2] += (result[0][0] or 0) - (result[0][1] or 0)
        scores[i2, i1] += (result[0][1] or 0) - (result[0][0] or 0)

    df_scores = pd.DataFrame(
        scores,
        index   = list(map(lambda n: f'{n:.2f}', winrates)),
        columns = list(map(lambda n: f'{n:.2f}', winrates)),
    )
    plt.figure(figsize=(scores.shape[0]//1.5, scores.shape[1]//1.5))
    plt.title(f'winrates | choose={choose}')
    sns.heatmap(
        df_scores, annot=True, cbar=False,
        cmap='coolwarm', linewidths=1,
        linecolor='black', fmt="d",
    )
    plt.tick_params(labeltop=True, labelright=True)
    plt.xticks(rotation=90, fontsize=11)
    plt.yticks(rotation=0,  fontsize=11)
    print(f'winrates | choose={choose}')
    # Row mean = average margin of that winrate value over all opponents.
    print(df_scores.mean(axis=1))