# Candy Cane - Multi-Armed Bandit

This notebook shows how to build and evaluate a multi-armed bandit agent.

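The multi-armed bandit approach used here (sampling each bandit's win probability from a beta distribution, i.e. Thompson sampling) is a widely used RL algorithm because it balances exploitation and exploration well.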

Algorithm logic:

- At each step, for each bandit, draw a random number from B(a+1, b+1), where B is the beta distribution, a is the decay-adjusted total reward from this bandit, and b is this bandit's accumulated historical losses (each loss adds `no_reward_step`).
- Select the bandit with the largest generated number and use it to generate the next step
- Optionally (when `retry_winrate=True`): if we won the last round, repeat the last action instead of sampling

In [None]:
!pip install kaggle-environments --upgrade -q

In [None]:
%%writefile submission.py

import json
import numpy as np
import pandas as pd

class MultiArmedBandit:
    """Beta-sampling bandit agent for the Kaggle "mab" environment.

    For every bandit the agent keeps a two-element list ``[wins, losses]``
    (both start at 1).  Each turn it draws one sample per bandit from
    ``Beta(wins, losses)`` and plays the bandit with the largest sample.

    Parameters:
        no_reward_step: amount added to a bandit's loss counter after a
            pull that produced no reward.
        retry_winrate:  when true, repeat the previous action whenever the
            last pull earned exactly 1 reward, instead of sampling.
    """

    def __init__(self, no_reward_step=0.95, retry_winrate=False):
        # Tunable behaviour
        self.no_reward_step = no_reward_step
        self.retry_winrate = retry_winrate

        # Bookkeeping carried from one call to the next
        self.bandit_state = None
        self.total_reward = 0
        self.last_step = 0

    def __call__(self, obs, conf):
        """Allow the instance itself to be used as the agent callable."""
        return self.agent(obs, conf)

    # Example inputs:
    # observation   {'remainingOverageTime': 60, 'agentIndex': 1, 'reward': 0, 'step': 0, 'lastActions': []}
    # configuration {'episodeSteps': 2000, 'actTimeout': 0.25, 'runTimeout': 1200, 'banditCount': 100, 'decayRate': 0.97, 'sampleResolution': 100}
    def agent(self, obs, conf):
        """Update the statistics from the previous step and choose a bandit.

        Returns the index of the bandit to pull this step.
        """
        # obs.reward is cumulative, so the difference is what the last pull earned
        gained = obs.reward - self.total_reward
        self.total_reward = obs.reward

        if obs.step == 0:
            # Fresh statistics: one [wins, losses] pair per bandit
            self.bandit_state = [[1, 1] for _ in range(conf.banditCount)]
        else:
            own_arm = self.bandit_state[obs.lastActions[obs.agentIndex]]
            if gained > 0:
                own_arm[0] += gained
            else:
                own_arm[1] += self.no_reward_step

            # Decay the win counter (towards its initial value of 1) of the
            # bandits found in the first two lastActions slots
            for slot in (0, 1):
                pulled_arm = self.bandit_state[obs.lastActions[slot]]
                pulled_arm[0] = (pulled_arm[0] - 1) * conf.decayRate + 1

        choice = self.last_step
        if not (self.retry_winrate and gained == 1):
            # Draw one Beta sample per bandit and keep the first largest one
            top_sample = -1
            for k in range(conf.banditCount):
                wins, losses = self.bandit_state[k][0], self.bandit_state[k][1]
                sample = np.random.beta(wins, losses)
                if sample > top_sample:
                    top_sample, choice = sample, k
        # otherwise: we just won and retry_winrate is on, so repeat the last action

        self.last_step = choice
        return choice
    
    
# One shared instance at module level, so its statistics survive between calls.
MultiArmedBandit_instance = MultiArmedBandit()


def MultiArmedBandit_agent(obs, conf):
    """Function-style entry point that forwards to the shared bandit instance."""
    return MultiArmedBandit_instance.agent(obs, conf)

In [None]:
%%writefile random_agent.py

import random
class RandomAgent():
    """Callable agent that picks a bandit uniformly at random."""

    def __call__(self, observation, configuration):
        """Return a random bandit index in ``[0, configuration.banditCount)``.

        ``observation`` is accepted for interface compatibility and ignored.
        """
        # `self` was missing from the signature, so calling an instance with
        # (observation, configuration) raised TypeError.
        return random.randrange(configuration.banditCount)
    
def random_agent(observation, configuration):
    """Pick a bandit index uniformly at random; the observation is not used."""
    bandit_total = configuration.banditCount
    return random.randrange(bandit_total)

In [None]:
%run submission.py
%run random_agent.py

In [None]:
from kaggle_environments import make, evaluate

# Quick sanity matches: each entry pairs the label to print with the agents to run.
matchups = [
    (["submission.py", "random_agent.py"], ["submission.py", "random_agent.py"]),
    (["submission.py", "submission.py  "], ["submission.py", "submission.py"]),
    (
        ["MultiArmedBandit(0.4)", "MultiArmedBandit(0.75)"],
        [MultiArmedBandit(no_reward_step=0.4), MultiArmedBandit(no_reward_step=0.75)],
    ),
]
for shown_as, contestants in matchups:
    print(shown_as, evaluate("mab", contestants))

In [None]:
from kaggle_environments import make

# Play a single debug-mode episode of the submission against the random agent
# and render the result inline.
env = make("mab", debug=True)
env.reset()

contestants = ["submission.py", "random_agent.py"]
env.run(contestants)

render_options = {"mode": "ipython", "width": 500, "height": 500}
env.render(**render_options)

# Hyperparameters

In [None]:
import glob
import re
import os
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from collections import defaultdict
from joblib import Parallel, delayed
from kaggle_environments import evaluate, make, utils

In [None]:
%%time
# Map every contestant (agent instance or script path) to a readable label.
# The bandit variants cover each no_reward_step on the grid, with and without
# the repeat-on-win option.
no_reward_grid = np.arange(0.75, 1.0, 0.05)
bandit_variants = {
    MultiArmedBandit(no_reward_step=n, retry_winrate=retry_winrate): f'no_reward_step={n:.2f} retry_winrate={retry_winrate}'
    for n, retry_winrate in itertools.product(no_reward_grid, [True, False])
}
agents = {
    **bandit_variants,
    "random_agent.py": "random_agent",
    "../input/rock-paper-candy-copy-opponent-move-unless-win/submission.py": "copy_opponent_unless_win",
    "../input/candy-cane-optimized-ucb/submission.py": "optimized-ucb",
}
# print(agents)

def evaluate_mab(i1, i2, agent1, agent2):
    """Play one "mab" match between two agents and return the flattened result.

    Parameters:
        i1, i2:         indices of the two agents; passed through unchanged so
                        the caller can tell which pairing a result belongs to.
        agent1, agent2: the agents handed to ``evaluate``.

    Returns:
        ``(i1, i2, result)`` where ``result`` is the flattened array built
        from what ``evaluate`` returned, or ``[0, 0]`` if the match failed.
    """
    # print(i1, i2, agent1, agent2)
    try:
        result = evaluate("mab", [agent1, agent2])
        result = np.array(result).flatten()
    except Exception:
        # Best effort: a failed match counts as 0:0 instead of aborting the
        # whole tournament.  `except Exception` (rather than a bare except)
        # still lets KeyboardInterrupt / SystemExit stop the run.
        result = np.array([0, 0])
    return (i1, i2, result)
    
# Schedule 10 matches for every unordered pair of distinct agents (i1 < i2)
# and run them through joblib with n_jobs=-1.
indexed_agents = list(enumerate(agents.keys()))
match_jobs = (
    delayed(evaluate_mab)(i1, i2, agent1, agent2)
    for (i1, agent1), (i2, agent2) in itertools.combinations(indexed_agents, 2)
    for _ in range(10)
)
results = Parallel(n_jobs=-1)(match_jobs)
# results

In [None]:
def winrate_score(score1, score2):
    """Score a match from the first player's point of view.

    Returns 1 if ``score1`` beats ``score2``, -1 if it loses and 0 for a
    draw.  A ``None`` score loses to any other score; two equal scores
    (including two ``None``) are a draw.  Scores that cannot be compared
    are also treated as a draw.
    """
    try:
        if score1 == score2:
            return 0
        if score1 is None:
            return -1
        if score2 is None:
            return 1
        if score1 > score2:
            return 1
        if score1 < score2:
            return -1
    except (TypeError, ValueError):
        # Non-comparable scores: fall through to a draw.  Only comparison
        # errors are swallowed, so interrupts and unrelated bugs still surface.
        pass
    return 0
    

# Labels in dict order, i.e. the same order as the i1/i2 indices stored in `results`.
agent_labels = list(agents.values())
n_agents = len(agent_labels)

scores_agent = defaultdict(list)
# The np.int / np.float aliases were removed in NumPy 1.24; the builtin
# int / float select the same dtypes.
scores_total = np.zeros((n_agents, n_agents), dtype=int)
scores_diff  = np.zeros((n_agents, n_agents), dtype=float)
winrates     = np.zeros((n_agents, n_agents), dtype=int)

# Accumulate every match into the pairwise matrices (row = agent, column = opponent).
for (i1, i2, result) in results:
    score1 = result[0] or 0   # a missing (None) score counts as 0 in the totals
    score2 = result[1] or 0
    scores_total[i1, i2] += score1
    scores_total[i2, i1] += score2
    scores_diff[i1, i2]  += score1 - score2
    scores_diff[i2, i1]  += score2 - score1
    # winrate_score receives the raw values
    winrates[i1, i2]     += winrate_score(result[0], result[1])
    winrates[i2, i1]     += winrate_score(result[1], result[0])
    scores_agent[agent_labels[i1]].append(result[0])
    scores_agent[agent_labels[i2]].append(result[1])

df_scores_total = pd.DataFrame(scores_total, index=agent_labels, columns=agent_labels)
df_scores_diff  = pd.DataFrame(scores_diff,  index=agent_labels, columns=agent_labels)
df_winrates     = pd.DataFrame(winrates,     index=agent_labels, columns=agent_labels)
df_scores_agent = pd.DataFrame(scores_agent)

# Sort rows and columns by mean score per agent, best first
ranking = df_scores_agent.mean().sort_values(ascending=False).index
df_scores_total = df_scores_total.reindex(index=ranking, columns=ranking)
df_scores_diff  = df_scores_diff.reindex(index=ranking, columns=ranking)
df_winrates     = df_winrates.reindex(index=ranking, columns=ranking)
df_scores_agent = df_scores_agent.reindex(ranking, axis=1)


df_scores_agent.T

In [None]:
def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items each.

    ``iterable`` must support ``len()`` and slicing.
    """
    total = len(iterable)
    for start in range(0, total, n):
        stop = min(start + n, total)
        yield iterable[start:stop]

        
def plot_df_heatmap(df, title, **kwargs):
    """Draw an annotated seaborn heatmap of ``df`` and print its row means.

    Parameters:
        df:       DataFrame to plot.
        title:    figure title; also printed above the row means.
        **kwargs: extra keyword arguments forwarded to ``sns.heatmap``.
    """
    n_rows, n_cols = df.shape
    # figsize is (width, height): width follows the number of columns and
    # height the number of rows (identical to before for square frames).
    plt.figure(figsize=(n_cols, n_rows))
    plt.title(title)
    sns.heatmap(
        df, annot=True, cbar=False,
        cmap='coolwarm', linewidths=1,
        linecolor='black',
        fmt='.0f',
        **kwargs
    )
    plt.tick_params(labeltop=True, labelright=True)
    plt.xticks(rotation=90, fontsize=max(10, n_rows))
    plt.yticks(rotation=0,  fontsize=max(10, n_rows))
    print(title)
    # Row means, largest first
    print(df.mean(axis=1).sort_values(ascending=False))
    
    
def plot_df_boxplot(df, title, columns=10, boxplot_args=None, stripplot_args=None):
    """Draw box plots with overlaid strip plots for the columns of ``df``.

    The columns are split into roughly equal groups of about ``columns``
    columns, and each group gets its own figure.

    Parameters:
        df:             DataFrame whose columns are plotted.
        title:          title placed on every figure.
        columns:        target number of columns per figure.
        boxplot_args:   optional extra keyword arguments for ``sns.boxplot``.
        stripplot_args: optional extra keyword arguments for ``sns.stripplot``;
                        they override the defaults (jitter, size, facecolor).
    """
    import math  # local import: `math` is not imported at the top of this notebook

    n_total = len(df.columns)
    if n_total == 0:
        # Nothing to plot (and the figure layout below would divide by zero)
        return

    box_kwargs = dict(boxplot_args or {})
    # Same defaults as before; previously the caller's stripplot_args were
    # overwritten and boxplot_args leaked into the stripplot call.
    strip_kwargs = {
        "jitter": 0.33,
        "size": 5,
        "facecolor": 'white',
        **(stripplot_args or {}),
    }

    n_rows    = math.ceil(n_total / columns)
    n_columns = math.ceil(n_total / n_rows)
    for cols in batch(df.columns, n_columns):
        chunk = df[cols]
        plt.figure(figsize=(n_columns*2, n_rows*6))
        plt.title(title, loc="center")

        ax = sns.boxplot(data=chunk, **box_kwargs)
        # Restyle the boxes as grey outlines on white
        plt.setp(ax.artists, edgecolor='grey', facecolor='w')
        plt.setp(ax.lines, color='grey')

        ax = sns.stripplot(data=chunk, **strip_kwargs)

        # ax = sns.swarmplot(data=df_scores_agent)
        plt.xticks(rotation=90, fontsize=15)
        plt.yticks(rotation=0,  fontsize=15)

In [None]:
# Heatmap of the pairwise total scores
plot_df_heatmap(df=df_scores_total, title='Total Scores')

In [None]:
# Heatmap of the pairwise score differences
plot_df_heatmap(df=df_scores_diff, title='Relative Scores')

In [None]:
# Distribution of every individual match score, one box per agent
plot_df_boxplot(df=df_scores_agent, title="All Matchmaking Scores")

# Further Reading

This notebook is part of a series exploring the Santa2020 Candy Cane competition
- [Rock Paper Candy - Copy Opponent Move](https://www.kaggle.com/jamesmcguigan/rock-paper-candy-copy-opponent-move)
- [Rock Paper Candy - Copy Opponent Move Unless Win](https://www.kaggle.com/jamesmcguigan/rock-paper-candy-copy-opponent-move-unless-win)
- [Candy Cane - Multi-Armed Bandit](https://www.kaggle.com/jamesmcguigan/candy-cane-multi-armed-bandit)
- [Candy Cane - Optimized UCB](https://www.kaggle.com/jamesmcguigan/candy-cane-optimized-ucb)
- [Candy Cane - Random Agent](https://www.kaggle.com/jamesmcguigan/candy-cane-random-agent)

I also created an agents comparison notebook to compare the relative strengths of public agents:
- [Santa 2020 - Agents Comparison](https://www.kaggle.com/jamesmcguigan/santa-2020-agents-comparison/)