# Santa 2020 - Agents Comparison

This notebook attempts to compare the various public notebooks for this competition.

If you have a public notebook that is not included here, please leave a comment with a link so it can be added. Alternatively, fork this notebook, add your own private notebooks as a dataset, and rerun.

Inspired by:
- https://www.kaggle.com/ihelon/rock-paper-scissors-agents-comparison
- https://www.kaggle.com/naokimaeda/local-evaluation

In [None]:
# Install (or upgrade to the latest) kaggle-environments, the package the imports below pull `evaluate` from; -q keeps pip output quiet.
!pip install kaggle-environments --upgrade -q

In [None]:
import time
import math
import glob
import re
import os
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from collections import defaultdict
from joblib import Parallel, delayed
from kaggle_environments import evaluate, make, utils

# Agents

In [None]:
# Agent files whose path contains any of these substrings are left out of the comparison.
excludes = [ "random_agent.py" ]  # + [ "always_first_agent.py" ]

# Raw-string patterns: the previous non-raw literals relied on invalid escape
# sequences such as '\.', which recent Python versions warn about.
_AGENT_PATH_NOISE = re.compile(r'\.\./input/|/submission.py')
_AGENT_NAME_NOISE = re.compile(r'^((rock|paper|candy|cane|santa|2020|get-started-to)[.-]?)+|-agent|\.py$')


def agent_display_name(path):
    """Turn an agent file path into a short label used for tables and plots.

    Steps, applied in order:
      1. strip the '../input/' prefix and any '/submission.py' component;
      2. strip leading competition words (rock, paper, candy, cane, santa,
         2020, get-started-to), every '-agent', and a trailing '.py';
      3. replace each '/' with ' \\n ' so dataset and file name end up on
         separate lines of a tick label.

    Args:
        path: file path as returned by glob, e.g. '../input/<dataset>/<file>.py'.

    Returns:
        The cleaned-up label string.
    """
    name = _AGENT_PATH_NOISE.sub('', path)
    name = _AGENT_NAME_NOISE.sub('', name)
    return name.replace('/', ' \n ')


# NOTE: glob is called without recursive=True, so '**' behaves like '*' and
# only files exactly one directory below ../input are matched.
agent_files = glob.glob('../input/**/*.py')
agent_files = [ name for name in agent_files if not any( exclude in name for exclude in excludes ) ]
agent_names = [ agent_display_name(name) for name in agent_files ]

# Mapping of agent file path -> display label (insertion order = glob order).
agents = dict(zip(agent_files, agent_names))
agents

In [None]:
# Start from a clean slate: remove any ./agents directory left by a previous run.
!rm -rf ./agents
# Recursively copy the attached input datasets into ./agents.
!rsync -r ../input ./agents
# Keep only Python sources: delete every regular file that does not end in .py.
!find ./agents/ -type f -not -name '*.py' -delete
# Delete directories named '__results___files'.
!find ./agents/ -type d -name '__results___files' -delete
# Uncomment to list what is left in ./agents:
# !find ./agents

# Evaluation

In [None]:
%%time
def evaluate_mab(i1, i2, agent1, agent2):
    """Run one "mab" evaluation between two agents and return the flattened result.

    Args:
        i1: identifier of the first agent; returned unchanged so the caller can
            attribute the result (the driver passes the agent's index).
        i2: identifier of the second agent; returned unchanged.
        agent1: first agent, passed as-is to kaggle_environments.evaluate.
        agent2: second agent, passed as-is to kaggle_environments.evaluate.

    Returns:
        Tuple (i1, i2, result) where result is the output of evaluate()
        flattened into a 1-D numpy array, or np.array([0, 0]) if the
        evaluation raised an exception.
    """
    print(i1, i2, agent1, agent2)
    try:
        result = np.array(evaluate("mab", [ agent1, agent2 ])).flatten()
    except Exception as error:
        # Best effort: one broken agent must not abort the whole tournament,
        # so the match is recorded as 0-0. Catching Exception (rather than a
        # bare except) lets KeyboardInterrupt/SystemExit still stop the run,
        # and the message keeps the failure visible in the cell output.
        print(f"evaluate_mab failed for {agent1} vs {agent2}: {error!r}")
        result = np.array([0,0])
    return (i1, i2, result)
    

    
# Read the run type once; the code below only distinguishes 'Interactive'
# from everything else (presumably set by the Kaggle runtime - confirm).
run_type = os.environ.get('KAGGLE_KERNEL_RUN_TYPE','')

if run_type == 'Interactive':
    # For debugging: only keep the last four agents (by sorted path) plus the
    # always-first baseline. Keys missing from `agents` are skipped instead of
    # raising KeyError when the starter dataset is not attached.
    debug_keys = sorted(agents.keys())[-4:] + ["../input/santa-2020-starter/always_first_agent.py"]
    agents = { key: agents[key] for key in debug_keys if key in agents }
    display(agents)

# Time budget: keep starting new rounds until (max_notebook - safety_time)
# seconds have elapsed. The deadline is only checked between rounds, so a
# round that is already running is allowed to finish.
safety_time  = 3*60*60
max_notebook = 9*60*60
time_end     = time.perf_counter() + max_notebook - safety_time
results      = []
while time.perf_counter() < time_end:
    # One round = every ordered pair of agents (including self-play),
    # evaluated in parallel on all available cores.
    results += Parallel(n_jobs=-1)(
        delayed(evaluate_mab)(i1, i2, agent1, agent2)
        for i1, agent1 in enumerate(agents.keys())
        for i2, agent2 in enumerate(agents.keys())
    )
    # Interactive sessions run a single round; every other run type keeps
    # looping until the time budget is used up.
    if run_type == 'Interactive': break

# results

In [None]:
def df_half(df, ratio=0.5):
    """Return the top-left corner of a DataFrame.

    The first int(len(df) * ratio) rows are kept, together with the same
    number of leading columns.

    Args:
        df: DataFrame to crop.
        ratio: fraction of the row count to keep (default one half).

    Returns:
        The cropped DataFrame.
    """
    keep = int(ratio * len(df))
    top_rows = df[:keep]
    leading_columns = df.columns[:keep]
    return top_rows[leading_columns]

def df_sort(df, by=None, ascending=False):
    """Re-order both the rows and the columns of df by a column-mean ranking.

    The ranking is the column means of `by` (or of `df` itself when `by` is
    None), sorted according to `ascending`. The resulting label order is
    applied to the index first and then to the columns.

    Args:
        df: DataFrame to re-order.
        by: optional DataFrame whose column means define the order.
        ascending: sort direction of the means (default: largest first).

    Returns:
        The re-indexed DataFrame.
    """
    reference = by if by is not None else df
    order = reference.mean().sort_values(ascending=ascending).index
    return df.reindex(order, axis=0).reindex(order, axis=1)

def winrate_score(score1, score2):
    """Score a single match from the first player's point of view.

    Args:
        score1: score of the first player, or None.
        score2: score of the second player, or None.

    Returns:
         1 if score1 beats score2 (or only score2 is None),
        -1 if score1 loses to score2 (or only score1 is None),
         0 for equal scores (including both None), for scores that cannot be
           ordered (e.g. NaN), and for scores that cannot be compared at all.
    """
    try:
        if score1 == score2: return  0
        if score1 is None:   return -1
        if score2 is None:   return  1
        if score1 >  score2: return  1
        if score1 <  score2: return -1
    except (TypeError, ValueError):
        # Incomparable scores (e.g. mixed types) are treated as a draw.
        # Only comparison errors are swallowed, so KeyboardInterrupt and
        # unrelated bugs are no longer hidden by a bare except.
        return 0
    return 0
    

# Display labels in agent order: row/column i of every matrix below belongs
# to agent_labels[i], matching the indices produced by the evaluation loop.
agent_labels = list(agents.values())
n_agents     = len(agent_labels)

# Per-agent list of every individual match score (label -> [score, ...]).
scores_agent = defaultdict(list)
# The np.int / np.float aliases were removed in NumPy 1.24; the builtin
# int / float select the same dtypes.
scores_total = np.zeros(( n_agents, n_agents ), dtype=int)
scores_diff  = np.zeros(( n_agents, n_agents ), dtype=float)
winrates     = np.zeros(( n_agents, n_agents ), dtype=int)

for (i1, i2, result) in results:
    # A missing (None) score counts as 0 in the numeric matrices.
    score1 = (result[0] or 0)
    score2 = (result[1] or 0)
    scores_total[i1,i2] += score1
    scores_total[i2,i1] += score2
    scores_diff[i1,i2]  += score1 - score2
    scores_diff[i2,i1]  += score2 - score1
    # winrate_score gets the raw values so that None is handled there.
    winrates[i1,i2]     += winrate_score(result[0], result[1])
    winrates[i2,i1]     += winrate_score(result[1], result[0])
    scores_agent[ agent_labels[i1] ].append( result[0] )
    scores_agent[ agent_labels[i2] ].append( result[1] )

df_scores_total = pd.DataFrame(
    scores_total,
    index   = agent_labels,
    columns = agent_labels,
)
df_scores_diff = pd.DataFrame(
    scores_diff,
    index   = agent_labels,
    columns = agent_labels,
)
df_winrates = pd.DataFrame(
    winrates,
    index   = agent_labels,
    columns = agent_labels,
)
df_scores_agent = pd.DataFrame(scores_agent)

# Sort by mean score
df_scores_total = df_sort(df_scores_total, by=df_scores_agent)
df_scores_diff  = df_sort(df_scores_diff,  by=df_scores_agent)
df_winrates     = df_sort(df_winrates,     by=df_scores_agent)
df_scores_agent = df_scores_agent.reindex( df_scores_agent.mean().sort_values(ascending=False).index, axis=1)

display(df_scores_agent.T)
display(df_scores_agent.T.mean(axis=1))

In [None]:
def plot_df_heatmap(df, title, doublelabel=False, **kwargs):
    """Draw df as an annotated heatmap and print its row means.

    The figure gets one inch per cell. After plotting, the title and the row
    means (sorted from largest to smallest) are printed.

    Args:
        df: DataFrame to plot.
        title: figure title, also printed above the row means.
        doublelabel: when True, tick labels are also shown on the top and
            right edges.
        **kwargs: extra keyword arguments forwarded to seaborn.heatmap.
    """
    n_rows, n_cols = df.shape
    # Tick labels grow with the number of rows but never drop below 10.
    tick_fontsize = max(10, n_rows // 1.5)

    plt.figure(figsize=(n_cols, n_rows))
    plt.title(title)
    sns.heatmap(
        df,
        annot=True,
        fmt='.0f',
        cmap='coolwarm',
        cbar=False,
        linewidths=1,
        linecolor='black',
        **kwargs
    )
    plt.tick_params(labeltop=doublelabel, labelright=doublelabel)
    plt.xticks(rotation=90, fontsize=tick_fontsize)
    plt.yticks(rotation=0,  fontsize=tick_fontsize)

    print(title)
    row_means = df.mean(axis=1).sort_values(ascending=False)
    print(row_means)

# Scores vs Always First Agent

A good baseline metric is to compare scores against the Always First Agent. 

This shows how well an agent can exploit its environment in the absence of any competition.

The score for the Always First Agent also gives an indication of the average maximum payout from a single bandit. 

In [None]:
# Single-column frame: the total-score entries in the always-first baseline's column.
df_scores_vs_always_first = df_scores_total.loc[:, ["starter \n always_first_agent"]]
# Per-agent value (row mean over that single column), highest first.
df_scores_vs_always_first.mean(axis=1).sort_values(ascending=False)

# Total Scores

This shows the total score of each agent, showing how many resources it can extract before the opponent.

In [None]:
# Heatmap of the accumulated total-score matrix for all agents.
plot_df_heatmap(df_scores_total, 'Total Scores')

In [None]:
# Zoomed view: df_half keeps the top-left half of the matrix, then df_sort re-orders it by its own column means (ascending).
plot_df_heatmap(df_sort(df_half(df_scores_total), ascending=True), 'Total Scores', doublelabel=False)

# Relative Scores

Winning this game doesn't actually depend on getting a high total score, but simply on getting a higher score than your opponent.

In [None]:
# Heatmap of the accumulated score differences (own score minus opponent score) for all agents.
plot_df_heatmap(df_scores_diff, 'Relative Scores')

In [None]:
# Zoomed view: top-left half of the score-difference matrix, re-ordered by its own column means (ascending).
plot_df_heatmap(df_sort(df_half(df_scores_diff), ascending=True), 'Relative Scores', doublelabel=False)

# Winrates

Ultimately you only need to score one point higher than your opponent, so this plot shows how the leaderboard would actually respond to agents.

In [None]:
# Heatmap of the accumulated winrate_score values (+1 win, -1 loss, 0 draw per match), re-ordered by column means (ascending).
plot_df_heatmap(df_sort(df_winrates, ascending=True), 'Winrates')

In [None]:
# Zoomed view: top-left half of the winrate matrix, re-ordered by its own column means (ascending); doublelabel=True also labels the top and right edges.
plot_df_heatmap(df_sort(df_half(df_winrates), ascending=True), 'Winrates', doublelabel=True)

# Boxplots

In [None]:
def batch(iterable, n=1):
    """Yield consecutive slices of at most n items from a sized, sliceable sequence.

    Args:
        iterable: object supporting len() and slicing (list, str, Index, ...).
        n: maximum number of items per slice.

    Yields:
        iterable[start:stop] for start = 0, n, 2n, ...; the last slice may be
        shorter than n.
    """
    total = len(iterable)
    yield from (
        iterable[start:min(start + n, total)]
        for start in range(0, total, n)
    )

def plot_df_boxplot(df, title, columns=8, boxplot_args=None, stripplot_args=None):
    """Draw box plots with overlaid strip plots for every column of df.

    The columns are spread over one or more figures so that no figure holds
    more than `columns` boxes and the figures are evenly filled.

    Args:
        df: DataFrame with one column per box.
        title: title placed on every figure.
        columns: maximum number of boxes per figure.
        boxplot_args: optional dict of extra keyword arguments for
            seaborn.boxplot. As before, these are also forwarded to
            seaborn.stripplot.
        stripplot_args: optional dict of extra keyword arguments for
            seaborn.stripplot; they override the defaults (jitter=0.25,
            size=5, facecolor='white') and anything in boxplot_args.
            Previously this argument was overwritten before use and
            therefore ignored.

    Returns:
        None. Nothing is drawn when df has no columns.
    """
    # None defaults (copied into fresh dicts) avoid sharing a mutable default
    # dict between calls and never mutate the caller's dicts.
    boxplot_args   = dict(boxplot_args or {})
    stripplot_args = dict(stripplot_args or {})

    n_total = len(df.columns)
    if n_total == 0:
        # Nothing to plot; also avoids dividing by n_rows == 0 below.
        return

    n_rows    = math.ceil( n_total / columns )
    n_columns = math.ceil( n_total / n_rows  )

    # Loop-invariant keyword arguments for the strip plot.
    strip_kwargs = { "jitter": 0.25, "size": 5, "facecolor": 'white', **boxplot_args, **stripplot_args }

    for cols in batch(df.columns, n_columns):
        df_part = df[cols]
        plt.figure(figsize=(n_columns*2, 5))
        plt.title(title, loc="center")

        ax = sns.boxplot(data=df_part, **boxplot_args)
        # Restyle the box artists and lines as grey outlines on white so the
        # individual points stay readable.
        plt.setp(ax.artists, edgecolor='grey', facecolor='w')
        plt.setp(ax.lines, color='grey')

        ax = sns.stripplot(data=df_part, **strip_kwargs)

        plt.xticks(rotation=90, fontsize=15)
        plt.yticks(rotation=0,  fontsize=15)

In [None]:
# Box + strip plots of every individual match score, one box per agent.
plot_df_boxplot(df_scores_agent, "All Matchmaking Scores")