# Performance Visualisation

How can we better understand _why_ an agent is failing (or succeeding)?

Why is an agent losing?
- incorrect probability predictions?
- not quick enough to act on them? i.e. opponent stealing?
- not enough exploring?

Following are some of the visualisations I used to diagnose the performance of my agents throughout the competition.

In [None]:
from kaggle_environments import make, evaluate

# Viz
import seaborn as sns
import matplotlib.pyplot as plt

# Data
import pandas as pd
import numpy as np

import random

# Files
from os import listdir
from os.path import isfile, join
import json


## Test Agents

In [None]:
%%writefile mab.py

# from https://www.kaggle.com/demetrypascal/simple-multi-armed-bandit-ga

import numpy as np

# Per-episode state shared between calls of the agent.
bandit_state = None   # one [wins, losses] pair of Beta parameters per bandit
total_reward = 0      # cumulative reward seen at the previous call
last_step = None      # action returned by the previous call


def multi_armed_bandit_agent(observation, configuration):
    """Multi Armed Bandit Agent

    Keeps a [wins, losses] pair per bandit, samples from the matching Beta
    distribution for every bandit and plays the bandit with the largest
    sample.

    All per-episode state (including the running reward total) is reset
    when ``observation.step == 0`` so the module can be reused for several
    games in the same process.

    Args:
        observation: object exposing ``step``, ``reward`` and ``lastActions``
        configuration: object exposing ``banditCount``

    Returns:
        (int) index of the bandit to pull
    """
    global bandit_state, total_reward, last_step

    # Weight of a single observation; tune exploration / exploitation here.
    step = 1
    # How much the win count is decayed every time a bandit is pulled.
    decay_rate = 0.97

    if observation.step == 0:
        # Fresh episode: flat prior and no reward collected yet.
        bandit_state = [[1, 1] for _ in range(configuration.banditCount)]
        total_reward = 0
        last_step = None
    else:
        # Reward earned by the previous pull.
        last_reward = observation.reward - total_reward
        total_reward = observation.reward

        # Work out whether we are player 0 or 1 from our own last action.
        player = int(last_step == observation.lastActions[1])
        my_action = observation.lastActions[player]

        if last_reward > 0:
            bandit_state[my_action][0] += step
        else:
            bandit_state[my_action][1] += step

        # Every pull (ours and the opponent's) decays that bandit's wins.
        # If both players pulled the same bandit it is decayed twice.
        for pulled in (observation.lastActions[0], observation.lastActions[1]):
            bandit_state[pulled][0] = (bandit_state[pulled][0] - 1) * decay_rate + 1

    # Draw one Beta sample per bandit and keep the luckiest one.
    best_proba = -1
    best_agent = None
    for k in range(configuration.banditCount):
        proba = np.random.beta(bandit_state[k][0], bandit_state[k][1])
        if proba > best_proba:
            best_proba = proba
            best_agent = k

    last_step = best_agent
    return best_agent

In [None]:
%%writefile vegas.py

# https://www.kaggle.com/jyesawtellrickson/pvsmawrc-commented-explained

import numpy as np
import pandas as pd
import random, os, datetime, math
from collections import defaultdict

# Cumulative reward seen so far and per-bandit statistics; both are
# (re)initialised by the agent at step 0.
total_reward = 0
bandit_dict = {}


def get_next_bandit():
    """Get Next Bandit

    Score every bandit in ``bandit_dict`` and return the key with the
    highest strictly positive score (the first one wins on ties). Returns
    0 when the dict is empty or no score is above zero.

    The score is

        (wins - losses + opp - 1.5 * [opp > 0] + op_continue) / pulls
            * 0.97 ** pulls

    where ``pulls = wins + losses + opp``.
    """
    top_key = 0
    top_score = 0

    for key, stats in bandit_dict.items():
        wins = stats['win']
        losses = stats['loss']
        opp_pulls = stats['opp']
        opp_streak = stats['op_continue']

        # Total number of pulls this bandit has received from either player.
        pulls = wins + losses + opp_pulls

        # Flat penalty once the opponent has pulled this bandit at least once.
        opp_penalty = 1.5 if opp_pulls > 0 else 0

        raw = wins - losses + opp_pulls - opp_penalty + opp_streak
        score = raw / pulls * math.pow(0.97, pulls)

        if score > top_score:
            top_score, top_key = score, key

    return top_key

# Action histories for the current episode (cleared at step 0).
my_action_list = []
op_action_list = []

op_continue_cnt_dict = defaultdict(int)


def multi_armed_probabilities(observation, configuration):
    """Multi Armed Probabilities

    Track the moves and rewards of the game, as well as the repeated actions
    by players.

    Logic:
     1. If you have a successful pull, do that again. (over-exploit)
     2. If you've drawn something 3 times in a row, redo that 50% of the time
     3. Else choose agent based on best estimate of returns

    All per-episode state, including the two action histories, is reset at
    step 0 so the histories do not keep growing when the module is reused
    for several games.

    Args:
        observation: mapping with 'step', 'reward', 'agentIndex', 'lastActions'
        configuration: mapping with 'banditCount'

    Returns:
        (int) index of the bandit to pull
    """
    global total_reward, bandit_dict

    # Random default; it is the action actually played at step 0.
    my_pull = random.randrange(configuration['banditCount'])

    if observation['step'] == 0:
        # Start of an episode: wipe everything learnt in a previous game.
        total_reward = 0
        bandit_dict = {
            i: {'win': 1, 'loss': 0, 'opp': 0, 'my_continue': 0, 'op_continue': 0}
            for i in range(configuration['banditCount'])
        }
        my_action_list.clear()
        op_action_list.clear()
        return my_pull

    last_reward = observation['reward'] - total_reward
    total_reward = observation['reward']

    my_idx = observation['agentIndex']
    my_last_action = observation['lastActions'][my_idx]
    op_last_action = observation['lastActions'][1 - my_idx]

    my_action_list.append(my_last_action)
    op_action_list.append(op_last_action)

    if last_reward > 0:
        bandit_dict[my_last_action]['win'] += 1
    else:
        bandit_dict[my_last_action]['loss'] += 1
    bandit_dict[op_last_action]['opp'] += 1

    # Track streaks: count repeated pulls, reset the counter otherwise.
    if observation['step'] >= 3:
        if my_action_list[-1] == my_action_list[-2]:
            bandit_dict[my_last_action]['my_continue'] += 1
        else:
            bandit_dict[my_last_action]['my_continue'] = 0
        if op_action_list[-1] == op_action_list[-2]:
            bandit_dict[op_last_action]['op_continue'] += 1
        else:
            bandit_dict[op_last_action]['op_continue'] = 0

    if last_reward > 0:
        # A successful pull is repeated straight away. This probably breaks
        # various follow agents since it's not 'bayesian'.
        my_pull = my_last_action
    elif observation['step'] >= 4 \
            and my_action_list[-1] == my_action_list[-2] \
            and my_action_list[-1] == my_action_list[-3] \
            and random.random() < 0.5:
        # Three identical pulls in a row: repeat once more half of the time.
        my_pull = my_action_list[-1]
    else:
        # Otherwise fall back to the scoring heuristic.
        my_pull = get_next_bandit()

    return my_pull

## Move History
At each timestep, plot the 100 bandits' probabilities and the moves that were played. This is good for seeing the general behaviour of your agents. You can see things like phases of exploration, where an agent expands out to machines with lower thresholds. You can also observe behaviours such as following in detail.

We can observe local games (played live), or analyse downloaded games.

In [None]:
def plot_history(mygame, agent1_color='black', agent2_color='cyan'):
    """Plot moves

    Plot the moves that agents make across a game, on top of a heatmap of
    the bandit thresholds at every step.

    Bandits are ordered by their threshold at the first step (lowest at the
    top). Every bandit takes two heatmap rows so that the two agents can be
    drawn on separate lines: agent 1 on the first, agent 2 on the second.

    The number of steps and bandits is taken from ``mygame`` itself.

    Args:
        mygame: list of steps; ``mygame[i][p]`` is the state of player ``p``
            at step ``i`` and provides 'action', and for player 0 also
            ['observation']['thresholds']
        agent1_color: marker colour for player 0 (default black)
        agent2_color: marker colour for player 1 (default cyan)
    """
    n_steps = len(mygame)

    # thresholds[i, b] is the threshold of bandit b at step i.
    thresholds = np.array(
        [step[0]['observation']['thresholds'] for step in mygame]
    )

    # Order bandits by initial threshold; a stable sort keeps ties in
    # bandit-index order.
    order = np.argsort(thresholds[0], kind='stable')
    row_of_bandit = {int(bandit): row for row, bandit in enumerate(order)}

    # One heatmap row per (bandit, agent): duplicate every bandit's row.
    grid = np.repeat(thresholds.T[order], 2, axis=0)

    sns.set()
    cmap = sns.light_palette("red", as_cmap=True)
    sns.heatmap(grid, vmin=20, vmax=100, cmap=cmap)

    fig = plt.gcf()
    ax = plt.gca()

    fig.set_size_inches(12, 10)
    ax.set_ylabel('Bandit')
    ax.set_xlabel('Move')
    # Row order is by threshold, so bandit tick labels would be meaningless.
    ax.get_yaxis().set_visible(False)

    moves = range(n_steps)

    # Player 1 goes on the second row of each bandit.
    agent2_moves = [step[1]['action'] for step in mygame]
    ax.scatter(moves, [row_of_bandit[m] * 2 + 1 for m in agent2_moves],
               color=agent2_color, label='moves', s=3)

    # Player 0 goes on the first row of each bandit.
    agent1_moves = [step[0]['action'] for step in mygame]
    ax.scatter(moves, [row_of_bandit[m] * 2 for m in agent1_moves],
               color=agent1_color, label='moves', s=3)

    plt.show()
    
    
def plot_history_agents(agent1, agent2):
    """Plot History Agents

    Play one local "mab" game between two saved agents and plot its move
    history.

    Args:
        agent1: (str) location of agent 1
        agent2: (str) location of agent 2
    """
    # Fresh environment for a single episode.
    environment = make("mab", debug=True)
    environment.reset()

    steps = environment.run([agent1, agent2])
    plot_history(steps)
    
def plot_history_online(fname):
    """Plot History Online

    Load a downloaded episode from a JSON file and plot its move history.

    Args:
        fname: (str) path of the episode JSON; its 'steps' entry is plotted
    """
    with open(fname) as handle:
        episode = json.load(handle)

    steps = episode['steps']
    plot_history(steps)
    

In [None]:
# Plot a local game
p1 = "mab.py"
p2 = "vegas.py"

plot_history_agents(p1, p2)

In [None]:
# Plot an online game
plot_history_online("../input/santa-2020-top-agents-dataset/episode/11933970.json")

## Threshold Predictions

Here we can look at the predicted thresholds of an agent and see how they vary from the actual thresholds.

To get this to work, the agent must:
- be a class
- have implemented method get_predictions() which returns an array with the predictions for each  bandit (100,)


In the example below, we see that it tends to overestimate the thresholds. This tells us our model needs some adjustments to better understand the actual thresholds (e.g. more decay!). It might also be that accurate predictions of the thresholds aren't required, as we've seen in some public notebooks.

In [None]:
def run_game(agent, opponent):
    """Run Game

    Play one "mab" episode with ``agent`` as the trained player and record,
    for every step, the agent's threshold predictions next to the real
    thresholds.

    Agent must have implemented method get_predictions() which returns an
    array with the predictions for each bandit, shape (banditCount,).

    Args:
        agent: (obj) agent 1, called as ``agent(observation, configuration)``
        opponent: (str) location of agent 2

    Returns:
        (preds, thresholds, actions, opponent_actions)
        preds and thresholds have one entry per step, starting with the
        initial state (preds starts with a flat 0.5 placeholder row);
        actions and opponent_actions have one entry per move played.
    """
    env = make("mab", debug=True)

    trainer = env.train([None, opponent])
    observation = trainer.reset()
    configuration = env.configuration

    # Placeholder prediction for the initial state, before the agent acted.
    preds = [[0.5] * configuration.banditCount]
    actions = []
    opponent_actions = []
    thresholds = [observation.thresholds]

    done = False
    while not done:
        action = agent(observation, configuration)
        # Predictions are read right after the agent updated itself.
        preds.append(agent.get_predictions())
        actions.append(action)

        observation, _reward, done, _info = trainer.step(action)
        thresholds.append(observation.thresholds)
        opponent_actions.append(
            observation.lastActions[1 - observation.agentIndex]
        )

    # Sanity checks: one row per step of the episode.
    assert len(thresholds) == len(preds)
    assert len(preds) == configuration.episodeSteps

    return preds, thresholds, actions, opponent_actions
    

def plot_threshold_vars(agent, opponent, diffs=True):
    """Plot Threshold Deviations

    Run a game and show a heatmap, per bandit and per step, of either the
    agent's prediction error (prediction - actual threshold) or the raw
    predictions. The moves of both players are drawn over the top.

    Bandits are ordered by the plotted value at the first step, largest at
    the top. Each bandit occupies one heatmap row; the agent's moves are
    drawn in the upper half of that row (black) and the opponent's moves in
    the lower half (cyan).

    Args:
        agent: (obj) agent 1, must implement get_predictions()
        opponent: agent 2, passed through to run_game
        diffs: (bool) if diffs = True, plot the deviation from the
               actual threshold, otherwise plot the predicted threshold
    """
    preds, thresholds, actions, opponent_actions = run_game(agent, opponent)

    # Predictions are fractions; thresholds are on a 0-100 scale.
    preds = np.array(preds) * 100
    thresholds = np.array(thresholds)
    values = preds - thresholds if diffs else preds

    # Descending by the first-step value; stable so ties keep index order.
    order = np.argsort(-values[0], kind='stable')
    row_of_bandit = {int(bandit): row for row, bandit in enumerate(order)}

    # Rows are bandits, columns are steps. Only real data goes into the
    # heatmap (no helper index column).
    grid = values.T[order]

    sns.set()
    if diffs:
        sns.heatmap(grid, vmin=-50, vmax=50, cmap="PiYG")
    else:
        sns.heatmap(grid, vmin=0, vmax=100, cmap="PiYG")

    fig = plt.gcf()
    ax = plt.gca()

    fig.set_size_inches(12, 10)
    ax.set_ylabel('Bandit')
    ax.set_xlabel('Move')

    # The move played from state i leads to state i + 1.
    moves = range(1, len(actions) + 1)

    # Heatmap row r spans y in [r, r + 1]; keep both markers inside the row
    # of the bandit that was pulled.
    ax.scatter(moves, [row_of_bandit[m] + 0.75 for m in opponent_actions],
               color='cyan', label='moves', s=10)
    ax.scatter(moves, [row_of_bandit[m] + 0.25 for m in actions],
               color='black', label='moves', s=10)

    # Agent 1 = black
    # Opponent = cyan

    plt.show()

In [None]:
# %%writefile holmes.py

import numpy as np
from scipy.stats import beta
from random import choice, shuffle, random
from collections import Counter

# Helper functions
class HistoryCollectingAgent():
    """Base agent that records the history of the current episode.

    On every call it stores the configuration, appends the last action of
    both players and the reward earned by the last pull, then delegates the
    decision to ``action()``.

    An observation with empty ``lastActions`` marks the start of an
    episode; the history is reset there so that one instance can play
    several games in a row.
    """

    def __init__(self):
        self.my_choices = []        # our action at every past step
        self.my_rewards = []        # reward earned by each of our actions
        self.opponent_choices = []  # opponent action at every past step
        self.configuration = None

    def __call__(self, obs, conf):
        """Record the outcome of the previous step and pick the next action.

        Args:
            obs: observation exposing lastActions, agentIndex and reward
            conf: game configuration, stored on the instance

        Returns:
            whatever ``self.action(obs, conf)`` returns
        """
        self.configuration = conf
        if obs.lastActions:
            self.my_choices.append(obs.lastActions[obs.agentIndex])
            self.opponent_choices.append(obs.lastActions[1 - obs.agentIndex])
            # obs.reward is cumulative: subtract what was already recorded.
            self.my_rewards.append(obs.reward - sum(self.my_rewards))
        else:
            # First step of an episode: forget any previous game. New lists
            # are assigned so earlier references are not wiped.
            self.my_choices = []
            self.my_rewards = []
            self.opponent_choices = []
        return self.action(obs, conf)

    # abstract method to be implemented in inheriting classes
    def action(self, obs, conf): raise NotImplementedError()


class CCHolmes(HistoryCollectingAgent):
    """Bayesian bandit agent with an exploration bonus.

    Keeps a Beta(post_a, post_b) posterior per bandit, turns it into a
    decayed threshold estimate (see get_predictions) and plays a bandit
    that maximises the estimate plus ``c`` posterior standard deviations.
    """

    def __init__(self):
        # Collect history
        super().__init__()

        # Problem Info
        self.decay_rate = 0.97

        # Bayesian stuff (arrays with one entry per bandit, created on the
        # first call of update_internals)
        self.post_a = None
        self.post_b = None

        # Behaviour monitoring
        self.opponent_type = None
        self.opponent_type_cd = 0
        self.choice_types = []
        self.opponent_types = []
        self.opponent_post_ab = None
        self.debug = False

        # Tuning constants
        self.c = 1.5       # weight of the posterior std in the bound
        self.opt_fact = 0

    def get_predictions(self):
        """
        Bayesian agent
        we estimate:
            P(success) = a / (a+b)

        with decay we know:
            P(success, t) = P(success, t-1) * 0.97 if we drew at t-1

        The estimate is decayed by ``decay_rate ** post_a``.

        We should be more abusive of high success rates, we can dial down the
        strength of the ucb to do this, or manually override.

        Returns:
            array with one predicted success probability per bandit, or
            None when the posteriors have not been initialised yet (or are
            empty)
        """
        if self.post_a is None or len(self.post_a) == 0:
            return None

        # Posterior mean of the success probability.
        threshold_pred = self.post_a / (self.post_a + self.post_b).astype(float)

        # Perform decay
        threshold_pred *= self.decay_rate ** self.post_a

        return threshold_pred

    @staticmethod
    def get_maxes(l):
        """Return ``(max_value, indices)`` of all entries equal to the max."""
        max_val = max(l)
        maxes = [i for i, v in enumerate(l) if v >= max_val]
        return max_val, maxes

    def decision_logic(self, d):
        """Pick a bandit with the highest score, breaking ties at random.

        NOTE(review): the previous version returned ``choice(range(100))``
        before looking at the scores, which made the agent play uniformly
        at random and left the code below unreachable. That looked like a
        leftover debugging line; confirm the random play was not wanted.

        Args:
            d: sequence with one score per bandit

        Returns:
            index of the chosen bandit
        """
        (max_val, maxes) = self.get_maxes(d)
        if len(maxes) == 0:
            # No entry compared >= max (e.g. NaN scores): fixed fallback.
            return 1
        return choice(maxes)

    def update_internals(self):
        """Update Internal Model

        Use Bayesian approach.

        B(a+1, b+1)
        a = num wins
        b = num losses

        With no recorded history the posteriors are (re)initialised to
        ones; otherwise the last reward updates the posterior of the bandit
        we pulled and the opponent's pull is counted on the bandit the
        opponent chose.
        """
        if len(self.my_choices) == 0:
            self.post_a = np.ones(self.configuration.banditCount)
            self.post_b = np.ones(self.configuration.banditCount)
            self.opponent_post_ab = np.ones(self.configuration.banditCount)
        else:
            # get the latest reward
            r = self.my_rewards[-1]
            c = self.my_choices[-1]
            # update distribution
            self.post_a[c] += r
            self.post_b[c] += (1 - r)
            # Opponent pulls are counted on the opponent's bandit, not ours.
            self.opponent_post_ab[self.opponent_choices[-1]] += 1

    def action(self, observation, configuration):
        """Choose the next bandit.

        Updates the posteriors from the recorded history, then scores each
        bandit by its predicted threshold plus ``c`` posterior standard
        deviations and hands the scores to decision_logic.

        Returns:
            (int) index of the bandit to pull
        """
        # Update any internals
        self.update_internals()

        # Estimate the thresholds
        threshold_pred = self.get_predictions()

        # Add in the bound (exploration)
        bound = threshold_pred \
            + beta.std(self.post_a, self.post_b) * self.c

        bandit = int(self.decision_logic(bound))

        return bandit
    
    
# Single shared agent instance; do_action forwards every call to it.
agent = CCHolmes()


def do_action(observation, configuration):
    """Function-style entry point that delegates to the shared ``agent``."""
    bandit = agent(observation, configuration)
    return bandit


In [None]:
plot_threshold_vars(agent, opponent='vegas.py')

In [None]:
plot_threshold_vars(agent, opponent=agent, diffs=False)

## Online Match Score Review

Some agents might be stronger in the early game, and some stronger in the late game. It's instructive to check the rewards over time to see if you're lacking in a particular area.

In [None]:
class Game():
    """One downloaded episode, loaded from a JSON file.

    After process_game() the instance holds, for every step except the
    first entry of 'steps': the moves of both players, the cumulative and
    per-step rewards of both players, and the thresholds reported in
    player 0's observation.
    """

    def __init__(self, fname=''):
        self.fname = fname
        self.move_history = []          # [action_p0, action_p1] per step
        self.reward_history = []        # per-step reward of both players
        self.threshold_history = []     # thresholds per step
        self.total_reward_history = []  # cumulative reward of both players
        self.teams = []

    def process_game(self):
        """Read the episode file and fill in the history attributes.

        Asserts that the number of recorded steps equals the configured
        'episodeSteps' minus one.
        """
        with open(self.fname) as handle:
            episode = json.load(handle)
        self.teams = episode['info']['TeamNames']

        moves, totals, thresholds = [], [], []
        # The first entry of 'steps' is skipped.
        for step in episode['steps'][1:]:
            first, second = step[0], step[1]
            moves.append([first['action'], second['action']])
            totals.append([first['reward'], second['reward']])
            thresholds.append(first['observation']['thresholds'])
        self.move_history = moves

        # Rewards are cumulative: difference consecutive totals to get the
        # reward earned at each step (the first total is kept as-is).
        per_step = [totals[0]]
        for before, after in zip(totals, totals[1:]):
            per_step.append([after[0] - before[0], after[1] - before[1]])

        self.reward_history = per_step
        self.threshold_history = thresholds
        self.total_reward_history = totals

        assert len(self.reward_history) == episode['configuration']['episodeSteps']-1
        assert len(self.move_history) == episode['configuration']['episodeSteps']-1

        return

    
def load_all_games(folder='../input/santa-2020-top-agents-dataset/episode/'):
    """Load All Games

    List the game files found directly inside ``folder``.

    Sub-directories and files whose name ends in '_info.json' are skipped.
    The result is sorted so that slicing it (e.g. ``[:lim]``) selects the
    same games on every run; ``os.listdir`` alone gives no ordering
    guarantee.

    Args:
        folder: (str) directory containing the episode files

    Returns:
        sorted list of file paths (folder joined with the file name)
    """
    return sorted(
        join(folder, name)
        for name in listdir(folder)
        if not name.endswith('_info.json') and isfile(join(folder, name))
    )


def plot_reward_history(folder='../input/santa-2020-top-agents-dataset/episode/', diffs_trigger=True, lim=30):
    """Plot Reward History

    Plot, for up to ``lim`` games in ``folder``, the cumulative reward over
    time as thin grey lines, with the mean over all games in red.

    folder name must equal team name: a game is read from player one's
    point of view when its first team name equals ``folder``, otherwise
    from player two's.

    Args:
        folder: (str) directory with the episode files
        diffs_trigger: (bool) if True plot our reward minus the opponent's
            (the mean goes on a secondary y axis), otherwise plot our
            cumulative reward
        lim: (int) maximum number of games to load
    """
    series = []
    for path in load_all_games(folder=folder)[:lim]:
        g = Game(path)
        g.process_game()

        we_are_first = g.teams[0] == folder
        row = []
        for p1, p2 in g.total_reward_history:
            mine, theirs = (p1, p2) if we_are_first else (p2, p1)
            row.append(mine - theirs if diffs_trigger else mine)
        series.append(row)

    if not series:
        # Nothing to plot.
        return

    # Plot each game
    for row in series:
        plt.plot(range(len(row)), row, color='grey', linewidth=0.4)

    ax = plt.gca()

    # The mean needs every game to have the same number of steps.
    mean_curve = np.array(series).mean(axis=0)
    steps = range(len(mean_curve))

    if diffs_trigger:
        # Plot the avg reward on a separate axis
        ax2 = ax.twinx()
        ax2.plot(steps, mean_curve, color='red')
        ax2.set_ylabel('Difference in Reward (mean)')
    else:
        ax.plot(steps, mean_curve, color='red')

    plt.xlim(0, len(mean_curve) + 1)
    ax.set_ylabel('Difference in Reward' if diffs_trigger else 'Reward')
    

In [None]:
plot_reward_history(lim=100)

In [None]:
plot_reward_history(diffs_trigger=False, lim=100)

## Score vs. Board Strength

It's also interesting to consider whether you perform better on certain boards, since not all boards are created equal. Some boards will have higher thresholds to begin with. Agents which favour exploration might be worse off when there are more resources to exploit.

In [None]:
def plot_reward_vs_richness(folder='../input/santa-2020-top-agents-dataset/episode/', lim=30):
    """Plot Reward vs Richness

    Scatter the final reward difference of each game against the average
    threshold of the board at the first recorded step.

    folder name must equal team name: the difference is ours minus the
    opponent's, where we are player one when the first team name equals
    ``folder`` and player two otherwise.

    Games that cannot be loaded or processed are skipped.

    Args:
        folder: (str) directory with the episode files
        lim: (int) maximum number of games to load
    """
    final_diffs = []
    avg_thresholds = []

    for path in load_all_games(folder=folder)[:lim]:
        g = Game(path)
        try:
            g.process_game()
            initial = g.threshold_history[0]
            richness = sum(initial) / len(initial)
            p1, p2 = g.total_reward_history[-1]
            diff = p1 - p2 if g.teams[0] == folder else p2 - p1
        except (AssertionError, IndexError, KeyError, OSError,
                TypeError, ValueError, ZeroDivisionError):
            # bad game (unreadable, truncated or missing fields): skip it
            continue

        # Append both values together so the two lists stay aligned.
        avg_thresholds.append(richness)
        final_diffs.append(diff)

    # One point per game
    plt.scatter(avg_thresholds, final_diffs)

    ax = plt.gca()
    ax.set_xlabel('Average initial threshold')
    ax.set_ylabel('Difference in Reward')
    

In [None]:
plot_reward_vs_richness(lim=200)