<h1> A Beginner's Approach to Performance Analysis</h1>

Welcome to my first notebook! <br><br> As a beginner, most of the discussions and the math went right over my head. I found [this wonderful notebook](https://www.kaggle.com/sirishks/pull-vegas-slot-machines) by [sirishks](https://www.kaggle.com/sirishks), with a fairly easy-to-understand strategy. I made a few agents based off of this, but wanted a way to properly compare. <br> <br> <br> Keeping in mind that the main goal of our agent is to pick the vending machine with the highest chances of giving us a candy, I tried to graph the chances chosen by our agents with the maximum chance possible at that step. <br><br>
**Changes in the V3**:

* Now we have a subplot comparing the accuracy of the thresholds expected by the agents.
* We run a match against an agent returning only 0, so we have a better idea on how accurately our agents can predict thresholds without an opponent interfering.

In [None]:
!pip install kaggle-environments --upgrade -q

from kaggle_environments import make, evaluate
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Single "mab" environment, created with debug=True; every match in the
# cells below is run on it via env.run(...).
env = make("mab", debug=True)


In [None]:
def divide_steps(game, ind, n_steps=2000, top=30, mid=60):
    """Bucket the steps of a game by how good player ``ind``'s pick was.

    For step ``i`` the true threshold of the bandit chosen by player ``ind``
    (``game[i][0].observation.thresholds[game[i][ind].action]``) is compared
    with that step's thresholds sorted in ascending order:

    * green: at least the ``top``-th largest threshold,
    * blue:  otherwise at least the ``mid``-th largest threshold,
    * red:   everything else.

    Args:
        game: Sequence of steps; ``game[i][0].observation.thresholds`` holds
            the thresholds at step ``i`` and ``game[i][ind].action`` the
            bandit picked by player ``ind``.
        ind: Index of the player whose picks are classified.
        n_steps: Maximum number of leading steps to classify. Clamped to
            ``len(game)`` so shorter games no longer raise ``IndexError``.
        top: Rank cut-off for the green bucket.
        mid: Rank cut-off for the blue bucket.

    Returns:
        ``(red_steps, blue_steps, green_steps)``, three lists of step
        indices. Note the order: red first, green last.
    """
    red_steps, blue_steps, green_steps = [], [], []
    for i in range(min(n_steps, len(game))):
        thresholds = game[i][0].observation.thresholds
        pres_prob = thresholds[game[i][ind].action]
        sorted_thres = np.sort(thresholds)
        # Clamp the cut-offs so games with fewer bandits than ``mid`` or
        # ``top`` fall back to the smallest threshold instead of failing.
        green_cut = sorted_thres[-min(top, len(sorted_thres))]
        blue_cut = sorted_thres[-min(mid, len(sorted_thres))]
        if pres_prob >= green_cut:
            green_steps.append(i)
        elif pres_prob >= blue_cut:
            blue_steps.append(i)
        else:
            red_steps.append(i)
    return red_steps, blue_steps, green_steps

<h2 style='background:#FBE338; border:0; color:black'><center>Random Agent</center></h2>



In [None]:
%%writefile random.py

import random

def agent(obs, conf):
    """Baseline agent: pull a uniformly random bandit.

    ``obs`` is ignored; ``conf.banditCount`` is the number of bandits the
    index is drawn from.
    """
    n_bandits = conf.banditCount
    return random.randrange(n_bandits)

<h2 style='background:#FBE338; border:0; color:black'><center>Return 0 Agent</center></h2>



In [None]:
%%writefile return0.py

import random

def agent(obs, conf):
    """Constant agent: always pull bandit 0, ignoring both arguments."""
    fixed_bandit = 0
    return fixed_bandit

<h2 style='background:#FBE338; border:0; color:black'><center>Vegas Slot Pulls Agent</center></h2>
    
Copied from [here.](https://www.kaggle.com/sirishks/pull-vegas-slot-machines)

In [None]:
%%writefile vegas_pull.py

import numpy as np
import pandas as pd
import random, os, datetime

# Per-episode state shared by the functions below.
total_reward = 0  # cumulative reward reported by the previous observation
bandit_dict = {}  # bandit index -> {'win': ..., 'loss': ..., 'opp': ...} counters
df_probs = []     # expectation (in %) logged by get_next_bandit for each pick
df_steps = []     # step numbers matching df_probs


def set_seed(my_seed=42):
    """Seed Python's and NumPy's global random generators with ``my_seed``.

    The seed is also stored in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(my_seed)
    random.seed(my_seed)
    np.random.seed(my_seed)


def get_next_bandit(step):
    """Return the bandit with the highest expectation and log that value.

    The expectation of a bandit is
    ``(win - loss + opp - 1.5 * (opp > 0)) / (win + loss + opp)``.
    Bandit 0 is returned when no bandit has an expectation above 0.
    As a side effect ``step`` is appended to ``df_steps`` and the winning
    expectation, times 100, to ``df_probs``.
    """
    best_bandit = 0
    best_bandit_expected = 0
    for bnd, stats in bandit_dict.items():
        win, loss, opp = stats['win'], stats['loss'], stats['opp']
        expect = (win - loss + opp - (opp > 0) * 1.5) / (win + loss + opp)
        # Strict comparison: the first bandit reaching the maximum wins ties.
        if expect > best_bandit_expected:
            best_bandit_expected = expect
            best_bandit = bnd
    df_steps.append(step)
    df_probs.append(best_bandit_expected * 100)
    return best_bandit


def multi_armed_probabilities(observation, configuration):
    """Agent entry point: pick the bandit to pull for this step.

    Step 0 reseeds the random generators, resets all per-episode state and
    returns a random bandit (drawn before the reseed). Every later step
    first books the previous round (our win or loss, and the opponent's
    pull) and then returns ``get_next_bandit``'s choice.

    For every step above 1995 the log collected so far is written to
    ``vegas_pull.csv``.

    Args:
        observation: Supports both ``observation['key']`` and
            ``observation.step`` access; the keys read are ``step``,
            ``reward``, ``agentIndex`` and ``lastActions``.
        configuration: ``configuration['banditCount']`` is the number of
            bandits.

    Returns:
        Index of the bandit to pull.
    """
    global total_reward, bandit_dict
    n_bandits = configuration['banditCount']
    my_pull = random.randrange(n_bandits)
    if 0 == observation['step']:
        set_seed()
        total_reward = 0
        bandit_dict = {i: {'win': 1, 'loss': 0, 'opp': 0} for i in range(n_bandits)}
        # The log is per-episode state too. Reset it with the rest so a
        # second episode in the same interpreter starts a fresh log.
        df_steps.clear()
        df_probs.clear()
    else:
        last_reward = observation['reward'] - total_reward
        total_reward = observation['reward']

        my_idx = observation['agentIndex']
        last_actions = observation['lastActions']
        outcome = 'win' if 0 < last_reward else 'loss'
        bandit_dict[last_actions[my_idx]][outcome] += 1
        bandit_dict[last_actions[1 - my_idx]]['opp'] += 1
        my_pull = get_next_bandit(observation.step)
    if observation.step > 1995:
        df = pd.DataFrame(df_probs, index=df_steps, columns=['probs'])
        df.to_csv("vegas_pull.csv")
    return my_pull

In [None]:
agent = "vegas_pull"

# 2x2 grid: the top row is the match against the random agent, the bottom
# row the match against the agent that always returns 0.
fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(16,12))

matchups = (
    ("random.py", "vs Random agent: Chances of getting a candy", ax1, ax2),
    ("return0.py", "vs Return0 agent: Chances of getting a candy", ax3, ax4),
)
for opponent, title, ax_actual, ax_expected in matchups:
    # The agent under test is passed second, so it is player index 1.
    mygame = env.run([opponent, f"{agent}.py"])
    red_steps, blue_steps, green_steps = divide_steps(mygame, 1)

    # Left panel: true threshold of the bandit picked at each step, coloured
    # by the bucket divide_steps put that step in.
    for steps, colour, label in (
        (green_steps, 'green', 'Chosen bandit ranks 1-30th thresold-wise'),
        (blue_steps, 'blue', 'Chosen bandit ranks 31-60th thresold-wise'),
        (red_steps, 'red', 'Chosen bandit ranks 61-100th threshold-wise'),
    ):
        picked = [mygame[i][0].observation.thresholds[mygame[i][1].action] for i in steps]
        ax_actual.scatter(steps, picked, color=colour, label=label)
    best_possible = [max(mygame[i][0].observation.thresholds) for i in range(2000)]
    ax_actual.plot(list(range(2000)), best_possible, 'C1', label='Maximum chances of getting a candy at that step')
    ax_actual.set_title(title)
    ax_actual.legend(loc='upper right')

    # Right panel: the value the agent logged to its CSV for the same steps.
    # Steps that are missing from the CSV index are dropped first.
    df = pd.read_csv(f"{agent}.csv", index_col=[0])
    red_steps = [i for i in red_steps if i in df.index]
    blue_steps = [i for i in blue_steps if i in df.index]
    green_steps = [i for i in green_steps if i in df.index]
    for steps, colour, label in (
        (green_steps, 'green', 'Actual threshold ranks 1-30th'),
        (blue_steps, 'blue', 'Actual threshold ranks 31-60th'),
        (red_steps, 'red', 'Actual threshold ranks 61-100th'),
    ):
        ax_expected.scatter(steps, df.loc[steps, 'probs'], color=colour, label=label)
    ax_expected.set_title("Thresholds as expected by the agent")
    ax_expected.legend(loc='upper right')


As you can see, Vegas Pull doesn't do quite well if the opponent picks a lot of wrong machines because of:

>expect = (bandit_dict[bnd]['win'] - bandit_dict[bnd]['loss'] **+ bandit_dict[bnd]['opp'] - (bandit_dict[bnd]['opp']>0)*1.5)** /(bandit_dict[bnd]['win'] + bandit_dict[bnd]['loss'] + bandit_dict[bnd]['opp'])

But it won't quite matter in real matches I think, if your opponent is making such bad choices, vegas_pull.py is gonna win anyway.

As for normal matches:
In roughly steps 100-500, this agent has one of the best overlaps between the expected and the actual thresholds. Keep in mind that the first few hundred steps are when you want to make the most of the high-threshold bandits, and this agent does the best job at that; no wonder it's at the top of the leaderboard.

<h2 style='background:#FBE338; border:0; color:black'><center>UCB With Decay</center></h2>
    
Copied from [here](https://www.kaggle.com/xhlulu/santa-2020-ucb-and-bayesian-ucb-starter)

In [None]:
%%writefile ucb_decay.py

import numpy as np
import pandas as pd

# Module-level state shared between calls of agent().
decay = 0.97      # factor each newly earned reward is multiplied by before it is accumulated
total_reward = 0  # cumulative reward seen in the previous observation
bandit = None     # bandit pulled on the previous call
df_steps = []     # step numbers logged for the analysis CSV
df_probs = []     # matching "expected %" values


def agent(observation, configuration):
    """UCB agent that also logs how confident it is in each pick.

    On every call the agent credits the reward earned since the last call
    to the bandit it pulled, then returns the bandit with the highest
    ``avg_reward + sqrt(2 * log(step + 1) / n_selections)``.

    The chosen upper bound, as a percentage of the bound an always-winning
    bandit with the same pull count would have, is appended to
    ``df_probs``. For every step above 1995 the log is written to
    ``ucb_decay.csv``.

    Args:
        observation: Provides ``step`` and the cumulative ``reward``.
        configuration: Provides ``banditCount``.

    Returns:
        Index of the bandit to pull, as a Python ``int``.
    """
    global reward_sums, n_selections, total_reward, bandit
    n_bandits = configuration.banditCount

    if observation.step == 0:
        # Tiny non-zero start values keep the divisions below defined.
        n_selections, reward_sums = np.full((2, n_bandits), 1e-32)
        # Reset the remaining per-episode state as well. Otherwise a second
        # episode in the same interpreter would compute its first reward
        # delta against the previous episode's total and keep appending to
        # the previous episode's log.
        total_reward = 0
        df_steps.clear()
        df_probs.clear()
    else:
        reward_sums[bandit] += decay * (observation.reward - total_reward)
        total_reward = observation.reward

    avg_reward = reward_sums / n_selections
    delta_i = np.sqrt(2 * np.log(observation.step + 1) / n_selections)
    upper_bound = avg_reward + delta_i
    bandit = int(np.argmax(upper_bound))

    # Reference value: a bandit with the same pull count that paid out on
    # every pull (average reward 1).
    best_avg_reward = 1
    best_delta_i = np.sqrt(2 * np.log(observation.step + 1) / n_selections[bandit])
    best_chance = best_avg_reward + best_delta_i
    # Author's note: I'm NOT sure of the above math; corrections, if any,
    # would be very welcome. What I understood is that we are creating an
    # 'interval' for how unlucky/lucky the agent could be:
    # [rewards(n)/num_called(n) - root:c*log(n)/num_called(n),
    #  rewards(n)/num_called(n) + root:c*log(n)/num_called(n)]
    # As n increases, delta decreases and the interval gets smaller, hence
    # increasing our 'confidence' in the agent. To convert it into a %, the
    # chosen bandit's upper confidence level is divided by the best possible
    # one at that step for that bandit.
    df_steps.append(observation.step)
    df_probs.append(100 * np.max(upper_bound) / best_chance)

    if observation.step > 1995:
        df = pd.DataFrame(df_probs, index=df_steps, columns=['probs'])
        df.to_csv("ucb_decay.csv")

    n_selections[bandit] += 1

    return bandit

In [None]:
agent = "ucb_decay"

# 2x2 grid: the top row is the match against the random agent, the bottom
# row the match against the agent that always returns 0.
fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(16,12))

matchups = (
    ("random.py", "vs Random agent: Chances of getting a candy", ax1, ax2),
    ("return0.py", "vs Return0 agent: Chances of getting a candy", ax3, ax4),
)
for opponent, title, ax_actual, ax_expected in matchups:
    # The agent under test is passed second, so it is player index 1.
    mygame = env.run([opponent, f"{agent}.py"])
    red_steps, blue_steps, green_steps = divide_steps(mygame, 1)

    # Left panel: true threshold of the bandit picked at each step, coloured
    # by the bucket divide_steps put that step in.
    for steps, colour, label in (
        (green_steps, 'green', 'Chosen bandit ranks 1-30th thresold-wise'),
        (blue_steps, 'blue', 'Chosen bandit ranks 31-60th thresold-wise'),
        (red_steps, 'red', 'Chosen bandit ranks 61-100th threshold-wise'),
    ):
        picked = [mygame[i][0].observation.thresholds[mygame[i][1].action] for i in steps]
        ax_actual.scatter(steps, picked, color=colour, label=label)
    best_possible = [max(mygame[i][0].observation.thresholds) for i in range(2000)]
    ax_actual.plot(list(range(2000)), best_possible, 'C1', label='Maximum chances of getting a candy at that step')
    ax_actual.set_title(title)
    ax_actual.legend(loc='upper right')

    # Right panel: the value the agent logged to its CSV for the same steps.
    # Steps that are missing from the CSV index are dropped first.
    df = pd.read_csv(f"{agent}.csv", index_col=[0])
    red_steps = [i for i in red_steps if i in df.index]
    blue_steps = [i for i in blue_steps if i in df.index]
    green_steps = [i for i in green_steps if i in df.index]
    for steps, colour, label in (
        (green_steps, 'green', 'Actual threshold ranks 1-30th'),
        (blue_steps, 'blue', 'Actual threshold ranks 31-60th'),
        (red_steps, 'red', 'Actual threshold ranks 61-100th'),
    ):
        ax_expected.scatter(steps, df.loc[steps, 'probs'], color=colour, label=label)
    ax_expected.set_title("Thresholds as expected by the agent")
    ax_expected.legend(loc='upper right')


The UCB decay agent seems to overestimate the thresholds by a lot, but at the same time, no other agent so regularly makes as many decisions so close to the best-threshold line. This agent could perform magnitudes better if the number of decisions below 50% were reduced further, which I am very interested in working towards.

<h2 style='background:#FBE338; border:0; color:black'><center>Bayesian UCB</center></h2>
    
Copied from [here](https://www.kaggle.com/xhlulu/santa-2020-ucb-and-bayesian-ucb-starter)

In [None]:
%%writefile bayesian_ucb.py

import numpy as np
from scipy.stats import beta
import pandas as pd

# Module-level state shared between calls of agent().
post_a, post_b, bandit = [None] * 3  # Beta parameters per bandit, and the last pull
total_reward = 0                     # cumulative reward seen in the previous observation
c = 3                                # number of standard deviations added to the mean
df_steps = []                        # step numbers logged for the analysis CSV
df_probs = []                        # matching "expected %" values


def agent(observation, configuration):
    """Bayesian UCB agent that also logs how confident it is in each pick.

    Each bandit has a Beta(``post_a``, ``post_b``) posterior. On every call
    the reward earned since the last call updates the posterior of the
    bandit pulled last, and the bandit with the highest
    ``mean + c * std`` of its posterior is returned.

    The chosen bound, as a percentage of the bound an "ideal" bandit would
    have, is appended to ``df_probs``. For every step above 1995 the log is
    written to ``bayesian_ucb.csv``.

    Args:
        observation: Provides ``step`` and the cumulative ``reward``.
        configuration: Provides ``banditCount``.

    Returns:
        Index of the bandit to pull, as a Python ``int``.
    """
    global total_reward, bandit, post_a, post_b
    if observation.step == 0:
        post_a, post_b = np.ones((2, configuration.banditCount))
        # Reset the remaining per-episode state as well. With a stale
        # total_reward the first reward delta of a new episode can be
        # negative, which pushes post_a to 0 and yields NaN bounds.
        total_reward = 0
        df_steps.clear()
        df_probs.clear()
    else:
        r = observation.reward - total_reward
        total_reward = observation.reward
        # Update the Beta posterior of the bandit pulled last step.
        post_a[bandit] += r
        post_b[bandit] += 1 - r

    bound = post_a / (post_a + post_b) + beta.std(post_a, post_b) * c
    bandit = int(np.argmax(bound))

    # Like with ucb-decay, this method creates an interval and chooses the
    # bandit which could be the luckiest; the posterior standard deviation
    # decides the interval width. As before, build an ideal bandit (same
    # total count, but post_b == 1) and compare the upper bounds to get the
    # expected %. Only the chosen bandit's ideal bound is ever used, so it is
    # computed for that bandit alone.
    ideal_a = post_a[bandit] + (post_b[bandit] - 1)
    ideal_b = 1.0
    best_bound = ideal_a / (ideal_a + ideal_b) + beta.std(ideal_a, ideal_b) * c

    df_probs.append(100 * np.max(bound) / best_bound)
    df_steps.append(observation.step)

    if observation.step > 1995:
        df = pd.DataFrame(df_probs, index=df_steps, columns=['probs'])
        df.to_csv("bayesian_ucb.csv")
    return bandit

In [None]:
agent = "bayesian_ucb"

# 2x2 grid: the top row is the match against the random agent, the bottom
# row the match against the agent that always returns 0.
fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(16,12))

matchups = (
    ("random.py", "vs Random agent: Chances of getting a candy", ax1, ax2),
    ("return0.py", "vs Return0 agent: Chances of getting a candy", ax3, ax4),
)
for opponent, title, ax_actual, ax_expected in matchups:
    # The agent under test is passed second, so it is player index 1.
    mygame = env.run([opponent, f"{agent}.py"])
    red_steps, blue_steps, green_steps = divide_steps(mygame, 1)

    # Left panel: true threshold of the bandit picked at each step, coloured
    # by the bucket divide_steps put that step in.
    for steps, colour, label in (
        (green_steps, 'green', 'Chosen bandit ranks 1-30th thresold-wise'),
        (blue_steps, 'blue', 'Chosen bandit ranks 31-60th thresold-wise'),
        (red_steps, 'red', 'Chosen bandit ranks 61-100th threshold-wise'),
    ):
        picked = [mygame[i][0].observation.thresholds[mygame[i][1].action] for i in steps]
        ax_actual.scatter(steps, picked, color=colour, label=label)
    best_possible = [max(mygame[i][0].observation.thresholds) for i in range(2000)]
    ax_actual.plot(list(range(2000)), best_possible, 'C1', label='Maximum chances of getting a candy at that step')
    ax_actual.set_title(title)
    ax_actual.legend(loc='upper right')

    # Right panel: the value the agent logged to its CSV for the same steps.
    # Steps that are missing from the CSV index are dropped first.
    df = pd.read_csv(f"{agent}.csv", index_col=[0])
    red_steps = [i for i in red_steps if i in df.index]
    blue_steps = [i for i in blue_steps if i in df.index]
    green_steps = [i for i in green_steps if i in df.index]
    for steps, colour, label in (
        (green_steps, 'green', 'Actual threshold ranks 1-30th'),
        (blue_steps, 'blue', 'Actual threshold ranks 31-60th'),
        (red_steps, 'red', 'Actual threshold ranks 61-100th'),
    ):
        ax_expected.scatter(steps, df.loc[steps, 'probs'], color=colour, label=label)
    ax_expected.set_title("Thresholds as expected by the agent")
    ax_expected.legend(loc='upper right')


The agent seems to heavily overestimate the thresholds, and I'm very much doubting my math and logic on this. But at the same time, there seem to be similarities in the graphs, and the percentages aren't really breaking any rules, so I'm confused. Feedback would really be appreciated. As of yet, this is the best agent in terms of keeping its decisions near and above the 50% threshold mark. It fills exactly the gap where ucb_decay was lacking, so maybe a hybrid agent would do the job? :P

<h2 style='background:#FBE338; border:0; color:black'><center>Simple Multi-armed Bandit</center></h2>
    
Copied from [here](https://www.kaggle.com/ilialar/simple-multi-armed-bandit)

In [None]:
%%writefile multi_armed_bandit.py

import json
import numpy as np
import pandas as pd

# Module-level state shared between calls of the agent.
bandit_state = None  # per bandit: [win weight, loss weight], used as Beta parameters
total_reward = 0     # cumulative reward seen in the previous observation
last_step = None     # bandit returned by the previous call
df_steps = []        # step numbers logged for the analysis CSV
df_probs = []        # matching sampled probabilities, in %


def multi_armed_bandit_agent(observation, configuration):
    """Beta-sampling agent with decaying win counts.

    Every bandit keeps a ``[wins, losses]`` pair starting at ``[1, 1]``. On
    each call the result of the previous pull is booked, the win counts of
    both bandits pulled last round are decayed towards 1, and one value is
    drawn from ``Beta(wins, losses)`` per bandit; the bandit with the
    largest draw is returned.

    The winning draw, times 100, is appended to ``df_probs``. For every step
    above 1995 the log is written to ``multi_armed_bandit.csv``.

    Args:
        observation: Supports ``observation["reward"]`` as well as
            ``observation.step`` and ``observation.lastActions``.
        configuration: ``configuration["banditCount"]`` is the number of
            bandits.

    Returns:
        Index of the bandit to pull (``None`` if there are no bandits).
    """
    global bandit_state, total_reward, last_step
    step = 1.0         # you can regulate the exploration / exploitation balance using this param
    decay_rate = 0.97  # how much do we decay the win count after each call

    if observation.step == 0:
        # Initial bandit state.
        bandit_state = [[1, 1] for _ in range(configuration["banditCount"])]
        # Reset the remaining per-episode state as well. With a stale
        # total_reward the first win of a new episode would be booked as a
        # loss, and the log would keep growing across episodes.
        total_reward = 0
        df_steps.clear()
        df_probs.clear()
    else:
        # Update bandit_state using the result of the previous step.
        last_reward = observation["reward"] - total_reward
        total_reward = observation["reward"]

        # Work out which player we are: index 1 if our last pull matches
        # lastActions[1], else index 0.
        player = int(last_step == observation.lastActions[1])
        my_bandit = observation.lastActions[player]

        if last_reward > 0:
            bandit_state[my_bandit][0] += last_reward * step
        else:
            bandit_state[my_bandit][1] += step

        # Decay the win count of both pulled bandits. If both players pulled
        # the same bandit, it is decayed twice.
        for pulled in (observation.lastActions[0], observation.lastActions[1]):
            bandit_state[pulled][0] = (bandit_state[pulled][0] - 1) * decay_rate + 1

    # Draw one random number from the Beta distribution of each bandit and
    # select the luckiest one. The first bandit wins ties.
    best_proba = -1
    best_agent = None
    for k, (wins, losses) in enumerate(bandit_state):
        proba = np.random.beta(wins, losses)
        if proba > best_proba:
            best_proba = proba
            best_agent = k

    df_steps.append(observation.step)
    df_probs.append(best_proba * 100)
    if observation.step > 1995:
        df = pd.DataFrame(df_probs, index=df_steps, columns=['probs'])
        df.to_csv("multi_armed_bandit.csv")
    last_step = best_agent
    return best_agent

In [None]:
import numpy as np
agent = "multi_armed_bandit"

# 2x2 grid: the top row is the match against the random agent, the bottom
# row the match against the agent that always returns 0.
fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(16,12))

matchups = (
    ("random.py", "vs Random agent: Chances of getting a candy", ax1, ax2),
    ("return0.py", "vs Return0 agent: Chances of getting a candy", ax3, ax4),
)
for opponent, title, ax_actual, ax_expected in matchups:
    # The agent under test is passed second, so it is player index 1.
    mygame = env.run([opponent, f"{agent}.py"])
    red_steps, blue_steps, green_steps = divide_steps(mygame, 1)

    # Left panel: true threshold of the bandit picked at each step, coloured
    # by the bucket divide_steps put that step in.
    for steps, colour, label in (
        (green_steps, 'green', 'Chosen bandit ranks 1-30th thresold-wise'),
        (blue_steps, 'blue', 'Chosen bandit ranks 31-60th thresold-wise'),
        (red_steps, 'red', 'Chosen bandit ranks 61-100th threshold-wise'),
    ):
        picked = [mygame[i][0].observation.thresholds[mygame[i][1].action] for i in steps]
        ax_actual.scatter(steps, picked, color=colour, label=label)
    best_possible = [max(mygame[i][0].observation.thresholds) for i in range(2000)]
    ax_actual.plot(list(range(2000)), best_possible, 'C1', label='Maximum chances of getting a candy at that step')
    ax_actual.set_title(title)
    ax_actual.legend(loc='upper right')

    # Right panel: the value the agent logged to its CSV for the same steps.
    # Steps that are missing from the CSV index are dropped first.
    df = pd.read_csv(f"{agent}.csv", index_col=[0])
    red_steps = [i for i in red_steps if i in df.index]
    blue_steps = [i for i in blue_steps if i in df.index]
    green_steps = [i for i in green_steps if i in df.index]
    for steps, colour, label in (
        (green_steps, 'green', 'Actual threshold ranks 1-30th'),
        (blue_steps, 'blue', 'Actual threshold ranks 31-60th'),
        (red_steps, 'red', 'Actual threshold ranks 61-100th'),
    ):
        ax_expected.scatter(steps, df.loc[steps, 'probs'], color=colour, label=label)
    ax_expected.set_title("Thresholds as expected by the agent")
    ax_expected.legend(loc='upper right')


I feel bad for saying the previous agents were overestimating the probabilities; it looks like they all do. It's understandable, since there are only a few (~20) steps per bandit, but considering these agents target only the best bandits, I was expecting them to at least come close in the last few hundred steps, and that doesn't seem to be the case.

<h2 style='background:#FBE338; border:0; color:black'><center>Thompson Sampling</center></h2>
    
Copied from [here](https://www.kaggle.com/ilialar/simple-multi-armed-bandit)

In [None]:
%%writefile thompson.py

import numpy as np
import pandas as pd

# Module-level state shared between calls of agent().
post_a = None     # per-bandit Beta "a" parameter (1 + credited rewards)
post_b = None     # per-bandit Beta "b" parameter (1 + credited misses)
bandit = None     # bandit pulled on the previous call
total_reward = 0  # cumulative reward seen in the previous observation
c = 3             # not read by agent(); kept for anything else that may reference it
df_steps = []     # step numbers logged for the analysis CSV
df_probs = []     # matching sampled probabilities, in %


def agent(observation, configuration):
    """Thompson-sampling agent that also logs the sample behind each pick.

    Each bandit has a Beta(``post_a``, ``post_b``) posterior. On every call
    the reward earned since the last call updates the posterior of the
    bandit pulled last, one value is sampled from every posterior, and the
    bandit with the largest sample is returned.

    The winning sample, times 100, is appended to ``df_probs``. For every
    step above 1995 the log is written to ``thompson.csv``.

    Args:
        observation: Provides ``step`` and the cumulative ``reward``.
        configuration: Provides ``banditCount``.

    Returns:
        Index of the bandit to pull, as a Python ``int``.
    """
    global total_reward, bandit, post_a, post_b

    n_bandits = configuration.banditCount

    if observation.step == 0:
        post_a = np.ones(n_bandits)
        post_b = np.ones(n_bandits)
        # Reset the remaining per-episode state as well. With a stale
        # total_reward the first reward delta of a new episode is computed
        # against the previous episode's total, and the log would keep
        # growing across episodes.
        total_reward = 0
        df_steps.clear()
        df_probs.clear()
    else:
        r = observation.reward - total_reward
        total_reward = observation.reward

        # Update the Beta posterior of the bandit pulled last step.
        post_a[bandit] += r
        post_b[bandit] += (1 - r)

    samples = np.random.beta(post_a, post_b)
    bandit = int(np.argmax(samples))

    df_steps.append(observation.step)
    df_probs.append(samples[bandit] * 100)
    if observation.step > 1995:
        df = pd.DataFrame(df_probs, index=df_steps, columns=['probs'])
        df.to_csv("thompson.csv")

    return bandit


In [None]:
import numpy as np
agent = "thompson"

# 2x2 grid: the top row is the match against the random agent, the bottom
# row the match against the agent that always returns 0.
fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(16,12))

matchups = (
    ("random.py", "vs Random agent: Chances of getting a candy", ax1, ax2),
    ("return0.py", "vs Return0 agent: Chances of getting a candy", ax3, ax4),
)
for opponent, title, ax_actual, ax_expected in matchups:
    # The agent under test is passed second, so it is player index 1.
    mygame = env.run([opponent, f"{agent}.py"])
    red_steps, blue_steps, green_steps = divide_steps(mygame, 1)

    # Left panel: true threshold of the bandit picked at each step, coloured
    # by the bucket divide_steps put that step in.
    for steps, colour, label in (
        (green_steps, 'green', 'Chosen bandit ranks 1-30th thresold-wise'),
        (blue_steps, 'blue', 'Chosen bandit ranks 31-60th thresold-wise'),
        (red_steps, 'red', 'Chosen bandit ranks 61-100th threshold-wise'),
    ):
        picked = [mygame[i][0].observation.thresholds[mygame[i][1].action] for i in steps]
        ax_actual.scatter(steps, picked, color=colour, label=label)
    best_possible = [max(mygame[i][0].observation.thresholds) for i in range(2000)]
    ax_actual.plot(list(range(2000)), best_possible, 'C1', label='Maximum chances of getting a candy at that step')
    ax_actual.set_title(title)
    ax_actual.legend(loc='upper right')

    # Right panel: the value the agent logged to its CSV for the same steps.
    # Steps that are missing from the CSV index are dropped first.
    df = pd.read_csv(f"{agent}.csv", index_col=[0])
    red_steps = [i for i in red_steps if i in df.index]
    blue_steps = [i for i in blue_steps if i in df.index]
    green_steps = [i for i in green_steps if i in df.index]
    for steps, colour, label in (
        (green_steps, 'green', 'Actual threshold ranks 1-30th'),
        (blue_steps, 'blue', 'Actual threshold ranks 31-60th'),
        (red_steps, 'red', 'Actual threshold ranks 61-100th'),
    ):
        ax_expected.scatter(steps, df.loc[steps, 'probs'], color=colour, label=label)
    ax_expected.set_title("Thresholds as expected by the agent")
    ax_expected.legend(loc='upper right')


The results look much like the multi-armed bandit's, which is understandable because their math is the same; the only difference is that multi_armed_bandit decays its rewards. More agents to be added in the next version!