# Keep pulling same bandit as long as reward keeps coming!

Cleaned up a bit and added comments / explanations for the functions.

Full credit to Lindada's notebook.
Notebook: https://www.kaggle.com/a763337092/pull-vegas-slot-machines-add-weaken-rate-continue5
Kaggler: https://www.kaggle.com/a763337092

In [None]:
!pip install kaggle-environments --upgrade

In [None]:
%%writefile submission.py


import numpy as np
import pandas as pd
import random, os, datetime, math
from collections import defaultdict

# Cumulative reward reported by the environment at the previous step; the
# agent subtracts it from the new cumulative reward to get the last pull's payoff.
total_reward = 0
# Per-bandit statistics keyed by bandit index; rebuilt by the agent at step 0.
bandit_dict = {}


def get_next_bandit():
    """Get Next Bandit

    Return the key of the bandit in the module-level ``bandit_dict`` that
    has the highest heuristic score.

    For every bandit the score is::

        (win - loss + opp - 1.5 * (opp > 0) + op_continue)
            / (win + loss + opp)
            * 0.97 ** (win + loss + opp)

    i.e. it grows with our wins, with the opponent's pulls and with the
    opponent's repeated pulls, shrinks with our losses, carries a flat 1.5
    penalty once the opponent has pulled the bandit at least once, and is
    multiplied by 0.97 for every recorded pull.

    Returns:
        The key of the highest-scoring bandit (the first one encountered
        on ties), or ``0`` when ``bandit_dict`` is empty.

    Raises:
        ZeroDivisionError: if a bandit has ``win + loss + opp == 0``.
            The step-0 initialiser in this file seeds ``win`` with 1, so
            this is not expected in normal play.
    """
    best_bandit = 0
    # Start from "no candidate" instead of a score of 0: otherwise, once
    # every score is zero or negative, bandit 0 would be returned no matter
    # how it compares with the others.
    best_bandit_expected = None

    for bnd, stats in bandit_dict.items():
        num_wins = stats['win']
        num_losses = stats['loss']
        num_opt_choices = stats['opp']
        num_opt_redraws = stats['op_continue']

        total_plays = num_wins + num_losses + num_opt_choices

        raw_score = (
            num_wins - num_losses             # our net result on this bandit
            + num_opt_choices                 # opponent pulls count in its favour
            - (num_opt_choices > 0) * 1.5     # flat penalty once the opponent has pulled it
            + num_opt_redraws                 # opponent pulling it repeatedly counts in its favour
        )
        # Normalise by the number of recorded pulls, then apply the decay.
        expect = raw_score / total_plays * math.pow(0.97, total_plays)

        if best_bandit_expected is None or expect > best_bandit_expected:
            best_bandit_expected = expect
            best_bandit = bnd

    return best_bandit

# Histories of this agent's and the opponent's actions; the agent appends
# one entry to each on every step after the first.
my_action_list = []
op_action_list = []

# NOTE(review): not referenced anywhere else in the visible code — possibly a
# leftover; confirm before removing.
op_continue_cnt_dict = defaultdict(int)

def multi_armed_probabilities(observation, configuration):
    """Multi Armed Probabilities

    Agent entry point: record the outcome of the previous step and choose
    the bandit to pull next.

    Logic:
     1. If the last pull paid out, pull the same bandit again (over-exploit).
     2. Otherwise, if the last three pulls were the same bandit, repeat it
        50% of the time.
     3. Otherwise pick the bandit returned by ``get_next_bandit()``.

    Args:
        observation: mapping read for ``'step'``, ``'reward'`` (cumulative),
            ``'agentIndex'`` and ``'lastActions'``.
        configuration: mapping read for ``'banditCount'``.

    Returns:
        The index of the bandit to pull. At step 0 this is a uniformly
        random index.

    Side effects:
        Updates the module-level ``total_reward``, ``bandit_dict``,
        ``my_action_list`` and ``op_action_list``.
    """
    global total_reward, bandit_dict

    # Drawn on every call, even when overwritten below; only step 0 actually
    # returns this value.
    my_pull = random.randrange(configuration['banditCount'])

    if 0 == observation['step']:
        # New episode: reset every piece of module-level state.
        total_reward = 0
        bandit_dict = {}
        for i in range(configuration['banditCount']):
            # 'win' starts at 1 so the score in get_next_bandit() never
            # divides by zero.
            bandit_dict[i] = {'win': 1, 'loss': 0, 'opp': 0, 'my_continue': 0, 'op_continue': 0}
        # The action histories must be reset too; otherwise a second episode
        # in the same process would compare new actions against stale ones.
        my_action_list.clear()
        op_action_list.clear()
        return my_pull

    # The environment reports the cumulative reward; the difference from the
    # previous step is the payoff of the last pull.
    last_reward = observation['reward'] - total_reward
    total_reward = observation['reward']

    my_idx = observation['agentIndex']
    my_last_action = observation['lastActions'][my_idx]
    # Two-player game: the opponent occupies the other index.
    op_last_action = observation['lastActions'][1 - my_idx]

    my_action_list.append(my_last_action)
    op_action_list.append(op_last_action)

    if 0 < last_reward:
        bandit_dict[my_last_action]['win'] += 1
    else:
        bandit_dict[my_last_action]['loss'] += 1
    bandit_dict[op_last_action]['opp'] += 1

    # Track consecutive repeat pulls per bandit: increment on a repeat,
    # reset the newly chosen bandit's counter otherwise.
    if observation['step'] >= 3:
        if my_action_list[-1] == my_action_list[-2]:
            bandit_dict[my_last_action]['my_continue'] += 1
        else:
            bandit_dict[my_last_action]['my_continue'] = 0
        if op_action_list[-1] == op_action_list[-2]:
            bandit_dict[op_last_action]['op_continue'] += 1
        else:
            bandit_dict[op_last_action]['op_continue'] = 0

    # If the last pull was successful, do it again.
    # This probably breaks various follow agents since it's not 'bayesian'.
    if last_reward > 0:
        return my_last_action

    # If I've made three pulls in a row on the same bandit, do it again 50%
    # of the time. The random draw is last in the chain so it is only
    # consumed when the streak condition holds.
    if (
        observation['step'] >= 4
        and my_action_list[-1] == my_action_list[-2]
        and my_action_list[-1] == my_action_list[-3]
        and random.random() < 0.5
    ):
        return my_action_list[-1]

    # Otherwise fall back to the scoring heuristic.
    return get_next_bandit()

In [None]:
%%writefile opponent_agent.py

import numpy as np
import pandas as pd
import random, os, datetime, math

# Cumulative reward reported at the previous step; used to derive the payoff
# of the most recent pull.
total_reward = 0
# Per-bandit statistics keyed by bandit index; rebuilt by the agent at step 0.
bandit_dict = {}

def set_seed(my_seed=42):
    """Seed the global random number generators with ``my_seed``.

    Sets the ``PYTHONHASHSEED`` environment variable to the string form of
    the seed, then seeds both the standard-library ``random`` module and
    NumPy's global generator.

    Args:
        my_seed: seed value handed to both generators (default 42).
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so this assignment presumably only matters for child processes — confirm.
    os.environ['PYTHONHASHSEED'] = str(my_seed)
    for reseed in (random.seed, np.random.seed):
        reseed(my_seed)

def get_next_bandit():
    """Return the key of the highest-scoring bandit in ``bandit_dict``.

    For every bandit the score is::

        (win - loss + opp - 1.5 * (opp > 0))
            / (win + loss + opp)
            * 0.97 ** (win + loss + opp)

    Returns:
        The key of the highest-scoring bandit (the first one encountered
        on ties), or ``0`` when ``bandit_dict`` is empty.

    Raises:
        ZeroDivisionError: if a bandit has ``win + loss + opp == 0``.
            The step-0 initialiser in this file seeds ``win`` with 1, so
            this is not expected in normal play.
    """
    best_bandit = 0
    # Start from "no candidate" instead of a score of 0: otherwise, once
    # every score is zero or negative, bandit 0 would be returned no matter
    # how it compares with the others.
    best_bandit_expected = None
    for bnd, stats in bandit_dict.items():
        total_plays = stats['win'] + stats['loss'] + stats['opp']
        # Net result plus opponent interest, with a flat 1.5 penalty once the
        # opponent has pulled this bandit at least once.
        raw_score = stats['win'] - stats['loss'] + stats['opp'] - (stats['opp'] > 0) * 1.5
        # Normalise by the number of recorded pulls, then apply the decay.
        expect = raw_score / total_plays * math.pow(0.97, total_plays)
        if best_bandit_expected is None or expect > best_bandit_expected:
            best_bandit_expected = expect
            best_bandit = bnd
    return best_bandit

# Histories of this agent's and the opponent's actions; the agent appends
# one entry to each on every step after the first.
my_action_list = []
op_action_list = []



def multi_armed_probabilities(observation, configuration):
    """Baseline agent: record the last step's outcome and choose a bandit.

    Logic:
     1. If the last pull paid out, pull the same bandit again.
     2. Otherwise, if the last three pulls were the same bandit, repeat it
        50% of the time.
     3. Otherwise pick the bandit returned by ``get_next_bandit()``.

    Args:
        observation: mapping read for ``'step'``, ``'reward'`` (cumulative),
            ``'agentIndex'`` and ``'lastActions'``.
        configuration: mapping read for ``'banditCount'``.

    Returns:
        The index of the bandit to pull. At step 0 this is a uniformly
        random index drawn before the generators are re-seeded.

    Side effects:
        Updates the module-level ``total_reward``, ``bandit_dict``,
        ``my_action_list`` and ``op_action_list``; re-seeds the global RNGs
        at step 0 via ``set_seed()``.
    """
    global total_reward, bandit_dict

    # Drawn on every call, even when overwritten below; only step 0 actually
    # returns this value.
    my_pull = random.randrange(configuration['banditCount'])

    if 0 == observation['step']:
        # New episode: re-seed and reset every piece of module-level state.
        set_seed()
        total_reward = 0
        # 'win' starts at 1 so the score in get_next_bandit() never divides
        # by zero.
        bandit_dict = {
            i: {'win': 1, 'loss': 0, 'opp': 0}
            for i in range(configuration['banditCount'])
        }
        # The action histories must be reset too; otherwise a second episode
        # in the same process would compare new actions against stale ones.
        my_action_list.clear()
        op_action_list.clear()
        return my_pull

    # The environment reports the cumulative reward; the difference from the
    # previous step is the payoff of the last pull.
    last_reward = observation['reward'] - total_reward
    total_reward = observation['reward']

    my_idx = observation['agentIndex']
    my_last_action = observation['lastActions'][my_idx]
    # Two-player game: the opponent occupies the other index.
    op_last_action = observation['lastActions'][1 - my_idx]

    my_action_list.append(my_last_action)
    op_action_list.append(op_last_action)

    if 0 < last_reward:
        bandit_dict[my_last_action]['win'] += 1
    else:
        bandit_dict[my_last_action]['loss'] += 1
    bandit_dict[op_last_action]['opp'] += 1

    # A successful pull is simply repeated.
    if last_reward > 0:
        return my_last_action

    # After three identical pulls in a row, repeat 50% of the time. The
    # random draw is last in the chain so it is only consumed when the
    # streak condition holds.
    if (
        observation['step'] >= 4
        and my_action_list[-1] == my_action_list[-2]
        and my_action_list[-1] == my_action_list[-3]
        and random.random() < 0.5
    ):
        return my_action_list[-1]

    # Otherwise fall back to the scoring heuristic.
    return get_next_bandit()

In [None]:
from kaggle_environments import make
# Create the "mab" environment that the match cells below reset and run;
# debug=True is forwarded to make().
env = make("mab", debug=True)

In [None]:
import datetime

# Match 1: the baseline agent (opponent_agent.py) against the new agent
# (submission.py), with the wall-clock duration printed afterwards.
env.reset()
start_time = datetime.datetime.now()
env.run(["opponent_agent.py", "submission.py"])
stop_time = datetime.datetime.now()
print('Completed agent vs new model:', stop_time-start_time)
# Render the finished match inline in the notebook.
env.render(mode="ipython", width=800, height=400)

In [None]:
import datetime

# Match 2: the new agent (submission.py) against itself, with the wall-clock
# duration printed afterwards.
env.reset()
start_time = datetime.datetime.now()
env.run(["submission.py", "submission.py"])
stop_time = datetime.datetime.now()
print('Completed sub vs sub:', stop_time-start_time)
# Render the finished match inline in the notebook.
env.render(mode="ipython", width=800, height=400)