# My Thompson Sampling Agent

In [None]:
%%writefile submission.py

# Thompson Sampling (beta distribution) with persistence (choose the same bandit again if the prior step won)
import numpy as np
from random import randrange
from scipy.stats import beta

# Module-level state shared between successive calls of agent().
# All of it is (re)initialised on step 0 of every episode.
post_a = None      # per-bandit Beta "alpha" pseudo-counts
post_b = None      # per-bandit Beta "beta" pseudo-counts
bandit = None      # index of the bandit this agent pulled on its previous step
decay = None       # per-bandit weight, shrunk every time anyone pulls that bandit
total_reward = 0   # cumulative reward seen so far, used to recover the last step's reward

DECAY_RATE = 0.95  # factor applied to a bandit's weight for every recorded pull


def agent(observation, configuration):
    """Pick a bandit using a Beta-posterior upper bound with win persistence.

    Although the file header calls this Thompson Sampling, no sampling takes
    place: on a loss the agent picks the bandit maximising
    ``mean + 3 * std`` of ``Beta(post_a, post_b)``; on a win it pulls the same
    bandit again.

    Args:
        observation: object exposing ``step``, ``reward`` (treated as the
            cumulative reward so far) and ``lastActions`` (bandit indices
            pulled on the previous step, one per player).
        configuration: object exposing ``banditCount``.

    Returns:
        int: index of the bandit to pull.
    """
    global total_reward, bandit, post_a, post_b, decay

    n_bandits = configuration.banditCount

    if observation.step == 0:
        # Fresh episode: reset every piece of state, including the reward
        # counter, so a previous episode in the same process cannot leak in.
        post_a = np.ones(n_bandits)
        post_b = np.ones(n_bandits)
        decay = np.ones(n_bandits)
        total_reward = 0
        # Return right away: falling through to the "lost" branch would
        # replace the random pick with argmax over identical bounds (always 0).
        bandit = randrange(n_bandits)
        return bandit

    # Reward earned by the previous pull = growth of the cumulative reward.
    r = observation.reward - total_reward
    total_reward = observation.reward

    # Every recorded pull (by any player) shrinks that bandit's weight.
    for action in observation.lastActions:
        decay[action] *= DECAY_RATE

    post_a[bandit] += r + decay[bandit]
    post_b[bandit] += 1 - r

    if r == 0:
        # Previous pull lost: re-select by upper bound. After a win the
        # previous bandit is kept unchanged.
        bound = beta.mean(post_a, post_b) + beta.std(post_a, post_b) * 3
        bandit = int(np.argmax(bound))

    return bandit

In [None]:
%%writefile bay_sub_decay.py
# My first submission

import numpy as np
from scipy.stats import beta

# Module-level state shared between successive calls of agent().
# All of it is (re)initialised on step 0 of every episode.
post_a = None      # per-bandit Beta "alpha" pseudo-counts
post_b = None      # per-bandit Beta "beta" pseudo-counts
bandit = None      # index of the bandit this agent pulled on its previous step
decay = None       # per-bandit weight, shrunk every time anyone pulls that bandit
total_reward = 0   # cumulative reward seen so far, used to recover the last step's reward

DECAY_RATE = 0.95  # factor applied to a bandit's weight for every recorded pull


def agent(observation, configuration):
    """Pick the bandit with the highest Beta-posterior upper bound.

    Every step the agent selects ``argmax(mean + 3 * std)`` of
    ``Beta(post_a, post_b)``. After each pull the chosen bandit's ``post_a``
    grows by the reward plus that bandit's current decay weight, and
    ``post_b`` grows by ``1 - reward``.

    Args:
        observation: object exposing ``step``, ``reward`` (treated as the
            cumulative reward so far) and ``lastActions`` (bandit indices
            pulled on the previous step, one per player).
        configuration: object exposing ``banditCount``.

    Returns:
        int: index of the bandit to pull.
    """
    global total_reward, bandit, post_a, post_b, decay

    n_bandits = configuration.banditCount

    if observation.step == 0:
        # Fresh episode: reset every piece of state, including the reward
        # counter, so a previous episode in the same process cannot leak in.
        post_a = np.ones(n_bandits)
        post_b = np.ones(n_bandits)
        decay = np.ones(n_bandits)
        total_reward = 0
    else:
        # Reward earned by the previous pull = growth of the cumulative reward.
        r = observation.reward - total_reward
        total_reward = observation.reward

        # Every recorded pull (by any player) shrinks that bandit's weight.
        for action in observation.lastActions:
            decay[action] *= DECAY_RATE

        post_a[bandit] += r + decay[bandit]
        post_b[bandit] += 1 - r

    bound = beta.mean(post_a, post_b) + beta.std(post_a, post_b) * 3
    bandit = int(np.argmax(bound))

    return bandit

In [None]:
%%writefile bay_sub_decay2.py
# My second submission

# Thompson Sampling (beta distribution) with persistence (choose the same bandit again if the prior step won)
import numpy as np
from random import randrange
from scipy.stats import beta

# Module-level state shared between successive calls of agent().
# All of it is (re)initialised on step 0 of every episode.
post_a = None      # per-bandit Beta "alpha" pseudo-counts
post_b = None      # per-bandit Beta "beta" pseudo-counts
bandit = None      # index of the bandit this agent pulled on its previous step
decay = None       # per-bandit weight, shrunk every time anyone pulls that bandit
total_reward = 0   # cumulative reward seen so far, used to recover the last step's reward

DECAY_RATE = 0.95  # factor applied to a bandit's weight for every recorded pull


def agent(observation, configuration):
    """Pick a bandit using a Beta-posterior upper bound with win persistence.

    Although the file header calls this Thompson Sampling, no sampling takes
    place: on a loss the agent picks the bandit maximising
    ``mean + 3 * std`` of ``Beta(post_a, post_b)``; on a win it pulls the same
    bandit again.

    Args:
        observation: object exposing ``step``, ``reward`` (treated as the
            cumulative reward so far) and ``lastActions`` (bandit indices
            pulled on the previous step, one per player).
        configuration: object exposing ``banditCount``.

    Returns:
        int: index of the bandit to pull.
    """
    global total_reward, bandit, post_a, post_b, decay

    n_bandits = configuration.banditCount

    if observation.step == 0:
        # Fresh episode: reset every piece of state, including the reward
        # counter, so a previous episode in the same process cannot leak in.
        post_a = np.ones(n_bandits)
        post_b = np.ones(n_bandits)
        decay = np.ones(n_bandits)
        total_reward = 0
        # Return right away: falling through to the "lost" branch would
        # replace the random pick with argmax over identical bounds (always 0).
        bandit = randrange(n_bandits)
        return bandit

    # Reward earned by the previous pull = growth of the cumulative reward.
    r = observation.reward - total_reward
    total_reward = observation.reward

    # Every recorded pull (by any player) shrinks that bandit's weight.
    for action in observation.lastActions:
        decay[action] *= DECAY_RATE

    post_a[bandit] += r + decay[bandit]
    post_b[bandit] += 1 - r

    if r == 0:
        # Previous pull lost: re-select by upper bound. After a win the
        # previous bandit is kept unchanged.
        bound = beta.mean(post_a, post_b) + beta.std(post_a, post_b) * 3
        bandit = int(np.argmax(bound))

    return bandit

In [None]:
%%writefile thompson_decay1.py

## Thompson Sampling Algorithm with decay

from scipy.stats import beta

# Module-level state shared between successive calls of the agent.
last_bandit = -1    # index of the bandit this agent pulled on its previous step
total_reward = 0    # cumulative reward seen so far
DECAY_RATE = 0.99   # factor applied to a bandit's weight for every recorded pull

numbers_of_rewards_0 = None  # per-bandit count of losing pulls by this agent
numbers_of_rewards_1 = None  # per-bandit count of winning pulls by this agent
decay = None                 # per-bandit weight applied to the win count
chosen_bandits = None        # not used by the agent; kept so the module's names are unchanged


def thompson_agent_decay1(observation, configuration):
    """Pick the bandit with the highest decayed Beta posterior mean.

    Despite the name, nothing is sampled: the agent is greedy on the mean of
    ``Beta(1 + wins * decay, 1 + losses)``, with ties going to the lowest index.

    Args:
        observation: object exposing ``step``, ``reward`` (treated as the
            cumulative reward so far) and ``lastActions`` (bandit indices
            pulled on the previous step, one per player).
        configuration: object exposing ``banditCount``.

    Returns:
        int: index of the bandit to pull.
    """
    global numbers_of_rewards_1, numbers_of_rewards_0, last_bandit, total_reward, decay

    d = configuration.banditCount

    if observation.step == 0:
        # Fresh episode: reset all state, including the reward counter, so a
        # previous episode in the same process cannot leak in.
        numbers_of_rewards_1 = [0] * d
        numbers_of_rewards_0 = [0] * d
        decay = [1] * d
        total_reward = 0
    else:
        # Every recorded pull (by any player) shrinks that bandit's weight.
        for action in observation.lastActions:
            decay[action] *= DECAY_RATE

        # Reward earned by the previous pull = growth of the cumulative reward.
        reward = observation.reward - total_reward
        total_reward = observation.reward

        numbers_of_rewards_1[last_bandit] += reward
        numbers_of_rewards_0[last_bandit] += 1 - reward

    bandit = 0
    max_beta = 0
    for i in range(d):
        a = 1 + numbers_of_rewards_1[i] * decay[i]
        b = numbers_of_rewards_0[i] + 1
        # Mean of Beta(a, b) in closed form; avoids constructing a frozen
        # scipy distribution for every bandit on every step.
        beta_val = a / (a + b)
        if beta_val > max_beta:
            max_beta = beta_val
            bandit = i

    # Remember the final choice once, after the scan is complete.
    last_bandit = bandit

    return bandit

In [None]:
%%writefile thompson_decay2.py

from scipy.stats import beta

# Module-level state shared between successive calls of the agent.
last_bandit = -1    # index of the bandit this agent pulled on its previous step
total_reward = 0    # cumulative reward seen so far
DECAY_RATE = 0.98   # factor applied to a bandit's weight for every recorded pull

numbers_of_rewards_0 = None  # per-bandit count of losing pulls by this agent
numbers_of_rewards_1 = None  # per-bandit count of winning pulls by this agent
decay = None                 # per-bandit weight used as the decaying prior on the win side
chosen_bandits = None        # not used by the agent; kept so the module's names are unchanged


def thompson_agent_decay2(observation, configuration):
    """Pick the bandit with the highest Beta posterior mean under a decaying prior.

    Despite the name, nothing is sampled: the agent is greedy on the mean of
    ``Beta(wins + decay, 1 + losses)``, with ties going to the lowest index.

    Args:
        observation: object exposing ``step``, ``reward`` (treated as the
            cumulative reward so far) and ``lastActions`` (bandit indices
            pulled on the previous step, one per player).
        configuration: object exposing ``banditCount``.

    Returns:
        int: index of the bandit to pull.
    """
    global numbers_of_rewards_1, numbers_of_rewards_0, last_bandit, total_reward, decay

    d = configuration.banditCount

    if observation.step == 0:
        # Fresh episode: reset all state, including the reward counter, so a
        # previous episode in the same process cannot leak in.
        numbers_of_rewards_1 = [0] * d
        numbers_of_rewards_0 = [0] * d
        decay = [1] * d
        total_reward = 0
    else:
        # Every recorded pull (by any player) shrinks that bandit's weight.
        for action in observation.lastActions:
            decay[action] *= DECAY_RATE

        # Reward earned by the previous pull = growth of the cumulative reward.
        reward = observation.reward - total_reward
        total_reward = observation.reward

        numbers_of_rewards_1[last_bandit] += reward
        numbers_of_rewards_0[last_bandit] += 1 - reward

    bandit = 0
    max_beta = 0
    for i in range(d):
        a = numbers_of_rewards_1[i] + decay[i]
        b = numbers_of_rewards_0[i] + 1
        # Mean of Beta(a, b) in closed form; avoids constructing a frozen
        # scipy distribution for every bandit on every step.
        beta_val = a / (a + b)
        if beta_val > max_beta:
            max_beta = beta_val
            bandit = i

    # Remember the final choice once, after the scan is complete.
    last_bandit = bandit

    return bandit

# Other agents for competition

In [None]:
%%writefile random_agent.py

import random

def random_agent(observation, configuration):
    """Baseline agent: return a random bandit index in ``range(banditCount)``.

    ``observation`` is ignored; only ``configuration.banditCount`` is read.
    """
    bandit_count = configuration.banditCount
    choice = random.randrange(bandit_count)
    return choice

In [None]:
%%writefile always_first_agent.py

def always_first(observation, configuration):
    """Baseline agent: ignore both arguments and always return bandit index 0."""
    first_bandit = 0
    return first_bandit

In [None]:
%%writefile ucb_agent.py
## Upper Confidence Bound

import math

# Module-level state shared between successive calls of ucb_agent().
last_bandit = -1   # index of the bandit this agent pulled on its previous step (-1: none yet)
total_reward = 0   # cumulative reward seen so far

sums_of_reward = None         # per-bandit sum of rewards earned by this agent
numbers_of_selections = None  # per-bandit count of pulls made by this agent


def ucb_agent(observation, configuration):
    """Pick a bandit with the Upper Confidence Bound rule.

    Each pulled bandit scores ``average_reward + sqrt(2 * ln(step + 1) / pulls)``;
    a bandit that has never been pulled scores infinity, so every bandit is
    tried once before any is repeated. Ties go to the lowest index.

    Args:
        observation: object exposing ``step`` and ``reward`` (treated as the
            cumulative reward so far).
        configuration: object exposing ``banditCount``.

    Returns:
        int: index of the bandit to pull.
    """
    global sums_of_reward, numbers_of_selections, last_bandit, total_reward

    n_bandits = configuration.banditCount

    if observation.step == 0:
        # Fresh episode: reset everything. Without resetting last_bandit and
        # total_reward, a second episode in the same process would book a
        # negative "reward" against the previous episode's last bandit.
        numbers_of_selections = [0] * n_bandits
        sums_of_reward = [0] * n_bandits
        last_bandit = -1
        total_reward = 0
    elif last_bandit > -1:
        # Reward earned by the previous pull = growth of the cumulative reward.
        reward = observation.reward - total_reward
        sums_of_reward[last_bandit] += reward
        total_reward = observation.reward

    # Loop-invariant part of the exploration bonus.
    log_term = 2 * math.log(observation.step + 1)

    bandit = 0
    max_upper_bound = 0
    for i in range(n_bandits):
        pulls = numbers_of_selections[i]
        if pulls > 0:
            average_reward = sums_of_reward[i] / pulls
            upper_bound = average_reward + math.sqrt(log_term / pulls)
        else:
            upper_bound = math.inf
        # The earlier version also required `last_bandit != i` while updating
        # last_bandit inside this loop; in effect that only ever excluded
        # bandit 0 right after it had been pulled. That accidental exclusion
        # is dropped so the selection is a plain argmax of the bounds.
        if upper_bound > max_upper_bound:
            max_upper_bound = upper_bound
            bandit = i

    last_bandit = bandit
    numbers_of_selections[bandit] += 1

    return bandit

In [None]:
%%writefile bay_sub.py

# Thompson sampling

import numpy as np
from scipy.stats import beta

# Module-level state shared between successive calls of agent().
# All of it is (re)initialised on step 0 of every episode.
post_a = None      # per-bandit Beta "alpha" pseudo-counts
post_b = None      # per-bandit Beta "beta" pseudo-counts
bandit = None      # index of the bandit this agent pulled on its previous step
total_reward = 0   # cumulative reward seen so far, used to recover the last step's reward


def agent(observation, configuration):
    """Pick the bandit with the highest Beta-posterior upper bound.

    Every step the agent selects ``argmax(mean + 3 * std)`` of
    ``Beta(post_a, post_b)``. After each pull the chosen bandit's ``post_a``
    grows by the reward plus a bonus that shrinks linearly with the step
    number, and ``post_b`` grows by ``1 - reward``.

    Args:
        observation: object exposing ``step`` and ``reward`` (treated as the
            cumulative reward so far).
        configuration: object exposing ``banditCount``.

    Returns:
        int: index of the bandit to pull.
    """
    global total_reward, bandit, post_a, post_b

    n_bandits = configuration.banditCount

    if observation.step == 0:
        # Fresh episode: reset every piece of state, including the reward
        # counter, so a previous episode in the same process cannot leak in.
        post_a = np.ones(n_bandits)
        post_b = np.ones(n_bandits)
        total_reward = 0
    else:
        # Reward earned by the previous pull = growth of the cumulative reward.
        r = observation.reward - total_reward
        total_reward = observation.reward

        # NOTE(review): 2000 is presumably the episode length, making the
        # bonus fade from 1 to 0 over the game -- confirm against the
        # environment configuration.
        post_a[bandit] += r + (1 - observation.step / 2000)
        post_b[bandit] += 1 - r

    # post_a and post_b are float arrays (np.ones), so the ratio needs no cast.
    posterior_mean = post_a / (post_a + post_b)
    bound = posterior_mean + beta.std(post_a, post_b) * 3
    bandit = int(np.argmax(bound))

    return bandit

## Run competitions

In [None]:
! pip install --upgrade pip
! pip install kaggle-environments --upgrade -q

In [None]:
from kaggle_environments import make

# Create the "mab" environment with debug output enabled.
env = make("mab", debug=True)

# NOTE(review): neither of these file names is written by a %%writefile cell
# visible in this notebook -- confirm both files exist before running.
agent_files = ["bay_sub1.py", "agent2.py"]
env.run(agent_files)

# Show the finished episode inline in the notebook.
env.render(mode="ipython", width=800, height=800)