In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build every file path under the Kaggle input directory, then print one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip install kaggle-environments --upgrade -q

In [None]:
%%writefile random_agent.py
import random

def random_agent(observation, configuration):
    """Baseline agent: return a uniformly random bandit index.

    Args:
        observation: Unused.
        configuration: Object exposing ``banditCount`` (number of bandits).

    Returns:
        An int in ``[0, configuration.banditCount)``.
    """
    n_bandits = configuration.banditCount
    return random.randrange(n_bandits)

In [None]:
%%writefile submission.py

import math

# Per-episode state shared across calls (the environment calls the agent once per step).
last_bandit = -1              # index pulled on the previous step, -1 before the first pull
total_reward = 0              # last seen value of observation.reward (treated as a running total)

sums_of_reward = None         # per-bandit sum of rewards credited so far
numbers_of_selections = None  # per-bandit pull counts

def ucb_agent(observation, configuration):
    """Choose a bandit with a UCB-style score, never pulling the same bandit twice in a row.

    The reward gained since the previous call (``observation.reward - total_reward``)
    is credited to the bandit pulled on the previous step. Every bandit except that
    one is then scored as ``mean reward + sqrt(2 * log10(step + 1) / pulls)``;
    bandits that were never pulled score infinity, so they are tried first.

    Args:
        observation: Object exposing ``step`` and ``reward``.
        configuration: Object exposing ``banditCount``.

    Returns:
        Index of the bandit to pull on this step.
    """
    global sums_of_reward, numbers_of_selections, last_bandit, total_reward

    bandit_count = configuration.banditCount

    if observation.step == 0:
        # Start of an episode: drop everything learned in a previous one.
        numbers_of_selections = [0] * bandit_count
        sums_of_reward = [0] * bandit_count
        last_bandit = -1
        total_reward = 0

    if last_bandit > -1:
        reward = observation.reward - total_reward
        sums_of_reward[last_bandit] += reward
        total_reward += reward

    # Freeze the previous pull before scanning. Updating last_bandit inside the
    # loop made the "not the previous bandit" test compare against the tentative
    # pick, which let the previous bandit be selected again.
    previous_bandit = last_bandit
    bandit = 0
    max_upper_bound = -math.inf
    # NOTE(review): textbook UCB1 uses the natural log; log10 shrinks the
    # exploration bonus by a constant factor. Kept as-is to preserve tuning.
    exploration = 2 * math.log10(observation.step + 1)
    for i in range(bandit_count):
        if i == previous_bandit:
            continue
        pulls = numbers_of_selections[i]
        if pulls > 0:
            upper_bound = sums_of_reward[i] / pulls + math.sqrt(exploration / pulls)
        else:
            upper_bound = math.inf
        if upper_bound > max_upper_bound:
            max_upper_bound = upper_bound
            bandit = i

    # Record the pull exactly once, after the final choice is known.
    last_bandit = bandit
    numbers_of_selections[bandit] += 1

    return bandit

In [None]:
from kaggle_environments import make

# Shared multi-armed-bandit ("mab") environment used by the env.run / env.render cells.
env = make("mab", debug=True)
env.reset()


In [None]:
# Play one episode, random baseline vs the agent written to submission.py, then render it in the notebook.
env.run(["random_agent.py","submission.py"])
env.render(mode="ipython", width=800, height=800)

In [None]:
# env.reset()
# env.run(["submission.py", "submission.py"])
# env.render(mode="ipython", width=800, height=800)

In [None]:
def check(file1, file2, rounds=5):
    """Play several "mab" episodes between two agent files and print each final score.

    Args:
        file1: Path to the first agent's source file.
        file2: Path to the second agent's source file.
        rounds: Number of episodes to play (default 5, the previous fixed value).
    """
    env = make("mab", debug=True)

    for i in range(rounds):
        env.run([file1, file2])
        # The last recorded step holds one entry per player; read each player's reward.
        final_step = env.steps[-1]
        p1_score = final_step[0]['reward']
        p2_score = final_step[1]['reward']
        env.reset()
        print(f"Round {i+1}: {p1_score} - {p2_score}")



In [None]:
# submission.py contains ucb_agent, so label this comparison as UCB rather than epsilon-greedy.
print('Default vs ucb')
check("random_agent.py","submission.py")

In [None]:
%%writefile epsilon_greedy.py

import math
import random

epsilon = 0.3                 # probability of pulling a uniformly random bandit

# Per-episode state shared across calls (the environment calls the agent once per step).
last_bandit = -1              # index pulled on the previous step, -1 before the first pull
total_reward = 0              # last seen value of observation.reward (treated as a running total)

sums_of_reward = None         # per-bandit sum of rewards credited so far
numbers_of_selections = None  # per-bandit pull counts
random.seed(50)

def agent(observation, configuration):
    """Epsilon-greedy bandit agent.

    The reward gained since the previous call is credited to the bandit pulled on
    the previous step. With probability ``epsilon`` a uniformly random bandit is
    pulled; otherwise the bandit with the highest mean reward is pulled, skipping
    the one pulled on the previous step. Never-pulled bandits score infinity.

    Args:
        observation: Object exposing ``step`` and ``reward``.
        configuration: Object exposing ``banditCount``.

    Returns:
        Index of the bandit to pull on this step.
    """
    global sums_of_reward, numbers_of_selections, last_bandit, total_reward

    bandit_count = configuration.banditCount

    if observation.step == 0:
        # Start of an episode: drop everything learned in a previous one.
        numbers_of_selections = [0] * bandit_count
        sums_of_reward = [0] * bandit_count
        last_bandit = -1
        total_reward = 0

    if last_bandit > -1:
        reward = observation.reward - total_reward
        sums_of_reward[last_bandit] += reward
        total_reward += reward

    if random.random() < epsilon:
        bandit = random.randint(0, bandit_count - 1)
    else:
        # Freeze the previous pull before scanning so the exclusion test is not
        # affected by tentative picks made during the scan.
        previous_bandit = last_bandit
        bandit = 0
        # Start below any real score: with a threshold of 0, arms whose mean is 0
        # could never be selected and the fallback index was returned untracked.
        max_upper_bound = -math.inf

        for i in range(bandit_count):
            if i == previous_bandit:
                continue
            pulls = numbers_of_selections[i]
            if pulls > 0:
                upper_bound = sums_of_reward[i] / pulls
            else:
                upper_bound = math.inf
            if upper_bound > max_upper_bound:
                max_upper_bound = upper_bound
                bandit = i

    # Always remember what was actually pulled so the next reward is credited correctly.
    last_bandit = bandit
    numbers_of_selections[bandit] += 1

    return bandit

In [None]:
# Play one episode, random baseline vs the agent written to epsilon_greedy.py, then render it in the notebook.
env.run(["random_agent.py","epsilon_greedy.py"])
env.render(mode="ipython", width=800, height=500)

In [None]:
# "Default" is the random baseline; check() prints both players' final rewards for each round.
print('Default vs epsilon-greedy')
check("random_agent.py","epsilon_greedy.py")

In [None]:
%%writefile epsilon_greedy_decay.py

import math
import random

epsilon = 0.3                 # probability of pulling a uniformly random bandit

# Per-episode state shared across calls (the environment calls the agent once per step).
last_bandit = -1              # index pulled on the previous step, -1 before the first pull
total_reward = 0              # last seen value of observation.reward (treated as a running total)

sums_of_reward = None         # per-bandit sum of rewards credited so far
numbers_of_selections = None  # per-bandit pull counts
random.seed(50)

def agent(observation, configuration):
    """Epsilon-greedy bandit agent whose value estimate decays with the pull count.

    The reward gained since the previous call is credited to the bandit pulled on
    the previous step. With probability ``epsilon`` a uniformly random bandit is
    pulled; otherwise each bandit except the previous one is scored as
    ``0.97 ** pulls * mean reward`` and the best is pulled. Never-pulled bandits
    score infinity.

    Args:
        observation: Object exposing ``step`` and ``reward``.
        configuration: Object exposing ``banditCount``.

    Returns:
        Index of the bandit to pull on this step.
    """
    global sums_of_reward, numbers_of_selections, last_bandit, total_reward

    bandit_count = configuration.banditCount

    if observation.step == 0:
        # Start of an episode: drop everything learned in a previous one.
        numbers_of_selections = [0] * bandit_count
        sums_of_reward = [0] * bandit_count
        last_bandit = -1
        total_reward = 0

    if last_bandit > -1:
        reward = observation.reward - total_reward
        sums_of_reward[last_bandit] += reward
        total_reward += reward

    if random.random() < epsilon:
        bandit = random.randint(0, bandit_count - 1)
    else:
        # Freeze the previous pull before scanning so the exclusion test is not
        # affected by tentative picks made during the scan.
        previous_bandit = last_bandit
        bandit = 0
        # Start below any real score: with a threshold of 0, arms whose score is 0
        # could never be selected and the fallback index was returned untracked.
        max_upper_bound = -math.inf

        for i in range(bandit_count):
            if i == previous_bandit:
                continue
            pulls = numbers_of_selections[i]
            if pulls > 0:
                decay = 0.97 ** pulls
                upper_bound = decay * (sums_of_reward[i] / pulls)
            else:
                upper_bound = math.inf
            if upper_bound > max_upper_bound:
                max_upper_bound = upper_bound
                bandit = i

    # Always remember what was actually pulled so the next reward is credited correctly.
    last_bandit = bandit
    numbers_of_selections[bandit] += 1

    return bandit

In [None]:
# "Default" is the random baseline; check() prints both players' final rewards for each round.
print('Default vs epsilon-greedy_decay')
check("random_agent.py","epsilon_greedy_decay.py")

In [None]:
%%writefile ucb_and_epsilon_greedy_decay.py

import math
import random

epsilon = 0.1                 # probability of pulling a uniformly random bandit

# Per-episode state shared across calls (the environment calls the agent once per step).
last_bandit = -1              # index pulled on the previous step, -1 before the first pull
total_reward = 0              # last seen value of observation.reward (treated as a running total)

sums_of_reward = None         # per-bandit sum of rewards credited so far
numbers_of_selections = None  # per-bandit pull counts
random.seed(50)

def agent(observation, configuration):
    """Epsilon-greedy agent scoring bandits by a decayed mean plus a UCB-style bonus.

    The reward gained since the previous call is credited to the bandit pulled on
    the previous step. With probability ``epsilon`` a uniformly random bandit is
    pulled; otherwise each bandit except the previous one is scored as
    ``0.97 ** pulls * mean reward + sqrt(2 * log10(step + 1) / pulls)`` and the
    best is pulled. Never-pulled bandits score infinity.

    Args:
        observation: Object exposing ``step`` and ``reward``.
        configuration: Object exposing ``banditCount``.

    Returns:
        Index of the bandit to pull on this step.
    """
    global sums_of_reward, numbers_of_selections, last_bandit, total_reward

    bandit_count = configuration.banditCount

    if observation.step == 0:
        # Start of an episode: drop everything learned in a previous one.
        numbers_of_selections = [0] * bandit_count
        sums_of_reward = [0] * bandit_count
        last_bandit = -1
        total_reward = 0

    if last_bandit > -1:
        reward = observation.reward - total_reward
        sums_of_reward[last_bandit] += reward
        total_reward += reward

    if random.random() < epsilon:
        bandit = random.randint(0, bandit_count - 1)
    else:
        # Freeze the previous pull before scanning. Updating last_bandit inside the
        # loop let the previous bandit be re-selected after an earlier tentative pick.
        previous_bandit = last_bandit
        bandit = 0
        max_upper_bound = -math.inf
        # NOTE(review): textbook UCB1 uses the natural log; log10 is kept to preserve tuning.
        exploration = 2 * math.log10(observation.step + 1)

        for i in range(bandit_count):
            if i == previous_bandit:
                continue
            pulls = numbers_of_selections[i]
            if pulls > 0:
                decay = 0.97 ** pulls
                delta_i = math.sqrt(exploration / pulls)
                upper_bound = decay * (sums_of_reward[i] / pulls) + delta_i
            else:
                upper_bound = math.inf
            if upper_bound > max_upper_bound:
                max_upper_bound = upper_bound
                bandit = i

    # Always remember what was actually pulled so the next reward is credited correctly.
    last_bandit = bandit
    numbers_of_selections[bandit] += 1

    return bandit

In [None]:
# "Default" is the random baseline; check() prints both players' final rewards for each round.
print('Default vs ucb_and_epsilon-greedy_decay')
check("random_agent.py","ucb_and_epsilon_greedy_decay.py")

In [None]:
%%writefile ucb_and_epsilon_greedy_decay2.py

import math
import random

epsilon = 0.2                 # probability of pulling a uniformly random bandit

# Per-episode state shared across calls (the environment calls the agent once per step).
last_bandit = -1              # index pulled on the previous step, -1 before the first pull
total_reward = 0              # last seen value of observation.reward (treated as a running total)

sums_of_reward = None         # per-bandit sum of rewards credited so far
numbers_of_selections = None  # per-bandit pull counts
random.seed(50)

def agent(observation, configuration):
    """Epsilon-greedy agent scoring bandits by the larger of a decayed mean and a UCB score.

    The reward gained since the previous call is credited to the bandit pulled on
    the previous step. With probability ``epsilon`` a uniformly random bandit is
    pulled; otherwise each bandit except the previous one is scored as
    ``max(0.97 ** pulls * mean, mean + sqrt(2 * log10(step + 1) / pulls))`` and
    the best is pulled. Never-pulled bandits score infinity.

    Args:
        observation: Object exposing ``step`` and ``reward``.
        configuration: Object exposing ``banditCount``.

    Returns:
        Index of the bandit to pull on this step.
    """
    global sums_of_reward, numbers_of_selections, last_bandit, total_reward

    bandit_count = configuration.banditCount

    if observation.step == 0:
        # Start of an episode: drop everything learned in a previous one.
        numbers_of_selections = [0] * bandit_count
        sums_of_reward = [0] * bandit_count
        last_bandit = -1
        total_reward = 0

    if last_bandit > -1:
        reward = observation.reward - total_reward
        sums_of_reward[last_bandit] += reward
        total_reward += reward

    if random.random() < epsilon:
        bandit = random.randint(0, bandit_count - 1)
    else:
        # Freeze the previous pull before scanning. Updating last_bandit inside the
        # loop let the previous bandit be re-selected after an earlier tentative pick.
        previous_bandit = last_bandit
        bandit = 0
        max_upper_bound = -math.inf
        # NOTE(review): textbook UCB1 uses the natural log; log10 is kept to preserve tuning.
        exploration = 2 * math.log10(observation.step + 1)

        for i in range(bandit_count):
            if i == previous_bandit:
                continue
            pulls = numbers_of_selections[i]
            if pulls > 0:
                decay = 0.97 ** pulls
                delta_i = math.sqrt(exploration / pulls)
                average_reward = sums_of_reward[i] / pulls
                # NOTE(review): decay <= 1 and delta_i >= 0, so for a non-negative
                # mean the second argument is always the larger one.
                upper_bound = max(decay * average_reward, average_reward + delta_i)
            else:
                upper_bound = math.inf
            if upper_bound > max_upper_bound:
                max_upper_bound = upper_bound
                bandit = i

    # Always remember what was actually pulled so the next reward is credited correctly.
    last_bandit = bandit
    numbers_of_selections[bandit] += 1

    return bandit

In [None]:
# "Default" is the random baseline; check() prints both players' final rewards for each round.
print('Default vs ucb_and_epsilon-greedy_decay2')
check("random_agent.py","ucb_and_epsilon_greedy_decay2.py")

In [None]:
%%writefile ucb_bayesian.py

import numpy as np
from scipy.stats import beta

epsilon = 0.1        # not used by agent()
decay=0.97           # scale applied to each observed reward before the posterior update
bandit=0             # bandit pulled on the previous step (bandit 0 is pulled first)
total_reward = 0     # last seen value of observation.reward (treated as a running total)
a,b,bound=[None]*3   # a/b: per-bandit Beta parameters, set at step 0; bound is not used by agent()
c=2                  # number of posterior standard deviations added to the mean

def agent(observation, configuration):
    """Bayesian-UCB bandit agent with a Beta(a, b) posterior per bandit.

    At step 0 every posterior is reset to Beta(1, 1) and bandit 0 is pulled. On
    later steps the reward gained since the previous call, scaled by ``decay``,
    updates the posterior of the bandit that was pulled on the previous step.
    The bandit maximising ``posterior mean + c * posterior std`` is then pulled.

    Args:
        observation: Object exposing ``step`` and ``reward``.
        configuration: Object exposing ``banditCount``.

    Returns:
        Index of the bandit to pull on this step.
    """
    global total_reward, a, b, bandit

    bandit_count = configuration.banditCount

    if observation.step == 0:
        # Start of an episode: reset posteriors and bookkeeping.
        a = [1] * bandit_count
        b = [1] * bandit_count
        bandit = 0
        total_reward = 0
        return bandit

    r = decay * (observation.reward - total_reward)
    total_reward = observation.reward

    # Credit the outcome to the bandit that produced it. This must happen before
    # the next bandit is chosen; otherwise the update lands on the wrong arm.
    a[bandit] += r
    b[bandit] += 1 - r

    bandit = max(
        range(bandit_count),
        key=lambda x: a[x] / float(a[x] + b[x]) + beta.std(a[x], b[x]) * c
    )

    return bandit

In [None]:
# "Default" is the random baseline; check() prints both players' final rewards for each round.
print('Default vs ucb_bayesian')
check("random_agent.py","ucb_bayesian.py")

In [None]:
%%writefile thompson.py

import numpy as np
from scipy.stats import beta

epsilon = 0.1        # not used by agent()
decay=0.97           # scale applied to each observed reward before the posterior update
bandit=0             # bandit pulled on the previous step (bandit 0 is pulled first)
total_reward = 0     # last seen value of observation.reward (treated as a running total)
a,b,bound=[None]*3   # a/b: per-bandit Beta parameters, set at step 0; bound is not used by agent()
c=4                  # number of posterior standard deviations added to the mean

def agent(observation, configuration):
    """Beta-posterior bandit agent choosing by ``posterior mean + c * posterior std``.

    NOTE(review): despite the file name, nothing is sampled from the posterior;
    the rule is a deterministic upper-confidence score with ``c = 4``.

    At step 0 every posterior is reset to Beta(1, 1) and bandit 0 is pulled. On
    later steps the reward gained since the previous call, scaled by ``decay``,
    updates the posterior of the bandit that was pulled on the previous step.
    The bandit with the highest score is then pulled.

    Args:
        observation: Object exposing ``step`` and ``reward``.
        configuration: Object exposing ``banditCount``.

    Returns:
        Index of the bandit to pull on this step.
    """
    global total_reward, a, b, bandit

    bandit_count = configuration.banditCount

    if observation.step == 0:
        # Start of an episode: reset posteriors and bookkeeping.
        a = [1] * bandit_count
        b = [1] * bandit_count
        bandit = 0
        total_reward = 0
        return bandit

    r = decay * (observation.reward - total_reward)
    total_reward = observation.reward

    # Credit the outcome to the bandit that produced it. This must happen before
    # the next bandit is chosen; otherwise the update lands on the wrong arm.
    a[bandit] += r
    b[bandit] += 1 - r

    bandit = max(
        range(bandit_count),
        key=lambda x: a[x] / float(a[x] + b[x]) + beta.std(a[x], b[x]) * c
    )

    return bandit