This notebook shows how to use multi-armed bandit algorithms. It presents the softmax and upper-confidence-bound (UCB) strategies for selecting an action.
If you find it helpful, give it a vote :3

In [None]:
!pip install kaggle-environments --upgrade

In [None]:
%%writefile submission.py

import numpy as np
from scipy import stats
import random

class Bandit:
    """Multi-armed bandit learner with three action-selection rules.

    The learner keeps, for every arm, the number of times it pulled that arm
    and the running mean of the rewards it received from it.  An action is
    picked by softmax sampling, by epsilon-greedy, or by epsilon-greedy whose
    greedy step uses an upper confidence bound (UCB).
    """

    def __init__(self, k=10, eps=0.2, lr=0.1, ucb=False, soft_max=False, c=2):
        """
        k: the number of arms
        eps: epsilon-greedy parameter, the probability of a random (exploring) pull
        lr: step size for an incremental update; currently unused because
            update_record keeps the exact sample mean
        ucb: use the upper confidence bound in the greedy step
        soft_max: sample the action from a softmax over the mean rewards
        c: exploration weight of the UCB bonus
        """
        self.k = k
        self.eps = eps
        self.lr = lr
        # Optimistic initial value of each arm, drawn from N(1, 1).
        # Not used by the current update rule; kept for experimentation.
        self.initial_values = [np.random.randn() + 1 for _ in range(self.k)]
        # for ucb
        self.ucb = ucb
        self.times = 0
        self.c = c
        # for softmax action selection
        self.soft_max = soft_max

        # columns: number of pulls, average reward
        self.record = np.zeros((self.k, 2))

        # cumulative reward seen in the most recent observation
        self.total_reward = 0

        # action returned by the previous call to one_play (None before the
        # first call); the next observation's reward belongs to this action
        self.last_action = None

    def get_reward(self, observation):
        """Return the reward gained since the previous observation.

        observation["reward"] is treated as a running total, so the gain is
        the difference to the stored total.  When nothing was gained a small
        constant (0.3) is returned instead of zero.
        """
        no_reward_step = 0.3
        last_reward = observation["reward"] - self.total_reward
        self.total_reward = observation["reward"]

        if last_reward > 0:
            return last_reward
        return no_reward_step

    def update_record(self, action, r):
        """Fold reward r into the running mean of arm `action` and count the pull."""
        pulls = self.record[action, 0]
        mean = self.record[action, 1]
        self.record[action, 1] = (pulls * mean + r) / (pulls + 1)
        self.record[action, 0] = pulls + 1

    def softmax(self, av, tau=1.12):
        """Return the softmax of `av` with temperature `tau`.

        The maximum is subtracted before exponentiating; this leaves the
        result unchanged mathematically but avoids overflow for large values.
        """
        scaled = np.asarray(av, dtype=float) / tau
        weights = np.exp(scaled - np.max(scaled))
        return weights / np.sum(weights)

    def choose_action(self):
        """Pick the index of the arm to pull next."""
        if self.soft_max:
            p = self.softmax(self.record[:, 1], tau=0.7)
            return np.random.choice(np.arange(self.k), p=p)

        # explore: a uniformly random arm with probability eps
        if random.random() < self.eps:
            return np.random.randint(self.k)

        # exploit
        if not self.ucb:
            return np.argmax(self.record[:, 1], axis=0)

        if self.times == 0:
            # log(0) is undefined and nothing has been observed yet
            return np.random.randint(self.k)

        # The bonus shrinks with the number of pulls of *each arm*, so rarely
        # tried arms get a larger bound.  The 0.1 keeps the denominator
        # positive for arms that were never pulled.
        pulls = self.record[:, 0]
        bonus = self.c * np.sqrt(np.log(self.times) / (pulls + 0.1))
        return np.argmax(self.record[:, 1] + bonus)

    def one_play(self, observation):
        """Learn from `observation`, then choose and return the next action.

        The reward carried by `observation` was produced by the action this
        object returned on the previous call, so it is credited to that
        action before a new one is chosen.  On the first call there is no
        previous action and nothing is recorded.
        """
        # Always read the reward so the stored running total stays in sync.
        r = self.get_reward(observation)
        if self.last_action is not None:
            self.update_record(self.last_action, r)

        self.times = observation.step  # update for ucb

        action = int(self.choose_action())
        self.last_action = action
        return action

bandit = None


def multi_armed_bandit_agent(observation, configuration):
    """Agent entry point: return the arm to pull for this step.

    A new Bandit (UCB enabled, softmax disabled) is built whenever the
    observation reports step 0; every call, including that first one, is then
    delegated to Bandit.one_play.
    """
    global bandit
    # Step 0 marks the start of an episode, so start from a fresh learner.
    if observation.step == 0:
        bandit = Bandit(k=configuration['banditCount'], ucb=True, soft_max=False)

    return bandit.one_play(observation)




# Contextual Bandit

In [None]:
# # %%writefile submission_contextualbandit.py

# import torch as th
# import numpy as np
# from torch.autograd import Variable
# from matplotlib import pyplot as plt
# import random

# class ContextBandit:
#     def __init__(self, arms=10):
#         self.arms = arms
#         self.init_distribution(arms)
#         self.update_state()
        
#     def init_distribution(self, arms):
#         #num states equals num arms to keep things simple
#         self.bandit_matrix = np.random.rand(arms, arms)
        
#     def reward(self, prob, n=10):
#         reward = 0
#         for i in range(n):
#             if random.random() < prob:
#                 reward += 1
#         return reward
#     def get_state(self):
#         return self.state
    
#     def update_state(self):
#         self.state = np.random.randint(0, self.arms)
        
#     def get_reward(self, arm):
#         return self.reward(self.bandit_matrix[self.get_state()][arm])
    
#     def choose_arm(self, arm):
#         reward = self.get_reward(arm)
#         self.update_state()
#         return reward


# def softmax(av, tau=0.7):
#     n = len(av)
#     probs = np.zeros(n)
#     for i in range(n):
#         softm = (np.exp(av[i]/tau)/np.sum(np.exp(av[:]/tau)))
#         probs[i] = softm
#     return probs

# def one_hot(N, pos, val=1):
#     one_hot_vec = np.zeros(N)
#     one_hot_vec[pos] = val
#     return one_hot_vec

# arms = 10
# # N is the batch size, D_in is input dimension
# # H is hidden dimension D_out is output dimension
# N, D_in, H, D_out = 1, arms, 100, arms

# model = th.nn.Sequential(
#     th.nn.Linear(D_in, H),
#     th.nn.ReLU(),
#     th.nn.Linear(H, D_out),
#     th.nn.ReLU(),
# )
# loss_fn = th.nn.MSELoss(size_average=False)

# env = ContextBandit(arms)

# def train(env):
#     epochs = 50000
#     # one-hot encode current state
#     cur_state = Variable(th.Tensor(one_hot(arms, env.get_state())))
#     reward_hist = np.zeros(50)
#     reward_hist[:] = 5
#     runningMean = np.average(reward_hist)
#     lr = 1e-2
#     optimizer = th.optim.Adam(model.parameters(), lr=lr)
#     plt.xlabel("Plays")
#     plt.ylabel("Mean Reward")
#     for i in range(epochs):
#         y_pred = model(cur_state) # produce reward prediction
#         av_softmax = softmax(y_pred.data.numpy(), tau=2.0) #turn reward distribution into probability distribution
#         av_softmax /= av_softmax.sum() #make sure total prob adds to 1
#         action = np.random.choice(arms, p=av_softmax)
#         cur_reward = env.choose_arm(action)
#         one_hot_reward = y_pred.data.numpy().copy()
#         one_hot_reward[action] = cur_reward
#         reward = Variable(th.Tensor(one_hot_reward))
#         loss=loss_fn(y_pred, reward)

#         if i%50 == 0:
#             runningMean = np.average(reward_hist)
#             reward_hist[:] = 0
#             plt.scatter(i, runningMean)
#         reward_hist[i%50]=cur_reward
#         optimizer.zero_grad()
#         loss.backward()

#         optimizer.step()
#         cur_state = Variable(th.Tensor(one_hot(arms, env.get_state())))
        
#         th.save(model.state_dict(),'contextual_weights.pt')

# train(env)

In [None]:
%%writefile submission2new.py

import json
import numpy as np
import pandas as pd

# Per-arm Beta parameters: bandit_state[k] == [wins + 1, losses + 1].
bandit_state = None
# Cumulative reward reported by the most recent observation.
total_reward = 0
# Arm this agent returned on its previous call.
last_step = None


def multi_armed_bandit_agent(observation, configuration):
    """Thompson-sampling agent: draw from each arm's Beta posterior, pull the best.

    All state lives in module globals and is re-initialised whenever the
    observation reports step 0, so a reused module starts every episode
    clean (a stale reward total would otherwise make wins look like losses).

    observation: needs `.step`, `["reward"]` (treated as a running total) and,
        after step 0, `.lastActions` with the two players' previous arms.
    configuration: needs `["banditCount"]`.
    Returns the chosen arm index as a plain int.
    """
    global bandit_state, total_reward, last_step

    no_reward_step = 0.3  # weight added to the loss count after a miss
    decay_rate = 0.97  # how much do we decay the win count after each pull

    if observation.step == 0:
        # initial bandit state, plus a reset of the per-episode bookkeeping
        bandit_state = [[1, 1] for _ in range(configuration["banditCount"])]
        total_reward = 0
        last_step = None
    else:
        # updating bandit_state using the result of the previous step
        last_reward = observation["reward"] - total_reward
        total_reward = observation["reward"]

        # We need to work out whether we are player 0 or 1: compare our last
        # choice with the second entry.  If both players pulled the same arm
        # the index may be wrong, but it then points at the same arm anyway.
        player = int(last_step == observation.lastActions[1])
        own_arm = observation.lastActions[player]

        if last_reward > 0:
            bandit_state[own_arm][0] += last_reward
        else:
            bandit_state[own_arm][1] += no_reward_step

        # Decay the win count of every arm that was pulled, once per pull
        # (so twice when both players picked the same arm).
        for arm in (observation.lastActions[0], observation.lastActions[1]):
            bandit_state[arm][0] = (bandit_state[arm][0] - 1) * decay_rate + 1

    # Draw one sample per arm from its Beta distribution and take the luckiest.
    wins = [state[0] for state in bandit_state]
    losses = [state[1] for state in bandit_state]
    samples = np.random.beta(wins, losses)
    best_agent = int(np.argmax(samples))

    last_step = best_agent
    return best_agent

# Random Agent

In [None]:
%%writefile random_agent.py

import random

def random_agent(observation, configuration):
    """Return a uniformly random arm index; the observation is ignored."""
    arm_count = configuration.banditCount
    return random.randrange(arm_count)

# Test with default Agent

In [None]:
from kaggle_environments import make

# Play five episodes of submission.py against the agent stored at
# ../input/santa-2020/submission.py and print both final rewards.
env = make("mab", debug=True)

for i in range(5):
    env.run(["submission.py", "../input/santa-2020/submission.py"])
    # Last recorded step; presumably one entry per agent, in the order given to run().
    final_step = env.steps[-1]
    p1_score, p2_score = final_step[0]['reward'], final_step[1]['reward']
    env.reset()
    print(f"Round {i+1}: {p1_score} - {p2_score}")

# Test with other Agent

In [None]:
from kaggle_environments import make

# Play five episodes of submission.py against submission2new.py and print
# both final rewards.
env = make("mab", debug=True)

for i in range(5):
    env.run(["submission.py", "submission2new.py"])
    # Last recorded step; presumably one entry per agent, in the order given to run().
    final_step = env.steps[-1]
    p1_score, p2_score = final_step[0]['reward'], final_step[1]['reward']
    env.reset()
    print(f"Round {i+1}: {p1_score} - {p2_score}")

# Self Test

In [None]:
# Single episode of submission.py playing against a second copy of itself.
# Reuses the `env` created by an earlier cell.
env.reset()
env.run(["submission.py", "submission.py"])
# env.render(mode="ipython", width=800, height=700)

# Last recorded step; presumably one entry per agent, in the order given to run().
final_step = env.steps[-1]
p1_score, p2_score = final_step[0]['reward'], final_step[1]['reward']
print(f"Round: {p1_score} - {p2_score}")