In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pymc3 as pm

# Plot styling: 'grayscale' is applied first, then 'seaborn-whitegrid';
# passing a list applies the styles in order, exactly like two separate calls.
plt.style.use(['grayscale', 'seaborn-whitegrid'])

# Seed NumPy's global RNG so the simulated rewards are repeatable on re-run.
np.random.seed(0)



In [2]:
# Every combination of two binary features, first feature varying slowest:
# [0, 0], [0, 1], [1, 0], [1, 1].
arms = [[first, second] for first in (0, 1) for second in (0, 1)]

In [3]:
class Env(object):
  """Simulated bandit environment with Bernoulli rewards.

  The success probability of an arm is a logistic function of its two
  features. All methods are static, so they can be called either as
  ``Env.p(arm)`` or on an instance.
  """

  @staticmethod
  def p(arm):
    """Return the success probability of ``arm``.

    Args:
      arm: indexable with at least two numeric entries (the arm's features).

    Returns:
      sigmoid(0.2 * arm[0] + 0.8 * arm[1] - 4).
    """
    x = arm[0] * 0.2 + arm[1] * 0.8 - 4
    return 1 / (1 + np.exp(-x))

  @staticmethod
  def react(arm):
    """Draw one Bernoulli reward (1 or 0) for ``arm``.

    Consumes exactly one value from NumPy's global random state.
    """
    return 1 if np.random.random() < Env.p(arm) else 0

  @staticmethod
  def opt(candidates=None):
    """Return the index of the arm with the highest success probability.

    Args:
      candidates: optional sequence of arms to choose among. When omitted,
        the module-level ``arms`` list is used.

    Returns:
      Index (as returned by ``np.argmax``) of the best arm in the sequence.
    """
    if candidates is None:
      candidates = arms
    return np.argmax([Env.p(arm) for arm in candidates])

In [4]:
class OracleAgent:
  """Baseline agent that always plays the arm reported by ``Env.opt()``."""

  def __init__(self):
    # The arm is looked up once at construction and never updated afterwards.
    best_arm = Env.opt()
    self.arm = best_arm

  def get_arm(self):
    """Return the arm index stored at construction."""
    return self.arm

  def sample(self, arm, reward):
    """Accept an (arm, reward) observation and ignore it."""
    return None

In [5]:
class LinUCBAgent(object):
  """Linear UCB agent over a fixed set of arms.

  Each arm is described by its two features plus a constant 1 (bias term).
  The agent maintains a matrix ``A`` and a vector ``b`` that are updated on
  every observation; arm scores are the predicted mean plus ``alpha`` times
  the predictive standard deviation.

  Args:
    alpha: weight of the uncertainty bonus in the UCB score (default 1).
    sigma: value whose square divides every update to ``A`` and ``b``
      (default 1).
    arm_features: optional sequence of ``[x0, x1]`` arms. When omitted, the
      module-level ``arms`` list is used.
  """

  def __init__(self, alpha=1, sigma=1, arm_features=None):
    if arm_features is None:
      arm_features = arms
    # Columns are arms, rows are features: shape (3, number_of_arms).
    self.phis = np.array([[arm[0], arm[1], 1] for arm in arm_features]).T
    self.alpha = alpha
    self.sigma = sigma
    n_features = self.phis.shape[0]
    self.A = np.identity(n_features)
    self.b = np.zeros((n_features, 1))

  def get_arm(self):
    """Return the index of the arm with the highest upper confidence bound."""
    inv_A = np.linalg.inv(self.A)
    mu = inv_A.dot(self.b)
    S = inv_A
    pred_mean = self.phis.T.dot(mu)
    # Only the diagonal (per-arm variance) of this matrix is used below.
    pred_var = self.phis.T.dot(S).dot(self.phis)
    ucb = pred_mean.T + self.alpha * np.sqrt(np.diag(pred_var))
    return np.argmax(ucb)

  def sample(self, arm_index, reward):
    """Update ``A`` and ``b`` with the reward observed for one arm.

    Args:
      arm_index: column index into ``self.phis`` of the arm that was played.
      reward: observed numeric reward.
    """
    # Indexing with a list keeps phi as a column vector of shape (3, 1).
    phi = self.phis[:, [arm_index]]
    self.b = self.b + phi * reward / (self.sigma ** 2)
    self.A = self.A + phi.dot(phi.T) / (self.sigma ** 2)