In [None]:
# I want to compare a few approaches to solving the multi-armed bandit problem.
# To me, this seems like a stupid application of reinforcement learning. Standard
# statistical modelling methods should outperform RL on a per-sample basis, i.e.,
# for a fixed-size training set, statistical modelling will give larger average
# payouts. The experimental setup is as follows:
#
# Consider an n-arm bandit whose payouts are given by:
#   i. n Gaussian random variables
#   ii. n uniform random variables
#   iii. A mixture of i. and ii.
#
# ... TBD
#

In [2]:
import numpy as np
import tensorflow as tf

In [45]:
class BanditGaussian(object):
    """n-armed bandit whose levers pay out Gaussian-distributed rewards.

    Parameters
    ----------
    n : int
        Number of levers. Must equal ``len(mu)`` and ``len(sigma)``.
    mu : sequence of float
        Mean reward of each lever.
    sigma : sequence of float
        Standard deviation of the reward of each lever.

    Raises
    ------
    AssertionError
        If ``n``, ``len(mu)`` and ``len(sigma)`` do not all agree.
    """

    def __init__(self, n=4, mu=[0.1, 0.2, 0.5, 0.9], sigma=[10, 1, 8, 4]):

        # Validate before touching instance state.
        assert n == len(mu) == len(sigma)

        self.n = n
        # Copy the sequences: the default lists are shared between calls, so
        # storing them directly would let `bandit.mu[0] = x` silently change
        # the defaults of every bandit created afterwards.
        self.mu = list(mu)
        self.sigma = list(sigma)

    def draw(self, lever, N=1):
        """Pull `lever` N times.

        Returns a 1-D numpy array of N rewards sampled from a normal
        distribution with mean ``mu[lever]`` and standard deviation
        ``sigma[lever]``.
        """
        mu, sigma = self.mu[lever], self.sigma[lever]
        return mu + sigma * np.random.randn(N)

In [49]:
class BanditBernouli(object):
    """n-armed bandit whose levers pay out boolean (0/1) rewards.

    Parameters
    ----------
    n : int
        Number of levers. Must equal ``len(ps)``.
    ps : sequence of float
        Per-lever threshold in [0, 1]. NOTE: `draw` pays out when a uniform
        sample *exceeds* the threshold, so lever ``i`` pays out with
        probability ``1 - ps[i]``; the lever with the smallest entry is the
        most rewarding one.

    Raises
    ------
    AssertionError
        If ``n != len(ps)`` or any threshold lies outside [0, 1].
    """

    def __init__(self, n=4, ps=[0.1, 0.2, 0.5, 0.9]):

        # Same length check as BanditGaussian: `n` is used by callers to size
        # their own arrays, so it must agree with the number of thresholds.
        assert n == len(ps)
        assert 0 <= min(ps) <= max(ps) <= 1

        self.n = n
        # Copy so that mutating `bandit.ps` cannot alter the shared default list.
        self.ps = list(ps)

    def draw(self, lever, N=1):
        """Pull `lever` N times.

        Returns a 1-D boolean numpy array of length N; each entry is True
        when a uniform [0, 1) sample is greater than ``ps[lever]``.
        """
        p = self.ps[lever]
        return np.random.rand(N) > p

In [4]:
    
class Agent(object):
    """Base class for an agent that interacts with a bandit.

    Declared as a class (the original used `def`, which made `Agent` a
    function returning None and left `__init__` as unreachable nested code).
    """

    def __init__(self, bandit):
        """Remember the bandit this agent will pull levers on."""
        self.bandit = bandit

# Super naive

In [46]:
# Gaussian bandit with the default four levers, used by the naive estimate below.
b = BanditGaussian()

In [31]:
#
# Pull each lever in batches of N draws until the standard error of the mean
# reward drops below REL_TOL times the estimated mean. The stopping rule was
# written with Gaussian rewards in mind, but nothing in it is specific to
# that distribution.
#

N = 1000                 # draws requested from the bandit per batch
REL_TOL = 0.1            # target relative standard error of the mean
MAX_DRAWS = 5 * 10**6    # hard cap so a lever with ~zero mean cannot loop forever

for lever in range(b.n):
    # Seed values chosen only so that the loop body runs at least once.
    mu = 1.
    dmu = 10000 * mu
    rewards = np.array([])

    # `mu == 0` is tested first: dividing by a zero mean yields nan/inf, and
    # `nan > REL_TOL` is False, which would end the loop as if it had converged.
    while len(rewards) < MAX_DRAWS and (mu == 0 or abs(dmu / mu) > REL_TOL):
        rewards = np.hstack((rewards, b.draw(lever, N=N)))
        mu = rewards.mean()
        dmu = rewards.std() / np.sqrt(len(rewards))

    # Columns: lever, mean, std, relative standard error, number of draws.
    # A single pre-formatted string prints identically on Python 2 and 3.
    rel_err = dmu / mu if mu != 0 else float("inf")
    print("%s %s %s %s %s" % (lever, rewards.mean(), rewards.std(), rel_err, len(rewards)))

0 0.10242290394681143 10.001510741608246 0.09997567231516227 954000
1 0.1890801221704046 0.9836282578851897 0.09497832256840566 3000
2 0.46654899071953837 8.000647610829754 0.09739732766762933 31000
3 0.8816140314581021 3.9928211746221645 0.08268766706627168 3000


In [54]:
# Switch to the Bernoulli bandit (default four levers) for the same naive estimate.
b = BanditBernouli()

In [56]:
#
# Same stopping rule as for the Gaussian bandit, now applied to 0/1 rewards:
# pull each lever in batches of N draws until the standard error of the mean
# reward drops below REL_TOL times the estimated mean.
#

N = 100               # draws requested from the bandit per batch
REL_TOL = 0.1         # target relative standard error of the mean
MAX_DRAWS = 10**6     # hard cap so a lever that never pays out cannot loop forever

for lever in range(b.n):
    # Seed values chosen only so that the loop body runs at least once.
    mu = 1.
    dmu = 10000 * mu
    rewards = np.array([])

    # With 0/1 rewards a batch can be all zeros, giving mean 0 and std 0.
    # `0 / 0` is nan and `nan > REL_TOL` is False, so without the explicit
    # `mu == 0` test the loop would stop and report a "converged" mean of 0.
    while len(rewards) < MAX_DRAWS and (mu == 0 or abs(dmu / mu) > REL_TOL):
        rewards = np.hstack((rewards, b.draw(lever, N=N)))
        mu = rewards.mean()
        dmu = rewards.std() / np.sqrt(len(rewards))

    # Columns: lever, mean, std, relative standard error, number of draws.
    # A single pre-formatted string prints identically on Python 2 and 3.
    rel_err = dmu / mu if mu != 0 else float("inf")
    print("%s %s %s %s %s" % (lever, rewards.mean(), rewards.std(), rel_err, len(rewards)))

0 0.89 0.31288975694324034 0.03515615246553262 100
1 0.81 0.39230090491866065 0.04843221048378526 100
2 0.47 0.49909918853871116 0.07508860014902678 200
3 0.10777777777777778 0.3100995459446237 0.09590707606534754 900


# RL approach

In [50]:
# Fresh Bernoulli bandit (default four levers) for the RL experiment.
b = BanditBernouli()

In [51]:
tf.reset_default_graph()

# Feed-forward part of the "network": one trainable weight per lever, all
# starting at 1. The greedy action is the index of the largest weight.
weights = tf.Variable(tf.ones([b.n]))
chosen_action = tf.argmax(weights, 0)

# Training procedure. The observed reward and the action that produced it are
# fed in; the loss is -log(weight of that action) * reward, and a plain
# gradient-descent step on it updates the weights.
reward_holder = tf.placeholder(tf.float32, shape=[1])
action_holder = tf.placeholder(tf.int32, shape=[1])
responsible_weight = tf.slice(weights, begin=action_holder, size=[1])
loss = -(tf.log(responsible_weight) * reward_holder)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
update = optimizer.minimize(loss)

In [52]:
total_episodes = 1000           # Total number of episodes to train the agent on.
total_reward = np.zeros(b.n)    # Per-lever scoreboard of accumulated reward.
e = 0.1                         # Chance of taking a random (exploratory) action.

# `tf.initialize_all_variables` is deprecated in favour of this initializer.
init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    # Read the initial weights so `ww` exists even when total_episodes == 0.
    ww = sess.run(weights)

    for i in range(total_episodes):

        # Epsilon-greedy: explore with probability e, otherwise take the
        # action the network currently rates highest.
        if np.random.rand() < e:
            action = np.random.randint(b.n)
        else:
            action = sess.run(chosen_action)

        reward = b.draw(action)[0]  # Reward from pulling the chosen lever once.

        # Update the network with this (action, reward) pair.
        _, resp, ww = sess.run(
            [update, responsible_weight, weights],
            feed_dict={reward_holder: [reward], action_holder: [action]},
        )

        # Update our running tally of scores.
        total_reward[action] += reward
        if i % 50 == 0:
            print("Running reward for the " + str(b.n) + " bandits: " + str(total_reward))

print("The agent thinks bandit " + str(np.argmax(ww)+1) + " is the most promising....")
# A lever pays out when a uniform sample exceeds its entry in `b.ps`, so the
# best lever is the one with the smallest entry.
if np.argmax(ww) == np.argmin(b.ps):
    print("...and it was right!")
else:
    print("...and it was wrong!")

Running reward for the 4 bandits: [1. 0. 0. 0.]
Running reward for the 4 bandits: [42.  0.  1.  0.]
Running reward for the 4 bandits: [79.  0.  2.  0.]
Running reward for the 4 bandits: [122.   2.   3.   0.]
Running reward for the 4 bandits: [163.   2.   3.   0.]
Running reward for the 4 bandits: [206.   3.   3.   0.]
Running reward for the 4 bandits: [250.   6.   3.   0.]
Running reward for the 4 bandits: [293.   8.   3.   2.]
Running reward for the 4 bandits: [327.   8.   3.   2.]
Running reward for the 4 bandits: [372.   8.   3.   3.]
Running reward for the 4 bandits: [413.   8.   3.   3.]
Running reward for the 4 bandits: [455.  10.   3.   3.]
Running reward for the 4 bandits: [494.  12.   4.   4.]
Running reward for the 4 bandits: [534.  12.   4.   4.]
Running reward for the 4 bandits: [576.  12.   5.   4.]
Running reward for the 4 bandits: [617.  14.   7.   4.]
Running reward for the 4 bandits: [663.  15.   7.   4.]
Running reward for the 4 bandits: [706.  17.   8.   4.]
Running 