In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [2]:
class contextual_bandit():
    def __init__(self):
        self.state = 0
        #List out our bandits. Currently arms 4, 2, and 1 (respectively) are the most optimal.
        self.bandits = np.array([[0.2,0,-0.0,-5],[0.1,-5,1,0.25],[-5,5,5,5]])
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]
        
    def getBandit(self):
        self.state = np.random.randint(0,len(self.bandits)) #Returns a random state for each episode.
        return self.state
        
   

In [57]:
class agent():
    def __init__(self, lr, s_size,a_size):
        #These lines established the feed-forward part of the network. The agent takes a state and produces an action.
        self.state_in= tf.placeholder(shape=[1],dtype=tf.int32)
        state_in_OH = tf.one_hot(self.state_in,s_size)
        output = tf.layers.dense(state_in_OH,a_size,\
            activation=tf.nn.sigmoid, use_bias=False, bias_initializer=None,kernel_initializer=tf.ones_initializer())
        self.output = tf.reshape(output,[-1])
        self.chosen_action = tf.argmax(self.output,0)

        #The next six lines establish the training proceedure. We feed the reward and chosen action into the network
        #to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[1],dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1],dtype=tf.int32)
        self.responsible_weight = tf.slice(self.output,self.action_holder,[1])
        self.loss = (tf.log(self.responsible_weight)*self.reward_holder)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)

In [8]:
tf.reset_default_graph() #Clear the Tensorflow graph.

cBandit = contextual_bandit() #Load the bandits.
myAgent = agent(lr=0.001,s_size=cBandit.num_bandits,a_size=cBandit.num_actions) #Load the agent.
weights = tf.trainable_variables()[0] #The weights we will evaluate to look into the network.

total_episodes = 10000 #Set total number of episodes to train agent on.
total_reward = np.zeros([cBandit.num_bandits,cBandit.num_actions]) #Set scoreboard for bandits to 0.
e = 0.1 #Set the chance of taking a random action.

init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:
        s = cBandit.getBandit() #Get a state from the environment.
        
        #Choose either a random action or one from our network.
        if np.random.rand(1) < e:
            action = np.random.randint(cBandit.num_actions)
        else:
            action = sess.run(myAgent.chosen_action,feed_dict={myAgent.state_in:[s]})
        
        reward = cBandit.pullArm(action) #Get our reward for taking an action given a bandit.
        
        #Update the network.
        feed_dict={myAgent.reward_holder:[reward],myAgent.action_holder:[action],myAgent.state_in:[s]}
        _,ww = sess.run([myAgent.update,weights], feed_dict=feed_dict)
        
        #Update our running tally of scores.
        total_reward[s,action] += reward
        if i % 500 == 0:
            print("Mean reward for each of the " + str(cBandit.num_bandits) + " bandits: " + str(np.mean(total_reward,axis=1)))
        i+=1
for a in range(cBandit.num_bandits):
    print("The agent thinks action " + str(np.argmax(ww[a])+1) + " for bandit " + str(a+1) + " is the most promising....")
    if np.argmax(ww[a]) == np.argmin(cBandit.bandits[a]):
        print("...and it was right!")
    else:
        print("...and it was wrong!")

Mean reward for each of the 3 bandits: [-0.25  0.    0.  ]
Mean reward for each of the 3 bandits: [33.5  41.   33.25]
Mean reward for each of the 3 bandits: [72.5  77.25 69.5 ]
Mean reward for each of the 3 bandits: [108.25 121.75 103.75]
Mean reward for each of the 3 bandits: [146.5  161.75 136.5 ]
Mean reward for each of the 3 bandits: [186.   204.25 167.5 ]
Mean reward for each of the 3 bandits: [220.25 240.25 209.75]
Mean reward for each of the 3 bandits: [258.25 276.25 249.25]
Mean reward for each of the 3 bandits: [297.5  309.5  282.75]
Mean reward for each of the 3 bandits: [335.25 346.   320.5 ]
Mean reward for each of the 3 bandits: [372.   385.5  357.25]
Mean reward for each of the 3 bandits: [410.25 421.75 393.25]
Mean reward for each of the 3 bandits: [447.5  462.5  426.75]
Mean reward for each of the 3 bandits: [487.5  502.   461.75]
Mean reward for each of the 3 bandits: [523.5  541.   503.25]
Mean reward for each of the 3 bandits: [562.75 577.25 538.75]
Mean reward for e

In [29]:
myAgent.output

<tf.Tensor 'Reshape:0' shape=(4,) dtype=float32>

In [30]:
weights

<tf.Variable 'dense/kernel:0' shape=(3, 4) dtype=float32_ref>

In [65]:
import random

In [66]:
bandit_data = pd.DataFrame()

In [67]:
n = 10000
bandit_data['clicked_sports'] = np.random.choice(2, n, p=[0.6,0.4])
bandit_data['clicked_politics'] = np.random.choice(2, n, p=[0.7,0.3])
bandit_data['arm'] = np.random.choice(np.arange(0,3), n)

In [68]:
sp_map = {0:0.5,1:0.1,2:0.1}
p_map = {0:0.1,1:0.1,2:0.4}


In [69]:
bandit_data['sports_coef'] = bandit_data['arm'].map(sp_map)
bandit_data['politics_coef'] = bandit_data['arm'].map(p_map)


In [70]:
bandit_data['base'] = 1
bandit_data['click_factor'] =  bandit_data.base * bandit_data.arm_baseline + \
                      bandit_data.sports_coef * bandit_data.clicked_sports + \
                      bandit_data.politics_coef * bandit_data.clicked_politics

In [71]:
 def get_rew(row):
        #Get a random number.
        bandit = row['click_factor']
        result = np.random.randn(1)
        if result > bandit:
            #return a positive reward.
            return 1
        else:
            #return a negative reward.
            return -1

In [72]:
bandit_data['click'] = bandit_data.apply(get_rew,axis=1)

In [73]:
bandit_data.head()

Unnamed: 0,clicked_sports,clicked_politics,arm,sports_coef,politics_coef,arm_baseline,base,click_factor,click
0,1,0,0,0.5,0.1,0.1,1,0.6,1
1,0,0,2,0.1,0.4,0.1,1,0.1,1
2,0,0,1,0.1,0.1,0.2,1,0.2,-1
3,1,0,2,0.1,0.4,0.1,1,0.2,-1
4,1,1,0,0.5,0.1,0.1,1,0.7,1


In [83]:
bandit_data.click_factor.value_counts()

0.2    2953
0.1    2763
0.3    1533
0.6     951
0.5     605
0.6     411
0.7     407
0.4     377
Name: click_factor, dtype: int64

In [76]:
def get_state(row):
    if (row['clicked_sports']==1 and row['clicked_politics']==1):
        return 0
    elif (row['clicked_sports']==1 and row['clicked_politics']==0):
        return 1
    elif (row['clicked_sports']==0 and row['clicked_politics']==1):
        return 2
    else:
        return 3
        
        

In [77]:
bandit_data['state'] = bandit_data.apply(get_state,axis=1)

In [78]:
bandit_data.head()

Unnamed: 0,clicked_sports,clicked_politics,arm,sports_coef,politics_coef,arm_baseline,base,click_factor,click,state
0,1,0,0,0.5,0.1,0.1,1,0.6,1,1
1,0,0,2,0.1,0.4,0.1,1,0.1,1,3
2,0,0,1,0.1,0.1,0.2,1,0.2,-1,3
3,1,0,2,0.1,0.4,0.1,1,0.2,-1,1
4,1,1,0,0.5,0.1,0.1,1,0.7,1,0


In [93]:
bandit_data[(bandit_data.state==0) & (bandit_data.arm==1)]

Unnamed: 0,clicked_sports,clicked_politics,arm,sports_coef,politics_coef,arm_baseline,base,click_factor,click,state
61,1,1,1,0.1,0.1,0.2,1,0.4,-1,0
87,1,1,1,0.1,0.1,0.2,1,0.4,-1,0
97,1,1,1,0.1,0.1,0.2,1,0.4,1,0
189,1,1,1,0.1,0.1,0.2,1,0.4,1,0
210,1,1,1,0.1,0.1,0.2,1,0.4,1,0
219,1,1,1,0.1,0.1,0.2,1,0.4,-1,0
223,1,1,1,0.1,0.1,0.2,1,0.4,1,0
229,1,1,1,0.1,0.1,0.2,1,0.4,-1,0
247,1,1,1,0.1,0.1,0.2,1,0.4,-1,0
260,1,1,1,0.1,0.1,0.2,1,0.4,-1,0


In [80]:
bandit_data.groupby(['state','arm'])['click'].sum()

state  arm
0      0     -215
       1     -127
       2     -191
1      0     -429
       1     -213
       2     -216
2      0      -79
       1     -140
       2     -273
3      0     -126
       1     -268
       2     -147
Name: click, dtype: int64

In [84]:
bandit_data.groupby(['state','arm'])['click_factor'].mean()

state  arm
0      0      0.7
       1      0.4
       2      0.6
1      0      0.6
       1      0.3
       2      0.2
2      0      0.2
       1      0.3
       2      0.5
3      0      0.1
       1      0.2
       2      0.1
Name: click_factor, dtype: float64

In [82]:
tf.reset_default_graph() #Clear the Tensorflow graph.


myAgent = agent(lr=0.001,s_size=4,a_size=3) #Load the agent.
weights = tf.trainable_variables()[0] #The weights we will evaluate to look into the network.

total_reward = np.zeros([4,3]) #Set scoreboard for bandits to 0.
e = 0.1 #Set the chance of taking a random action.

init = tf.global_variables_initializer()
num_samples = len(bandit_data)
# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)

    for i in range(num_samples):
        s = bandit_data.loc[i]['state'].astype(int) #Get a state from the environment.
        
        #Choose either a random action or one from our network.
        if np.random.rand(1) < e:
            choose_action = np.random.randint(3)
        else:
            choose_action = sess.run(myAgent.chosen_action,feed_dict={myAgent.state_in:[s]})
            
        if bandit_data.loc[i]['arm'].astype(int) == choose_action:
            action = choose_action
           
       
        
            reward = bandit_data.iloc[i]['click'] #Get our reward for taking an action given a bandit.
        
            #Update the network.
            feed_dict={myAgent.reward_holder:[reward],myAgent.action_holder:[action],myAgent.state_in:[s]}
            _,ww = sess.run([myAgent.update,weights], feed_dict=feed_dict)

            #Update our running tally of scores.
            total_reward[s,action] += reward
            if i % 500 == 0:
                print("Mean reward for each of the " + str(4) + " bandits: " + str(np.mean(total_reward,axis=1)))
        else:
            continue
        
for a in range(4):
    print("The agent thinks action " + str(np.argmax(ww[a])) + " for bandit " + str(a) + " is the most promising....")
    

Mean reward for each of the 4 bandits: [0.         0.33333333 0.         0.        ]
Mean reward for each of the 4 bandits: [ -9.66666667 -13.          -3.66666667 -12.33333333]
Mean reward for each of the 4 bandits: [-20.         -25.          -7.66666667 -20.        ]
Mean reward for each of the 4 bandits: [-32.33333333 -39.         -14.66666667 -38.        ]
Mean reward for each of the 4 bandits: [-42.66666667 -48.33333333 -17.         -47.        ]
Mean reward for each of the 4 bandits: [-47.         -54.         -17.66666667 -45.66666667]
The agent thinks action 2 for bandit 0 is the most promising....
The agent thinks action 2 for bandit 1 is the most promising....
The agent thinks action 0 for bandit 2 is the most promising....
The agent thinks action 0 for bandit 3 is the most promising....


In [49]:
weights

<tf.Variable 'dense/kernel:0' shape=(4, 3) dtype=float32_ref>

In [52]:
 
output = tf.layers.dense(tf.one_hot([1],4),3,\
activation=tf.nn.sigmoid, use_bias=False, bias_initializer=None,kernel_initializer=tf.ones_initializer())
output = tf.reshape(output,[-1])

In [53]:
output.shape

TensorShape([Dimension(3)])