<a href="https://colab.research.google.com/github/smokingelephants/CS898BG/blob/main/Policy_Gradient_Contextual_Bandits.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Solving Contextual Bandit problem using Policy Gradients

Import Libraries

In [None]:
#import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
#import tf.contrib.slim as slim
import tf_slim as slim
import numpy as np

Instructions for updating:
non-resource variables are not supported in the long term


The problem: Contextual Bandits
    
*  The environment consists of several states (bandits) that are independent of each other.
*  Given a state (bandit), the agent has to learn which action (arm pull) gives the best reward for that particular state.

In [None]:
class contextual_bandit():
    """Contextual bandit environment with three bandits of four arms each.

    Every entry of ``bandits`` acts as a threshold: pulling an arm pays +2
    when a fresh standard-normal draw is greater than the threshold and -2
    otherwise, so the arm with the lowest threshold is the best one.
    """

    def __init__(self):
        # Rows are bandits (states); columns are arms (actions).
        thresholds = [
            [0.2, 0.0, -0.0, -5],
            [0.1, -5, 1, 0.25],
            [-5, 5, 5, 5],
        ]
        self.state = 0
        self.bandits = np.array(thresholds)
        self.num_bandits, self.num_actions = self.bandits.shape

    def getBandit(self):
        """Draw a random bandit index, store it as the current state and return it."""
        chosen = np.random.randint(0, self.num_bandits)
        self.state = chosen
        return chosen

    def pullArm(self, action):
        """Pull arm ``action`` of the current bandit and return the reward (+2 or -2)."""
        threshold = self.bandits[self.state, action]
        draw = np.random.randn(1)
        # Reward is positive only when the normal draw beats the arm's threshold.
        return +2 if draw > threshold else -2

Let's define the agent...

In [None]:
class agent():
    """Single-layer softmax policy trained with a policy-gradient loss.

    Args:
        lr: learning rate handed to the gradient-descent optimizer.
        s_size: number of states (depth of the one-hot state encoding).
        a_size: number of actions (width of the output layer).
    """

    def __init__(self, lr, s_size, a_size):
        # Forward pass: one-hot state -> bias-free fully connected layer -> softmax.
        self.state_in = tf.placeholder(dtype=tf.int32, shape=[1])
        one_hot_state = slim.one_hot_encoding(self.state_in, s_size)
        action_probs = slim.fully_connected(
            one_hot_state,
            a_size,
            activation_fn=tf.nn.softmax,
            weights_initializer=tf.ones_initializer(),
            biases_initializer=None,
        )
        self.output = tf.reshape(action_probs, [-1])
        self.choosen_action = tf.argmax(self.output, 0)

        # Training pass: feed the observed reward and the action that was taken,
        # pick out the output entry for that action, and minimise
        # -log(output[action]) * reward.
        self.reward_holder = tf.placeholder(dtype=tf.float32, shape=[1])
        self.action_holder = tf.placeholder(dtype=tf.int32, shape=[1])
        self.responsible_weight = tf.slice(self.output, self.action_holder, [1])
        log_prob = tf.log(self.responsible_weight)
        self.loss = -(log_prob * self.reward_holder)
        sgd = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = sgd.minimize(self.loss)

Training the agent:

In [None]:
# Start from an empty graph so that re-running this cell does not pile up
# duplicate ops and variables (`tf` is already the compat.v1 API here).
tf.reset_default_graph()
# Load the contextual bandit environment.
cBandit = contextual_bandit()
# Load the agent.
myAgent = agent(lr=0.001, s_size=cBandit.num_bandits, a_size=cBandit.num_actions)
# The weights that are optimized during training; one row per bandit,
# one column per action.
weights = tf.trainable_variables()[0]

# The total number of games played by the agent.
total_episodes = 20000
# Scoreboard: cumulative reward collected per (bandit, action) pair.
total_rewards = np.zeros([cBandit.num_bandits, cBandit.num_actions])
# The chance of taking a random action.
e = 0.1

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        # Get a random state from the environment.
        s = cBandit.getBandit()

        # Evaluate the policy once per step for the *current* state. Doing this
        # unconditionally keeps `action_prob` defined (and up to date) even when
        # the step ends up exploring, so the periodic log below can never hit a
        # NameError or print probabilities that belong to an earlier state.
        greedy_action, action_prob = sess.run(
            [myAgent.choosen_action, myAgent.output],
            feed_dict={myAgent.state_in: [s]},
        )

        # Choose either a random action or the one proposed by the network.
        if np.random.rand(1) < e:
            action = np.random.randint(cBandit.num_actions)
        else:
            action = greedy_action

        # Perform the action (pull the arm) to observe the reward.
        reward = cBandit.pullArm(action)

        # Update the network.
        feed_dict = {myAgent.reward_holder: [reward], myAgent.action_holder: [action], myAgent.state_in: [s]}
        sess.run(myAgent.update, feed_dict=feed_dict)

        # Update our scoreboard.
        total_rewards[s, action] += reward
        if i % 500 == 0:
            print(action_prob)
            # NOTE: this is the cumulative reward per bandit averaged over its
            # arms, not a per-pull average, so it keeps growing during training.
            print("The average reward for each of the " + str(cBandit.num_bandits) + " bandits: " + str(np.mean(total_rewards, axis=1)))

    # Read the weights once, after the last update. Fetching them in the same
    # run() call as the update op leaves it ambiguous whether the pre- or
    # post-update values are returned, and leaves `ww` undefined if no episode
    # is played.
    ww = sess.run(weights)

# Let's evaluate our actions: the best arm of a bandit is the one with the
# lowest threshold. `ww[a]` holds the raw layer weights for bandit `a` (the
# softmax is monotonic, so their argmax is also the most probable action).
for a in range(cBandit.num_bandits):
    print("The most likely action is " + str(np.argmax(ww[a])+1) + " probs: " + str(ww[a]) + " for bandit " + str(a+1))
    if np.argmax(ww[a]) == np.argmin(cBandit.bandits[a]):
        print(":) Yes, its correct!")
    else:
        print(":( No, it isn't!")

[0.25 0.25 0.25 0.25]
The average reward for each of the 3 bandits: [ 0.  -0.5  0. ]
[0.23068893 0.31239697 0.2284508  0.22846325]
The average reward for each of the 3 bandits: [71.  78.5 74. ]
[0.21143505 0.3735229  0.20711924 0.20792273]
The average reward for each of the 3 bandits: [149.5 153.  135. ]
[0.19015871 0.4364815  0.18653719 0.18682252]
The average reward for each of the 3 bandits: [228.  227.  203.5]
[0.17071564 0.49341536 0.16733402 0.16853501]
The average reward for each of the 3 bandits: [301.5 305.5 279.5]
[0.53778005 0.15406731 0.15406096 0.15409167]
The average reward for each of the 3 bandits: [381.5 376.5 338.5]
[0.5870815  0.13736369 0.13817228 0.13738255]
The average reward for each of the 3 bandits: [461.  452.  417.5]
[0.12419049 0.1222432  0.123542   0.6300243 ]
The average reward for each of the 3 bandits: [543.5 516.  492. ]
[0.11675537 0.656816   0.11267662 0.11375202]
The average reward for each of the 3 bandits: [627.5 582.  562. ]
[0.10522301 0.69148296