In [105]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

In [508]:
class contexual_bandit():
    '''
    makes 3 bandits and each bandits arm has its number that will have better chance of giving you reward 
    if it is lower
    '''
    def __init__(self,numB=3,numA=4):
        self.state = 0
        self.bandits = np.random.rand(numB,numA)
        
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]
        
    def getBandit(self):
        self.state = np.random.randint(0,len(self.bandits))
        
        return self.state
    def pullArm(self,action):
        bandit = self.bandits[self.state,action]
        res = np.random.rand(1)
        
        if(res < bandit):
            return 1
        else:
            return -1

        
        
    

In [509]:
class Agent():
    '''
    an agent that will play with the arms and find which ones look the most lucrative
    with diffrent types of implimentation you can use
    '''
    def __init__(self, lr, s_size, a_size,typeAgent=0,mean=0.1):
        self.state_in = tf.placeholder(shape=[1],dtype=tf.int32)
        state_in_OH = slim.one_hot_encoding(self.state_in,s_size)
        if(typeAgent==0):
            output = slim.fully_connected(state_in_OH,a_size,biases_initializer=None,activation_fn=tf.nn.sigmoid,weights_initializer=tf.ones_initializer())
        elif (typeAgent==1):
            output = slim.fully_connected(state_in_OH,a_size,biases_initializer=None,activation_fn=tf.nn.softmax,weights_initializer=tf.ones_initializer())
        elif (typeAgent==2):
            output = slim.fully_connected(state_in_OH,a_size,biases_initializer=tf.truncated_normal_initializer(mean=1,stddev=1e-7),activation_fn=tf.nn.sigmoid,weights_initializer=tf.truncated_normal_initializer(mean=1,stddev=1e-5))
        elif (typeAgent==3):
            output = slim.fully_connected(state_in_OH,a_size,biases_initializer=tf.truncated_normal_initializer(mean=1,stddev=1e-7),activation_fn=tf.nn.softmax,weights_initializer=tf.truncated_normal_initializer(mean=1,stddev=1e-5))
            
        else:
            middle = slim.fully_connected(state_in_OH,10,biases_initializer=tf.truncated_normal_initializer(mean=mean,stddev=2e-3),activation_fn=None,weights_initializer=tf.truncated_normal_initializer(mean=mean,stddev=2e-2))
            output = slim.fully_connected(middle,a_size,biases_initializer=tf.truncated_normal_initializer(mean=mean,stddev=2e-3),activation_fn=tf.nn.sigmoid,weights_initializer=tf.truncated_normal_initializer(mean=mean,stddev=2e-2))

        self.output = tf.reshape(output,[-1])
        self.chosen_action = tf.arg_max(self.output,0)
        
        self.reward_holder = tf.placeholder(shape=[1],dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1],dtype=tf.int32)
        
        self.responsible_weight = tf.slice(self.output,self.action_holder,[1])
        self.loss = -(tf.log(self.responsible_weight)*self.reward_holder)
        self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = self.optimizer.minimize(self.loss)
        
        

In [543]:
def tryAgent(episods=10000,maximumChance=0.8,minimumChance=0.1,toBePrinted=False,whichAgent=0,printLast=True,trueReward=1,nBandits=3,nActions=4,everyEps=500,basicShow=True):
    tf.reset_default_graph()
    # first let's make our env
    cBandit = contexual_bandit(numB=nBandits,numA=nActions)
    # and then our agent
    myAgent = Agent(lr=0.001, s_size=cBandit.num_bandits, a_size=cBandit.num_actions,typeAgent=whichAgent)
    trainables = tf.trainable_variables()
    total_episodes = episods
    total_reward = np.zeros(shape=[cBandit.num_bandits,cBandit.num_actions])
    e=maximumChance
    
    init = tf.initialize_all_variables()
    
    with tf.Session() as sess :
        sess.run(init)
        i=0
        while i < total_episodes:
            s = cBandit.getBandit()

            if np.random.rand(1) < e :
                action = np.random.randint(cBandit.num_actions)
            else:
                action = sess.run(myAgent.chosen_action,feed_dict={myAgent.state_in:[s]})

            reward = cBandit.pullArm(action)*trueReward

            feed_dict={myAgent.state_in:[s],myAgent.action_holder:[action],myAgent.reward_holder:[reward]}

            _ = sess.run([myAgent.update],feed_dict=feed_dict)


            total_reward[s,action] += reward

            if i%everyEps == 0:
                if basicShow:
                    print('mean of every one of the actions :'+str(np.mean(total_reward,axis=1)))
                if i is not 0:
                    e = min(max(e/(i/2000),minimumChance),maximumChance)
                if(toBePrinted):
                    for a in range(cBandit.num_bandits):
                        action = sess.run(myAgent.chosen_action,feed_dict={myAgent.state_in:[a]})
                        print('the agent thinks the suitable action for this bandit is : '+str(action))
                        if action == np.argmax(cBandit.bandits[a]):
                            print('and it was right ...')
                        else:
                            print('and it was wrong ...')
                        print()
                    print
            i+=1
        per = 0
        for a in range(cBandit.num_bandits):
            
            action = sess.run(myAgent.chosen_action,feed_dict={myAgent.state_in:[a]})
            if printLast :
                print('the agent thinks the suitable action for this bandit is : '+str(action))
            if action == np.argmax(cBandit.bandits[a]):
                per+=1
                if printLast:
                    print('and it was right ...')
            else :
                if printLast:
                    print('and it was wrong ...')
            print()
        print(str((per/cBandit.num_bandits)*100)+" percent of the time the agent works correctly")
        return per
        
        

In [542]:
np.random.seed(1)
tryAgent(episods=10000)

mean of every one of the actions :[0.25 0.   0.  ]
mean of every one of the actions :[ -7.75 -17.    -0.5 ]
mean of every one of the actions :[-15.   -40.     1.75]
mean of every one of the actions :[-19.   -65.75   2.  ]
mean of every one of the actions :[-27.25 -93.5   10.5 ]
mean of every one of the actions :[ -36.75 -117.75   14.75]
mean of every one of the actions :[ -39.   -135.75   22.5 ]
mean of every one of the actions :[ -32.25 -152.25   35.25]
mean of every one of the actions :[ -22.5  -175.     47.25]
mean of every one of the actions :[  -8.25 -189.75   54.75]
mean of every one of the actions :[   8.   -206.25   58.  ]
mean of every one of the actions :[  22.   -226.5    76.25]
mean of every one of the actions :[  41.   -248.5    95.25]
mean of every one of the actions :[  60.25 -276.    109.  ]
mean of every one of the actions :[  76.5  -298.5   122.75]
mean of every one of the actions :[  90.75 -322.25  139.75]
mean of every one of the actions :[ 103.   -342.75  153.5 ]
m

In [546]:
res1 = []
res2 = []
for i in range(100):
    if i%10==0:
        print(i)
    np.random.seed(i)
    res1.append(tryAgent(episods=10000,nBandits=10,nActions=5,basicShow=False))
    res2.append(tryAgent(whichAgent=1,episods=10000,nBandits=10,nActions=5,basicShow=False))

0
the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was wrong ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was wrong ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

80.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this

the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was wrong ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

80.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was wrong ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

70.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was wrong ...

the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

the agent thinks the suitable action for this bandit is : 2
and it was wrong ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

70.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was wrong ...

the agent thinks the suitable action for this bandit is : 4
and it was wrong ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

80.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

100.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this 

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

100.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this 

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was wrong ...

the agent thinks the suitable action for this bandit is : 0
and it was wrong ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was wrong ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

60.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was wrong ...

the agent thinks the suitable action for this bandit is : 2
and it was wrong ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

80.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was wrong ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was wrong ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

80.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was wrong ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

the agent thinks the suitable action for this bandit is : 2
and it was wrong ...

the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

60.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

the agent thinks the suitable action for this bandit is : 0
and it was wrong ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

70.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was wrong ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was wrong ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was wrong ...

70.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

80.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was wrong ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

80.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 2
and it was wrong ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was wrong ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was wrong ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

70.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was wrong ...

the agent thinks the suitable action for this bandit is : 3
and it was wrong ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was wrong ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was wrong ...

the agent thinks the suitable action for this bandit is : 0
and it was wrong ...

50.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was wrong ...

the agent thinks the suitable action for this bandit is : 1
and it was wrong ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was right ...

80.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 3
and it was wrong ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was wrong ...

the agent thinks the suitable action for this bandit is : 3
and it was wrong ...

the agent thinks the suitable action for this bandit is : 2
and it was wrong ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was wrong ...

50.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 2
and it was wrong ...

the agent thinks the suitable action for this b

the agent thinks the suitable action for this bandit is : 2
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 3
and it was wrong ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 4
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 1
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

the agent thinks the suitable action for this bandit is : 0
and it was right ...

90.0 percent of the time the agent works correctly
the agent thinks the suitable action for this bandit is : 3
and it was right ...

the agent thinks the suitable action for this b

In [552]:
print((sum(res1)/(10))*100)

7780.0


In [553]:
print((sum(res2)/(10))*100)

7559.999999999999


In [555]:
print(((sum(res1)/(10))*100)/100)

77.8


In [557]:
print(((sum(res2)/(10))*100)/100)

75.6


In [540]:
np.random.seed(10)
tryAgent(whichAgent=1,episods=10000,nBandits=10,nActions=5,minimumChance=0.2)

mean of every one of the actions :[0.  0.  0.  0.  0.  0.  0.2 0.  0.  0. ]
mean of every one of the actions :[ 1.4 -4.   2.2  5.  -3.4  3.6  1.4 -2.   3.   2.2]
mean of every one of the actions :[ 6.4 -7.8  7.   7.8 -5.2  7.2  5.   0.   4.8  5.8]
mean of every one of the actions :[ 9.4 -9.  12.6 11.2 -6.   7.6  6.6 -0.2  5.6  5.6]
mean of every one of the actions :[  8.8 -12.   15.6  13.   -7.2   8.4   9.2  -1.8   6.6   7.2]
mean of every one of the actions :[ 11.2 -14.8  17.   16.2  -5.4   9.8  10.2  -2.8   9.4   7.8]
mean of every one of the actions :[ 15.  -14.   21.2  21.2  -7.8  12.2  15.4  -1.8  11.4  11. ]
mean of every one of the actions :[ 18.2 -13.   29.   23.2  -8.   15.   17.2   0.6  13.6  15.6]
mean of every one of the actions :[ 22.8 -10.   38.   27.8  -7.   16.6  21.2   5.8  15.6  21.4]
mean of every one of the actions :[27.8 -7.4 47.8 29.6 -5.8 18.4 25.2  9.  21.  24.2]
mean of every one of the actions :[34.6 -2.6 54.6 35.8 -1.2 19.6 31.2 13.  25.  27.4]
mean of every 

In [533]:
np.random.seed(1)
tryAgent(whichAgent=1,episods=20000,nBandits=10,nActions=5,everyEps=1000)

mean of every one of the actions :[ 0.   0.   0.   0.   0.  -0.2  0.   0.   0.   0. ]
mean of every one of the actions :[-5.2 -5.6  0.2 -0.2 10.6  1.8  6.4 -0.2  6.4  0.8]
mean of every one of the actions :[-10.8  -9.6  -0.8  -4.4  20.    5.4  13.4   2.6  11.4   0.2]
mean of every one of the actions :[-16.4 -15.8  -0.4  -7.6  30.8   7.   18.    4.8  14.8  -0.6]
mean of every one of the actions :[-19.2 -17.    5.6  -7.2  45.   10.   26.8   8.2  25.8   4.6]
mean of every one of the actions :[-18.4 -20.2  14.4  -3.6  56.   22.2  42.2  16.   39.6  17.2]
mean of every one of the actions :[-10.4 -19.   25.2   1.   71.6  34.2  59.8  25.4  61.6  33.2]
mean of every one of the actions :[ -2.4 -16.2  39.6   5.4  85.8  50.4  76.4  34.   81.2  46.4]
mean of every one of the actions :[  6.  -15.2  52.6  14.4 103.   63.   91.8  44.8  97.6  62.2]
mean of every one of the actions :[ 14.6 -12.6  65.8  16.4 118.4  74.6 105.2  53.4 117.2  76. ]
mean of every one of the actions :[ 18.2 -12.6  79.8  20.6 1

In [534]:
np.random.seed(1)
tryAgent(whichAgent=1,episods=40000,nBandits=10,nActions=5,everyEps=2000)

mean of every one of the actions :[ 0.   0.   0.   0.   0.  -0.2  0.   0.   0.   0. ]
mean of every one of the actions :[-10.8  -9.6  -0.8  -4.4  20.    5.4  13.4   2.6  11.4   0.2]
mean of every one of the actions :[-20.2 -22.   -0.6  -9.6  42.8   9.2  25.2   5.8  17.8  -2.2]
mean of every one of the actions :[-14.4 -26.8  18.4  -6.4  63.   25.2  48.8  21.8  41.6  19. ]
mean of every one of the actions :[  0.8 -26.8  43.2   3.6  95.   50.   79.6  40.6  76.2  46.8]
mean of every one of the actions :[ 10.4 -23.   71.8  10.  128.2  75.8 110.6  57.2 116.6  76.6]
mean of every one of the actions :[ 23.4 -23.6  98.4  23.  164.4 104.4 144.4  76.  156.  110.6]
mean of every one of the actions :[ 39.8 -20.  124.4  40.6 200.  134.4 179.   91.6 194.4 139.6]
mean of every one of the actions :[ 57.  -14.6 154.   53.6 239.  164.  210.2 105.6 233.6 165.8]
mean of every one of the actions :[ 73.  -13.6 179.8  66.4 276.4 188.  244.6 122.8 266.4 196.4]
mean of every one of the actions :[ 88.6 -13.4 210

In [537]:
np.random.seed(1)
tryAgent(whichAgent=2,episods=40000,nBandits=4,nActions=5,everyEps=2000)

mean of every one of the actions :[ 0.  -0.2  0.   0. ]
mean of every one of the actions :[-20.2 -41.2  -5.6 -19.2]
mean of every one of the actions :[-38.  -83.8  -2.4 -35.6]
mean of every one of the actions :[ -31.4 -129.2   23.   -57.4]
mean of every one of the actions :[   2.6 -171.2   49.4  -75. ]
mean of every one of the actions :[  37.  -218.4   85.4  -82.2]
mean of every one of the actions :[  73.6 -257.8  123.4  -88.6]
mean of every one of the actions :[ 113.  -284.   154.6  -97.4]
mean of every one of the actions :[ 148.  -304.4  191.2 -113.4]
mean of every one of the actions :[ 186.8 -333.6  225.8 -134.4]
mean of every one of the actions :[ 220.8 -351.2  256.4 -149.4]
mean of every one of the actions :[ 250.6 -364.   280.6 -163.8]
mean of every one of the actions :[ 281.  -384.   316.2 -181.8]
mean of every one of the actions :[ 316.8 -392.   344.  -202.6]
mean of every one of the actions :[ 358.  -408.   376.2 -216.8]
mean of every one of the actions :[ 386.6 -400.6  405.4 