In [12]:
import tensorflow as tf
import numpy as np

In [334]:
class Q:
    '''
    Q-value approximator holding two copies of the same fully connected
    network inside a TensorFlow 1.x graph:

    * the *static* network (weights1 / biases1) reads `nextStateInput`
      and produces `qVal1`;
    * the *dynamic* network (weights2 / biases2) reads `stateInput`
      and produces `qVal2`.

    Both outputs are column vectors with one row per unit of the last layer,
    so the last entry of `nodes` should equal `nAction`.

    The temporal-difference error exposed as `delta` is

        rewardInput + gamma * qVal1[argmax(qVal1)] - qVal2[epsilon-greedy(qVal2)]

    and `opt` minimises its square with respect to the dynamic network only.
    The static network changes only through `updateStaticWeights()` or the
    assign ops stored in `updateWtFunctions`.

    Constructing an instance resets the default graph and opens a session;
    call `close()` when done.
    '''

    def __init__(self, nState, nAction, nodes, activations):
        '''
        nState      -> number of values in the state vector
        nAction     -> number of possible actions
        nodes       -> list with the number of nodes in each layer
        activations -> list with one activation function per layer
                       (None means the layer is linear)
        '''

        self.nState   = nState  # The number of values in the state (size of the vector)
        self.nAction  = nAction # The number of possible actions. 4 in this case. (Note:
                                # the size of the vector will always be 1)
        self.nodes       = nodes        # number of nodes per layer
        self.activations = activations  # activation function used in each layer
        self.weights1 = [] # Store all the static weights here
        self.weights2 = [] # Store all the dynamic weights here
        self.biases1  = [] # Store all the static biases here
        self.biases2  = [] # Store all the dynamic biases here
        self.assignFunctions   = [] # ops copying dynamic -> static variables
        self.updateWtFunctions = {} # ops assigning a fed value to a single variable
        self.placeholderNames  = {} # placeholder name -> short description
        self.gamma = 1              # discount factor used in the TD error

        tf.reset_default_graph()

        self.epsilon = tf.placeholder(dtype=tf.float32, shape = (), name='epsilon')

        with tf.variable_scope('Inputs'):
            self.stateInput  = tf.placeholder(tf.float32, shape=(nState,), name='stateInput')
            self.placeholderNames['Inputs/stateInput'] = 'input the state vector (nState, )'

            # Declared for completeness; no op in this class consumes it.
            self.actionInput = tf.placeholder(tf.float32, shape=(1,), name='actionInput')
            self.placeholderNames['Inputs/actionInput'] = 'input the action (1,)'

            self.nextStateInput = tf.placeholder(tf.float32, shape=(nState,), name='nextStateInput')
            self.placeholderNames['Inputs/nextStateInput'] = 'input the state vector for the next state (nState, )'

            self.rewardInput = tf.placeholder(tf.float32, shape=(), name='rewardInput')
            self.placeholderNames['Inputs/rewardInput'] = 'reward for the next state ()'

        # A state goes in and a value vector is calculated for all actions. We shall
        # later make a selection based upon the value functions of all the actions.
        # This is why this is called value based methods ...
        with tf.variable_scope('Combine'):
            # Only the state is used as network input (the action is not concatenated).
            self.inp = self.stateInput * 1
            self.inp = tf.reshape(self.inp, (-1, 1), name='Inp')

        # Static weights see the next state, dynamic weights see the current state.
        self.forwardPass( tf.reshape(self.nextStateInput, (-1, 1)),
                         'StableWts',  True)
        self.forwardPass( self.inp, 'DynamicWts', False)

        # w1 = w2, assign all other weights ...
        # --------------------------------------
        self.assignFunctionsGenerate()

        self.maxAction    = self.policyMax(self.qVal1)           # static weights (next state)
        self.greedyAction = self.policyEpsilonGreedy(self.qVal2) # dynamic weights (current state)

        # Difference between policy and current action
        # and all other error terms
        # -------------------------------------------------
        self.delta    = (self.rewardInput + self.gamma * self.qVal1[self.maxAction] - self.qVal2[self.greedyAction])
        self.delta    = tf.reduce_mean(self.delta)
        self.sqrErr   = self.delta**2
        self.priority = tf.abs(self.delta)

        # Train the dynamic network only. Without var_list the optimizer would
        # also update the static weights (delta depends on qVal1), so they
        # would no longer be static between calls to updateStaticWeights().
        self.opt = tf.train.AdamOptimizer().minimize(
            self.sqrErr, var_list = self.weights2 + self.biases2 )

        self.init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(self.init)

    def close(self):
        '''Close the TensorFlow session owned by this object.'''
        self.sess.close()

    def policyMax(self, inpVec, name='policyMax'):
        '''
        inpVec  -> column vector of Q-values (either self.qVal1 or self.qVal2)

        Returns a scalar tensor with the row index of the largest value.
        '''
        with tf.variable_scope(name):
            # argmax over axis 0 of a (rows, 1) tensor has shape (1,); take its only entry
            result = tf.argmax(inpVec, name='maxVal')[0]
        return result

    def policyEpsilonGreedy(self, inpVec, name='policyEpsilonGreedy'):
        '''
        inpVec  -> input vector (either self.qVal1 or self.qVal2)

        Returns a scalar tensor holding an action index. When a uniform random
        draw is below self.epsilon the index is sampled from nAction equally
        weighted classes; otherwise it is policyMax(inpVec).
        '''

        with tf.variable_scope(name):

            # This is not strictly epsilon greedy, but will do for now
            result = tf.cond( tf.random_uniform(()) < self.epsilon,
                       lambda : tf.multinomial( 10*tf.ones(( 1, self.nAction))  , 1)[0][0],
                       lambda : self.policyMax(inpVec) )

        return result

    def assignFunctionsGenerate(self):
        '''
        Build the assign ops used to move values into the variables.

        Fills self.assignFunctions with ops copying each dynamic variable into
        its static counterpart, and self.updateWtFunctions with one op per
        variable that assigns the value fed to a like-named placeholder. The
        placeholders are described in self.placeholderNames.
        '''

        with tf.variable_scope('assignFunctions'):

            # ------------------------------------------------
            # Generate functions that will allow us
            # to update the static weights with the
            # dynamic weights
            # ------------------------------------------------
            for statics, dynamics in ((self.weights1, self.weights2),
                                      (self.biases1,  self.biases2)):
                for w1, w2 in zip(statics, dynamics):
                    self.assignFunctions.append( tf.assign( w1, w2 ) )

            # ------------------------------------------------
            # We also need something that will allow us to put
            # values into each of the weights if necessary.
            # The group order fixes the order in which ops are
            # created and therefore their generated names.
            # ------------------------------------------------
            groups = (
                ('wStatic',  self.weights1, 'w1_[{}]'),
                ('bStatic',  self.biases1,  'b1_[{}]'),
                ('wDynamic', self.weights2, 'w2_{}'),
                ('bDynamic', self.biases2,  'b2_{}'),
            )
            for prefix, variables, label in groups:
                for i, v in enumerate(variables):
                    key = '{}_{:05d}'.format(prefix, i)
                    p = tf.placeholder(tf.float32, v.shape, name=key)
                    self.placeholderNames['assignFunctions/' + key] = label.format(i)
                    self.updateWtFunctions[key] = tf.assign(v, p)

    def forwardPass(self, inpVec, name ='StableWts', static=True):
        '''
        Build one copy of the network and store its output.

        inpVec  -> column vector fed to the first layer, shape (nState, 1)
        name    -> variable scope under which the layers are created
        static  -> True : variables go to weights1/biases1, output to self.qVal1
                   False: variables go to weights2/biases2, output to self.qVal2

        Every layer computes a(W x + b), or W x + b when its activation is None.
        W starts as uniform random values in [0, 0.1) and b as zeros.
        '''

        # Save the variables because we will have to update them
        # halfway into the program
        weights, biases = ((self.weights1, self.biases1) if static
                           else (self.weights2, self.biases2))

        with tf.variable_scope(name):

            prevN = self.nState # the action term is not part of the input
            temp1 = inpVec * 1
            for i, (n, a) in enumerate(zip(self.nodes, self.activations)):
                with tf.variable_scope('layer_{:05d}'.format(i)):
                    w = tf.Variable(0.1*np.random.rand(n, prevN).astype(np.float32), name='W')
                    b = tf.Variable(np.zeros((n,1), np.float32), name='b')

                    z = tf.matmul( w, temp1 ) + b
                    temp1 = a(z) if a is not None else z

                    prevN = n
                    weights.append(w)
                    biases.append(b)

            if static:
                self.qVal1 = tf.multiply(temp1, 1, name='qVal1')
            else:
                self.qVal2 = tf.multiply(temp1, 1, name='qVal2')

    def updateStaticWeights(self):
        '''Copy every dynamic weight and bias into its static counterpart.'''
        # One session call for the whole list instead of one call per op
        self.sess.run(self.assignFunctions)

    def getWeights(self, static=True):
        '''
        Return the current values of the weights followed by the biases,
        for the static network when `static` is True, else the dynamic one.
        '''
        if static:
            return self.sess.run(self.weights1+self.biases1)
        else:
            return self.sess.run(self.weights2+self.biases2)

    def run(self, s):
        '''
        s -> state vector of shape (nState,)

        Feeds `s` as both the current and the next state and returns
        [qVal1, qVal2] as evaluated by the session.
        '''
        result = self.sess.run([self.qVal1, self.qVal2], feed_dict = {
            self.stateInput     : s,
            self.nextStateInput : s,
        })

        return result



In [335]:
# Build a Q object for a 36-value state and 4 actions: two tanh layers with
# 3 and 2 nodes, followed by a 4-node layer without activation.
q = Q(36, 4, [3, 2, 4], [tf.tanh, tf.tanh, None]) # The last layer is linear: it predicts one value per action

In [336]:
# Evaluate both networks on an all-ones state; the result is [qVal1, qVal2].
q.run(np.ones(36))

[array([[0.00728514],
        [0.00760357],
        [0.01888298],
        [0.01219106]], dtype=float32), array([[0.01616865],
        [0.00955882],
        [0.01626163],
        [0.01275421]], dtype=float32)]

In [337]:
# Evaluate the scalar TD error for identical current/next states, with
# epsilon 0.4 and reward 0.5. Placeholders are fed either through the tensor
# object or through the tensor name ('Inputs/rewardInput:0').
q.sess.run( q.delta, feed_dict = {
    q.nextStateInput : np.ones(36),
    q.stateInput     : np.ones(36),
    q.epsilon        : 0.4,
    'Inputs/rewardInput:0' : 0.5
})

0.50271434

In [350]:
# Print the dynamic-network values (qVal2) for an all-ones state, then
# evaluate q.maxAction 30 times on the same inputs.
# NOTE(review): q.maxAction is built from qVal1 in Q.__init__, not from the
# qVal2 vector printed here -- confirm which vector was meant to be shown.
onesFeed = {q.nextStateInput: np.ones(36), q.stateInput: np.ones(36)}

print(q.sess.run(q.qVal2, feed_dict=onesFeed))
for i in range(30):
    tempV = q.sess.run(q.maxAction, feed_dict=onesFeed)
    print(tempV, end=',')
print()

[[0.01616865]
 [0.00955882]
 [0.01626163]
 [0.01275421]]
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,


In [344]:
# Print the static-network values (qVal1) for an all-ones next state, then
# evaluate q.greedyAction 100 times with epsilon = 0.9.
# NOTE(review): q.greedyAction is built from qVal2 in Q.__init__, not from the
# qVal1 vector printed here -- confirm which vector was meant to be shown.
onesState = np.ones(36)
print(q.sess.run(q.qVal1, feed_dict={q.nextStateInput: onesState}))

greedyFeed = {
    q.stateInput     : onesState,
    q.nextStateInput : onesState,
    q.epsilon        : 0.9,
}
for i in range(100):
    tempV = q.sess.run(q.greedyAction, feed_dict=greedyFeed)
    print(tempV, end=',')
print()

[[0.00728514]
 [0.00760357]
 [0.01888298]
 [0.01219106]]
2,2,3,0,3,1,2,1,3,0,2,0,2,2,0,2,0,0,0,3,0,1,2,2,1,3,2,1,2,2,2,1,1,2,1,0,0,3,2,1,2,2,0,1,3,2,2,2,2,0,2,0,0,0,1,0,3,3,3,1,0,0,2,2,1,2,3,2,1,0,2,0,0,2,2,1,1,1,0,1,1,3,2,2,1,3,2,1,2,0,2,3,3,2,2,1,0,0,2,3,


In [345]:
# Inspect the per-variable assign ops, keyed by the name of the placeholder
# that feeds each of them.
q.updateWtFunctions

{'wStatic_00000': <tf.Tensor 'assignFunctions/Assign_6:0' shape=(3, 36) dtype=float32_ref>,
 'wStatic_00001': <tf.Tensor 'assignFunctions/Assign_7:0' shape=(2, 3) dtype=float32_ref>,
 'wStatic_00002': <tf.Tensor 'assignFunctions/Assign_8:0' shape=(4, 2) dtype=float32_ref>,
 'bStatic_00000': <tf.Tensor 'assignFunctions/Assign_9:0' shape=(3, 1) dtype=float32_ref>,
 'bStatic_00001': <tf.Tensor 'assignFunctions/Assign_10:0' shape=(2, 1) dtype=float32_ref>,
 'bStatic_00002': <tf.Tensor 'assignFunctions/Assign_11:0' shape=(4, 1) dtype=float32_ref>,
 'wDynamic_00000': <tf.Tensor 'assignFunctions/Assign_12:0' shape=(3, 36) dtype=float32_ref>,
 'wDynamic_00001': <tf.Tensor 'assignFunctions/Assign_13:0' shape=(2, 3) dtype=float32_ref>,
 'wDynamic_00002': <tf.Tensor 'assignFunctions/Assign_14:0' shape=(4, 2) dtype=float32_ref>,
 'bDynamic_00000': <tf.Tensor 'assignFunctions/Assign_15:0' shape=(3, 1) dtype=float32_ref>,
 'bDynamic_00001': <tf.Tensor 'assignFunctions/Assign_16:0' shape=(2, 1) dtype

In [346]:
# Inspect the placeholder names and the short description recorded for each.
q.placeholderNames

{'Inputs/stateInput': 'input the state vector (nState, )',
 'Inputs/actionInput': 'input the action (1,)',
 'Inputs/nextStateInput': 'input the state vector for the next state (nState, )',
 'Inputs/rewardInput': 'reward for the next state ()',
 'assignFunctions/wStatic_00000': 'w1_[0]',
 'assignFunctions/wStatic_00001': 'w1_[1]',
 'assignFunctions/wStatic_00002': 'w1_[2]',
 'assignFunctions/bStatic_00000': 'b1_[0]',
 'assignFunctions/bStatic_00001': 'b1_[1]',
 'assignFunctions/bStatic_00002': 'b1_[2]',
 'assignFunctions/wDynamic_00000': 'w2_0',
 'assignFunctions/wDynamic_00001': 'w2_1',
 'assignFunctions/wDynamic_00002': 'w2_2',
 'assignFunctions/bDynamic_00000': 'b2_0',
 'assignFunctions/bDynamic_00001': 'b2_1',
 'assignFunctions/bDynamic_00002': 'b2_2'}

In [347]:
# Current values of the static network: all weights followed by all biases.
q.getWeights(True)

[array([[0.03299291, 0.03517313, 0.01645651, 0.04629924, 0.07629357,
         0.07499915, 0.0856449 , 0.04063104, 0.09009108, 0.09890855,
         0.03721462, 0.03205042, 0.06937148, 0.055203  , 0.01992404,
         0.06598932, 0.08348732, 0.05773957, 0.09873063, 0.06344263,
         0.09118128, 0.05935813, 0.00305091, 0.09668633, 0.03166735,
         0.0729511 , 0.01031388, 0.01391992, 0.04280156, 0.04658872,
         0.00974948, 0.07108812, 0.00645686, 0.0474534 , 0.04956342,
         0.08710636],
        [0.09790519, 0.07749604, 0.01509659, 0.03200435, 0.02563003,
         0.03834281, 0.02311203, 0.01210869, 0.04242229, 0.06922634,
         0.06226359, 0.08525556, 0.03264777, 0.01629683, 0.04144548,
         0.06429414, 0.03271007, 0.03990607, 0.07188293, 0.02998293,
         0.06224615, 0.00102955, 0.06358358, 0.07353483, 0.0291667 ,
         0.09187366, 0.05200893, 0.01049873, 0.09651186, 0.04837734,
         0.00489965, 0.08283591, 0.04444638, 0.00819625, 0.05516007,
         0.0

In [348]:
# Current values of the dynamic network: all weights followed by all biases.
q.getWeights(False)

[array([[0.08625142, 0.04608831, 0.06468453, 0.09733008, 0.02491107,
         0.08204471, 0.09346557, 0.09515753, 0.08359794, 0.06385153,
         0.00373808, 0.02718518, 0.02246364, 0.02944267, 0.0657542 ,
         0.0727488 , 0.09074577, 0.08526561, 0.09323161, 0.03843711,
         0.01467886, 0.04171897, 0.0003069 , 0.06510345, 0.04899644,
         0.05537335, 0.0463918 , 0.03897854, 0.03099833, 0.02034565,
         0.06910374, 0.0618782 , 0.05305011, 0.09895297, 0.06035381,
         0.07349762],
        [0.06854769, 0.00095459, 0.0285784 , 0.03041754, 0.0331983 ,
         0.0205536 , 0.01216186, 0.08983107, 0.07342849, 0.03427576,
         0.00819802, 0.08042007, 0.02022109, 0.04257431, 0.00727592,
         0.05225666, 0.0928688 , 0.04494976, 0.09078857, 0.09893838,
         0.09578388, 0.02929911, 0.09645609, 0.00122733, 0.08841099,
         0.06323704, 0.04449945, 0.09391238, 0.06188601, 0.07680024,
         0.02170859, 0.05462886, 0.01623207, 0.08281913, 0.02752188,
         0.0

In [349]:
# Copy the dynamic weights and biases into the static network.
q.updateStaticWeights()

In [311]:
# Release the session through the class's own close() method.
q.close()

The following scratch experiment (sampling from an explicit epsilon-greedy distribution) will not work in real code ...

In [173]:
# Scratch experiment: sample an action index from an explicit epsilon-greedy
# distribution over 4 actions -- probability (1 - epsilon) for the argmax of
# inpVec and epsilon / 3 for each of the remaining three.
tf.reset_default_graph()

inpVec = tf.placeholder(shape=(4,), dtype=tf.float32)

epsilon   = tf.convert_to_tensor(0.5) 
probMax   = 1 - epsilon
probOther = epsilon/( 4 - 1 )

# Start with probOther everywhere, then overwrite the argmax entry with probMax.
# NOTE(review): the sliced assign writes into the variable itself, so the
# overwritten entry presumably persists across sess.run calls; if the argmax
# changed between calls more than one entry would hold probMax -- confirm.
temp  = tf.ones(shape=(4,))*probOther
temp  = tf.Variable(temp)
temp  = temp[ tf.argmax(inpVec) ].assign( probMax )
# The probabilities are passed through tf.log before sampling; one sample is
# drawn from the single (1, 4) row and its index extracted.
temp1 = tf.multinomial(tf.log( tf.reshape(temp, shape=(1, -1))  ), 1)[0][0]

# Draw 100 samples for a fixed input whose largest entry is at index 3.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(100):
        temp_V = sess.run(temp1, feed_dict={ inpVec : np.array([1,2,3,4]) })
        print(temp_V, end=',')
    print('')

2,1,3,1,0,3,1,3,3,3,3,3,1,0,2,3,3,0,3,3,0,0,2,3,0,3,3,3,2,1,2,3,3,3,1,1,3,3,3,3,3,3,3,3,3,2,0,1,3,2,3,3,0,0,2,3,0,1,1,0,3,0,2,1,1,1,0,3,3,0,3,2,3,3,0,3,0,3,0,3,2,3,3,3,3,1,2,1,2,0,3,1,3,0,3,0,3,3,3,0,


In [166]:
# Sanity check: three entries of 0.5 / 3 (rounded) plus one entry of 0.5
# should add up to approximately 1.
sum([0.16666667, 0.16666667, 0.16666667, 0.5       ])

1.00000001