In [1]:
import tensorflow as tf

In [2]:
def create_net(input_state, variable_scope, layer_list, trainable):
    with tf.variable_scope(variable_scope):
        layer1 = tf.layers.Dense(units=10, activation=tf.nn.relu,
                                 name="layer", trainable=trainable)
        layer_list.append(layer1)
        layer1_output = layer1(input_state)
        layer2 = tf.layers.Dense(units=10, activation=tf.nn.relu, 
                                 name="layer", trainable=trainable)
        layer_list.append(layer2)
        layer2_output = layer2(layer1_output)
        output_layer = tf.layers.Dense(units=2, name="layer", trainable=trainable)
        layer_list.append(output_layer)
        output = output_layer(layer2_output)

        return output

In [3]:
input_state = tf.placeholder(shape=[None, 1], dtype=tf.float32)
predict_layer_list = []
predict_net = create_net(input_state=input_state, variable_scope="predict",
                         layer_list=predict_layer_list, trainable=True)
target_layer_list = []
target_net = create_net(input_state=input_state, variable_scope="target",
                        layer_list=target_layer_list, trainable=False)

In [4]:
init_op = tf.global_variables_initializer()
sess = tf.Session()

In [5]:
sess.run(init_op)

In [6]:
print(sess.run(predict_net, feed_dict={input_state: [[1]]}))

[[0.04790387 0.10965639]]


In [7]:
print(sess.run(target_net, feed_dict={input_state: [[1]]}))

[[ 0.6123961  -0.22846693]]


### Copy net operation

In [8]:
for predict_layer, target_layer in zip(predict_layer_list, target_layer_list):
    predict_variable_list = predict_layer.variables
    target_variable_list = target_layer.variables
    for predict_variable, target_variable in zip(predict_variable_list, target_variable_list):
        assign_op = tf.assign(ref=target_variable, value=predict_variable)
        sess.run(assign_op)

In [9]:
print(sess.run(target_net, feed_dict={input_state: [[1]]}))

[[0.04790387 0.10965639]]


## Learning Test

In [10]:
predict_action = tf.argmax(predict_net, axis=1)
predict_q_value = tf.reduce_max(predict_net, axis=1)

In [11]:
print(sess.run(predict_action, feed_dict={input_state: [[1]]}))
print(sess.run(predict_q_value, feed_dict={input_state: [[1]]}))

[1]
[0.10965639]


In [12]:
experience = [1, 0, -1, 2, False]

In [13]:
target_q_value = tf.reduce_max(target_net, axis=1)

In [14]:
print(sess.run(target_q_value, feed_dict={input_state: [[1]]}))

[0.10965639]


In [15]:
tf_reward = tf.placeholder(shape=[None, 1], dtype=tf.float32)

In [16]:
y = tf.add(tf_reward, target_q_value)

In [17]:
print(sess.run(y, feed_dict={input_state: [[1]], tf_reward: [[experience[2]]]}))

[[-0.8903436]]


In [18]:
loss = tf.losses.mean_squared_error(predictions=predict_q_value, labels=y[0])

In [19]:
print(sess.run(loss, feed_dict={input_state: [[1]], tf_reward: [[experience[2]]]}))

1.0


In [20]:
train = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

In [21]:
for i in range(100):
  _, loss_value = sess.run((train, loss), feed_dict={input_state: [[1], [2], [3], [4]], tf_reward: [[-1], [20], [-1], [-1]]})
  print(loss_value)

1.0
0.6003639
0.45393926
0.3321009
0.2535795
0.21119145
0.17944601
0.15497983
0.14150068
0.12924963
0.12131343
0.11604818
0.11119574
0.10754945
0.10581057
0.10278611
0.101562634
0.09932118
0.09762591
0.096415184
0.094573915
0.09346283
0.0923262
0.09079591
0.08965494
0.088942416
0.087464415
0.086167656
0.085579306
0.08437022
0.08302992
0.08212857
0.08141091
0.080003135
0.07889601
0.078570716
0.07755023
0.07626569
0.07570253
0.07499158
0.07362555
0.073180474
0.07237211
0.071150064
0.07054392
0.06997928
0.06873441
0.06814961
0.06762027
0.06635632
0.0659662
0.06531155
0.064101666
0.06359727
0.06306888
0.061920203
0.061462086
0.060887553
0.059742592
0.059455015
0.05883744
0.05773065
0.05728402
0.056785017
0.055760223
0.055312395
0.054937657
0.05377043
0.053547926
0.05285124
0.051970776
0.051552504
0.051113736
0.050167583
0.049751807
0.049342327
0.04843083
0.04807324
0.047599427
0.046699632
0.046346717
0.046011552
0.045081485
0.044724204
0.04429587
0.043553274
0.04309176
0.042802945
0.042056

In [22]:
print(sess.run(predict_net, feed_dict={input_state: [[1]]}))

[[-0.5819767 -0.5794035]]


In [23]:
print(sess.run(target_net, feed_dict={input_state: [[1]]}))

[[0.04790387 0.10965639]]
