In [2]:
import tensorflow as tf

In [3]:
def create_net(input_state, variable_scope, layer_list, trainable):
    with tf.variable_scope(variable_scope):
        layer1 = tf.layers.Dense(units=10, activation=tf.nn.relu,
                                 name="layer", trainable=trainable)
        layer_list.append(layer1)
        layer1_output = layer1(input_state)
        layer2 = tf.layers.Dense(units=10, activation=tf.nn.relu, 
                                 name="layer", trainable=trainable)
        layer_list.append(layer2)
        layer2_output = layer2(layer1_output)
        output_layer = tf.layers.Dense(units=2, name="layer", trainable=trainable)
        layer_list.append(output_layer)
        output = output_layer(layer2_output)

        return output

In [4]:
input_state = tf.placeholder(shape=[None, 1], dtype=tf.float32)
predict_layer_list = []
predict_net = create_net(input_state=input_state, variable_scope="predict",
                         layer_list=predict_layer_list, trainable=True)
target_layer_list = []
target_net = create_net(input_state=input_state, variable_scope="target",
                        layer_list=target_layer_list, trainable=False)

In [5]:
init_op = tf.global_variables_initializer()
sess = tf.Session()

In [6]:
sess.run(init_op)

In [7]:
print(sess.run(predict_net, feed_dict={input_state: [[1]]}))

[[0.507277   0.12137053]]


In [8]:
print(sess.run(target_net, feed_dict={input_state: [[1]]}))

[[-0.9207109  0.359486 ]]


### Copy net operation

In [9]:
for predict_layer, target_layer in zip(predict_layer_list, target_layer_list):
    predict_variable_list = predict_layer.variables
    target_variable_list = target_layer.variables
    for predict_variable, target_variable in zip(predict_variable_list, target_variable_list):
        assign_op = tf.assign(ref=target_variable, value=predict_variable)
        sess.run(assign_op)

In [10]:
print(sess.run(target_net, feed_dict={input_state: [[1]]}))

[[0.507277   0.12137053]]


## Learning Test

In [11]:
predict_action = tf.argmax(predict_net, axis=1)
predict_q_value = tf.reduce_max(predict_net, axis=1)

In [12]:
print(sess.run(predict_action, feed_dict={input_state: [[1]]}))
print(sess.run(predict_q_value, feed_dict={input_state: [[1]]}))

[0]
[0.507277]


In [13]:
experience = [1, 0, -1, 2, False]

In [14]:
target_q_value = tf.reduce_max(target_net, axis=1)

In [15]:
print(sess.run(target_q_value, feed_dict={input_state: [[1]]}))

[0.507277]


In [16]:
tf_reward = tf.placeholder(shape=[None, 1], dtype=tf.float32)

In [17]:
y = tf.add(tf_reward, target_q_value)

In [18]:
print(sess.run(y, feed_dict={input_state: [[1]], tf_reward: [[experience[2]]]}))

[[-0.492723]]


In [19]:
loss = tf.losses.mean_squared_error(predictions=predict_q_value, labels=y[0])

In [20]:
print(sess.run(loss, feed_dict={input_state: [[1]], tf_reward: [[experience[2]]]}))

1.0


In [21]:
train = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

In [22]:
for i in range(100):
  _, loss_value = sess.run((train, loss), feed_dict={input_state: [[1], [2], [3], [4]],
                                                     tf_reward: [[-1], [20], [-1], [-1]]})
  print(loss_value)

1.0
0.5557573
0.36027858
0.2602859
0.20458534
0.17175789
0.15157658
0.13871826
0.13024181
0.124451
0.12033624
0.11728244
0.114908025
0.11297298
0.11132486
0.1108696
0.10968442
0.10865992
0.10807277
0.106662445
0.10630287
0.105183646
0.10408578
0.10369764
0.10238564
0.10175186
0.10098032
0.09973967
0.09938598
0.09838202
0.09723218
0.09700884
0.09579592
0.09499042
0.09457454
0.09354406
0.09313699
0.09253445
0.09150589
0.09138288
0.0905114
0.089690536
0.08951609
0.088452496
0.08819915
0.08747854
0.08679997
0.08643759
0.08537757
0.08541115
0.08437812
0.08410912
0.08337147
0.082739726
0.082391724
0.081535116
0.0813808
0.08034004
0.08027365
0.07938866
0.07908096
0.078436404
0.07784341
0.07745394
0.07667856
0.076499015
0.075616956
0.075516015
0.0745208
0.074496225
0.0735982
0.07337348
0.07269667
0.07229336
0.07177359
0.07122156
0.070851035
0.0702427
0.069953
0.069203824
0.06904092
0.068206444
0.06812562
0.06723954
0.06723434
0.06634015
0.06634248
0.06544247
0.06538552
0.06461451
0.064396665
0

In [23]:
print(sess.run(predict_net, feed_dict={input_state: [[1]]}))

[[-0.08941148 -0.08794146]]


In [24]:
print(sess.run(target_net, feed_dict={input_state: [[1]]}))

[[0.507277   0.12137053]]


## Test the different net architect
Predict network need not only the current state input but also the action input

In [3]:
def create_predict_net(input_source, variable_scope, layer_list):
    with tf.variable_scope(variable_scope):
        layer1 = tf.layers.Dense(units=5, activation=tf.nn.relu, name="layer")
        layer_list.append(layer1)
        # Predict net takes 2 inputs, current state and action
        layer1_output = layer1(input_source)

        layer2 = tf.layers.Dense(units=5, activation=tf.nn.relu, name="layer")
        layer_list.append(layer2)
        layer2_output = layer2(layer1_output)

        output_layer = tf.layers.Dense(units=1, activation=None, name="layer")
        layer_list.append(output_layer)
        output = output_layer(layer2_output)

        return output

In [4]:
input_source = tf.placeholder(shape=(None, 2), dtype=tf.float32)

In [5]:
predict_layer_list = []
predict_net = create_predict_net(input_source=input_source, variable_scope="predict_net", layer_list=predict_layer_list)

In [6]:
init_op = tf.global_variables_initializer()
sess = tf.Session()

In [7]:
sess.run(init_op)

In [8]:
print(sess.run(predict_net, feed_dict={input_source: [[2, 1], [2, 0], [2, 2]]}))

[[ 0.        ]
 [-0.00300903]
 [-0.04383451]]


In [9]:
q_value = sess.run(predict_net, feed_dict={input_source: [[2, 1], [2, 0], [2, 2]]})

In [10]:
type(q_value)

numpy.ndarray

In [11]:
import numpy as np

In [12]:
print(np.argmax(q_value))

0


In [13]:
print(np.amax(q_value))

0.0


## Test numpy matrix creating

In [14]:
input_source = np.zeros((3, 2))
input_source

array([[0., 0.],
       [0., 0.],
       [0., 0.]])

In [15]:
action_list = np.arange(3)
action_list

array([0, 1, 2])

In [16]:
current_state = 101
current_state

101

In [17]:
for i in range(3):
    input_source[i] = (current_state, i)
input_source

array([[101.,   0.],
       [101.,   1.],
       [101.,   2.]])

In [18]:
from sklearn import preprocessing

In [19]:
input_source_normalised = preprocessing.normalize(input_source)
input_source_normalised

array([[1.        , 0.        ],
       [0.99995099, 0.0099005 ],
       [0.999804  , 0.0197981 ]])

In [24]:
temp = [[1]]
temp_np_array = np.asarray(temp)
temp_np_array

array([[1]])

In [25]:
import gym

In [27]:
env = gym.make("Taxi-v2")

In [29]:
print(env.action_space)
print(env.observation_space)

Discrete(6)
Discrete(500)
