In [5]:
import numpy as np
import pickle
import tensorflow as tf
import sys

if sys.platform == "win32":
    sys.path.append(r"C:\Users\vik\Dropbox\Code\Python\structural_engineering")
else:
    sys.path.append("/home/ritchie46/Dropbox/Code/Python/structural_engineering")

from anastruct.fem.system import SystemElements
import matplotlib.pyplot as plt
%matplotlib inline
import math

In [6]:

class Environment:
    def __init__(self, length=3, height=2, optimize='moment'):
        self.length = length
        self.height = height
        self.state = None
        self.n = None
        self.actions_chosen = None
        self.no_action = None
        self.action_space = {0, 1, 2, 3, 4, 5, 6, 7}
        self.valid_actions = None
        self.optimize = optimize
        self.result_map = {}
        self.ss = None
        self.current_distance = length - 1

        # actions
        right = 0
        left = 4
        up = 2
        down = 6
        up_right = 1
        up_left = 3
        down_right = 7
        down_left = 5
        
        # If the state is a flattened array. This maps to the index displacements.
        self.move_map = {right: 1,
                         left: -1,
                         up: -length,
                         down: length,
                         up_right: -length + 1,
                         up_left: -length - 1,
                         down_right: length + 1,
                         down_left: length -1}
            
    def reset(self):
        self.state = np.zeros((self.height, self.length))
        self.n = 1
        self.actions_chosen = 0
        self.state[-1][0] = self.n
        self.det_valid_actions()
        return self.state.ravel()
    
    def return_action(self, r):
        done = False
        self.det_valid_actions()
        s = np.array(self.state.ravel())
        i = np.where(s == self.n)
        s[s > 0] = -1
        s[i] = 1
                
        # Bridge is build
        if self.state[-1][-1] != 0:
            r = r + 5 - self.structure() / (0.5*5*self.length**2) # that is moment to the power 2
            done = True
            return s, r , done
        
        if len(self.valid_actions) == 0:
            done = True
            r -= 2
        
        return s, r  , done
    
    def det_valid_actions(self):
        no_action = set()
        right = 0
        left = 4
        top = 2
        down = 6
        top_right = 1
        top_left = 3
        down_right = 7
        down_left = 5
        
        # current location
        row, col = np.where(self.state == self.n)
                
        # right:
        try:
            if self.state[row, col + 1] != 0:
                no_action.add(right)
        except IndexError:
            no_action.add(right)
 
        if col - 1 < 0:
            no_action.add(left)
        elif self.state[row, col - 1] != 0:
            no_action.add(left)

        if row - 1 < 0:
            no_action.add(top)
        elif self.state[row - 1, col] != 0:
            no_action.add(top)
            
        try:
            if self.state[row + 1, col] != 0:
                no_action.add(down)
        except IndexError:
            no_action.add(down)
            
        if col -1 < 0 or row + 1 == self.height:
            no_action.add(down_left)
        elif self.state[row + 1, col - 1] != 0:
            no_action.add(down_left)

        try:
            if self.state[row + 1, col + 1] != 0:
                no_action.add(down_right)
        except IndexError:
            no_action.add(down_right)
            
        if row - 1 < 0 or col - 1 < 0:
            no_action.add(top_left)
        elif self.state[row -1, col - 1] != 0:
            no_action.add(top_left)

        if row - 1 < 0 or col + 1 == self.length:
            no_action.add(top_right)
        elif self.state[row - 1, col + 1] != 0:
                no_action.add(top_right)
            
        self.no_action = no_action
        self.valid_actions = list(self.action_space - no_action)
        
    
    def step(self, a):
        """
        :param a: (int) action direction
        
        → 0
        ↗ 1
        ↑ 2
        ↖ 3
        ← 4
        ↙ 5
        ↓ 6
        ↘ 7
        """
        self.actions_chosen += 1

        flat_location_index = np.argwhere(self.state.ravel() == self.n)
                                
        if a in self.no_action:
            return self.return_action(-0.2)
        
        # there is a valid action, make a move
        self.n += 1
        move = self.move_map[a]
        self.state.ravel()[flat_location_index + move] = self.n
        
        row, col = np.where(self.state == self.n)
        y = self.height - 1 - row[0]
        x = self.length - 1  - col[0]
        
#         distance = (x**2 + y**2)**0.5
#         d_distance = self.current_distance - distance
#         self.current_distance = distance

        return self.return_action(-0.1 )
    
    def structure(self):
        self.ss = SystemElements()
        last_loc = [0, 0]
        for i in range(2, self.n + 1):
            row, col = np.where(i  == self.state)
            
            y = self.height - 1 - row[0]
            x = col[0] 

            current_loc = [x, y]
            self.ss.add_element([last_loc, [x, y]])
            last_loc = current_loc
        
        n_nodes = len(self.ss.node_map)
        forces = -1 / (n_nodes - 2)
        for i in range(2, n_nodes):
            self.ss.point_load(node_id=i, Fz=forces)
  
        self.ss.add_support_hinged(1)
        self.ss.add_support_hinged(len(self.ss.node_map))
        self.ss.solve()
        
        f_max = np.max(np.abs(self.ss.get_element_result_range(self.optimize)))
        
        return f_max
    
    def sample_action(self):
        return np.random.randint(0, 8) #self.valid_actions[np.random.randint(0, len(self.valid_actions))]

    

        
def test_env():
    env = Environment(4, 4)
    s = env.reset()

    print(env.step(0))
    print(env.state)
    print(env.step(2))
    print(env.state)
    print(env.step(1))
    print(env.state)
    print(env.step(7))
    print(env.state)
    print(env.step(6))
    print(env.state)

#     env.step(0)
#     print(env.state, "\n")
#     env.step(7)
#     print(env.state)
#     env.structure()
    
test_env()

(array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -1.,
        1.,  0.,  0.]), -0.1, False)
[[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 1.  2.  0.  0.]]
(array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0., -1.,
       -1.,  0.,  0.]), -0.1, False)
[[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  3.  0.  0.]
 [ 1.  2.  0.  0.]]
(array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0., -1.,  0.,  0., -1.,
       -1.,  0.,  0.]), -0.1, False)
[[ 0.  0.  0.  0.]
 [ 0.  0.  4.  0.]
 [ 0.  3.  0.  0.]
 [ 1.  2.  0.  0.]]
(array([ 0.,  0.,  0.,  0.,  0.,  0., -1.,  0.,  0., -1.,  0.,  1., -1.,
       -1.,  0.,  0.]), -0.1, False)
[[ 0.  0.  0.  0.]
 [ 0.  0.  4.  0.]
 [ 0.  3.  0.  5.]
 [ 1.  2.  0.  0.]]
(array([ 0.,  0.,  0.,  0.,  0.,  0., -1.,  0.,  0., -1.,  0., -1., -1.,
       -1.,  0.,  1.]), 4.8895833333333334, True)
[[ 0.  0.  0.  0.]
 [ 0.  0.  4.  0.]
 [ 0.  3.  0.  5.]
 [ 1.  2.  0.  6.]]


In [7]:
# https://theneuralperspective.com/2016/11/25/reinforcement-learning-rl-policy-gradients-i/

def leaky_relu(x, alpha=0.1):
    return tf.maximum(x, alpha * x)

class Agent:
    def __init__(self, data_size, hidden_size, action_space, learning_rate, name):
        """
        :param data_size: (int) Columns of the data vector.
        :param hidden_size: (int) No. of hidden nodes.
        :param action_space: (int) No. of outputs.
        :param learning_rate: (flt)
        """
      
        if len(hidden_size) == 1:
            one_hidden_layer = True
            hidden_size.append(hidden_size[0])
        else:
            one_hidden_layer = False

        
        # Step 1: Feed forward
        # The argmax is the maximum Q-value.
        self.input_s = tf.placeholder(tf.float32, [None, data_size], name="input_s")
        self.w1 = tf.get_variable(f"{name}_w1", shape=[data_size, hidden_size[0]], initializer=tf.contrib.layers.xavier_initializer())
        self.b1 = tf.get_variable(f"{name}_b1", shape=(hidden_size[0], ), initializer=tf.zeros_initializer())
        self.layer_1 = leaky_relu(tf.matmul(self.input_s, self.w1) + self.b1)
        
        self.w2 = tf.get_variable(f"{name}_w2", shape=[hidden_size[0], hidden_size[1]], initializer=tf.contrib.layers.xavier_initializer())
        self.b2 = tf.get_variable(f"{name}_b2", shape=(hidden_size[1], ), initializer=tf.zeros_initializer())
        self.layer_2 = leaky_relu(tf.matmul(self.layer_1, self.w2) + self.b2)
                     
        self.w_out = tf.get_variable(f"{name}_w_out", shape=[hidden_size[1], action_space], initializer=tf.contrib.layers.xavier_initializer())
        self.b_out = tf.get_variable(f"{name}_b_out", shape=(action_space, ), initializer=tf.zeros_initializer())
        
        if one_hidden_layer:
            hidden_layer = self.layer_1
        else:
            hidden_layer = self.layer_2
        
        # argmax(Q(s, a)) 
        self.predict_Q = tf.matmul(hidden_layer, self.w_out) + self.b_out # actual Q-value
        self.p = tf.nn.softmax(self.predict_Q)
        self.Q_a = tf.argmax(self.predict_Q, 1)
        self.saver = tf.train.Saver()
        
        # Step 2: Determine loss / gradients. 
        # One hot encoded actions
        self.executed_actions = tf.placeholder(tf.int32)
        
        self.one_hot = tf.one_hot(self.executed_actions, 8)
        self.Q = tf.reduce_sum(tf.multiply(self.predict_Q, self.one_hot), axis=1)
        self.next_Q_r = tf.placeholder(tf.float32)

         # Loss
         # mse: (     target      -    prediction)^2
         #      r + max(Q(s', a') -    Q(s, a) )^2
        
        #self.loss = tf.losses.huber_loss(self.next_Q_r, - self.Q, delta=2)
#         self.clipped_error = tf.maximum(tf.abs(self.next_Q_r - self.Q), tf.ones(tf.shape(self.Q)))
#         self.loss = tf.reduce_sum(tf.square(self.clipped_error))       
        self.loss = tf.reduce_sum(tf.square(self.next_Q_r - self.Q))   
        optimizer = tf.train.AdamOptimizer(learning_rate)
        self.train_count = tf.Variable(0, trainable=False)
        self.train = optimizer.minimize(self.loss, self.train_count)
        
        
        
        
class FrozenAgent:
    def __init__(self, data_size):
        """
        :param data_size: (int) Columns of the data vector.
        """
        # Step 1: Feed forward
        # The argmax is the maximum Q-value.
        self.input_s = tf.placeholder(tf.float32, [None, data_size], name="input_s")
        
        # weights and biases
        self.wb = tuple([tf.placeholder(dtype=tf.float32) for _ in range(6)])
        
        w1 = self.wb[0]
        b1 = self.wb[1]
        w2 = self.wb[2]
        b2 = self.wb[3]
        w_out = self.wb[4]
        b_out = self.wb[5]
        
        self.layer_1 = leaky_relu(tf.matmul(self.input_s, w1) + b1)
#         self.layer_2 = leaky_relu(tf.matmul(self.layer_1, w2) + b2)
             
        self.predict_Q = tf.matmul(self.layer_1, w_out) + b_out # actual Q-value


In [155]:
def discounted_reward(r, gamma):
    """
    The reward for a given state. Is the reward for that state + the discounted sum of future rewards.
    
    :param r: (array) Rewards.
    :param gamma: (flt) Discount factor
    """
    return np.cumsum(r * gamma**(np.arange(len(r)))[::-1])[::-1]

def prepare_update_target(trainables, tau=0.05):
    """
    The weights and biases of the target will be a depended of the primary network.
    
    wb[target] = t
    
    This is a tensorflow operation and still needs to be run with Session.run(operation_holder)
    
    :trainables (tf.trainable_variables)
    :tau (flt) Rate to update the target graph
    """
    
    operation_holder = []
    n_variables = len(trainables) // 2 # the agent has half and the target has half
    
    for i, v in enumerate(trainables[0: n_variables]):
        operation_holder.append(trainables[i + n_variables].assign(v.value() * tau) +
                                ((1 - tau) * trainables[i + n_variables].value()))
    return operation_holder

def update_target(operation_holder, session):
    for op in operation_holder:
        session.run(op)
    

In [157]:

from collections import deque
# %matplotlib inline
# %matplotlib notebook
# # %load_ext autoreload
# # %autoreload 2


# fig = plt.figure(figsize=(12, 6))
# ax = fig.add_subplot(111)
# fig.show()
# fig.canvas.draw()

env = Environment(3, 2, "moment")
env.reset()

H = [16] # hidden neurons
D = env.state.size # input (state of the environment)
learning_rate = 1e-3
gamma = 0.99 # discount factor
epochs = 50000
max_frames = 500
action_space = 8

contin = 0

if not contin:
    eps = 0.3
    tf.reset_default_graph()
    agent = Agent(D, H, action_space, learning_rate, "agent")
    target = Agent(D, H, action_space, 0, "target")
    
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    buffer = deque()
    
    # the first half of the list are the variables of the agent.
    # The last half the variables of the target
    variables = tf.trainable_variables() 
    operation_holder = prepare_update_target(variables)
    

scores= []
n_done = 0

last_ep = 0


#https://github.com/awjuliani/DeepRL-Agents/blob/master/Q-Network.ipynb
train_count = 0
target_update_count = 0
loss = 0
variable_update_help = None
for ep in range(epochs):
    if eps > 0.001:
        eps *= 0.999

    if (ep + 1) % 100 == 0:
        print(np.mean(scores), "train_count", train_count, "target_count", target_update_count,
              "loss", loss, "eps", eps)
        scores = []
        
    s = env.reset()
    s = [s]
    for c in range(max_frames):
        
        Q = sess.run(agent.predict_Q, {agent.input_s: s})
        
        if np.random.rand(1) < eps:
            a = env.sample_action()
        else:
            a = np.argmax(Q)
        
        s_new, r, done = env.step(a)
        scores.append(r)

        buffer.append([s, a, r, s_new, done])
        
        if len(buffer) > 5000:
            buffer.pop()
        s = [s_new]
        
        if done:
        
            if len(buffer) >= 5000:
                batch = np.vstack(buffer)
                batch = batch[np.random.randint(0, 5000, size=1000)]

                s = np.vstack(batch[:, 0])
                s_new = np.vstack(batch[:, 3])
                r = batch[:, 2]
                a = batch[:, 1]
                done_ = np.array(batch[:, 4], dtype=bool)
                Q = sess.run(agent.predict_Q, {agent.input_s: s})
                Q_new = sess.run(target.predict_Q, {target.input_s: s_new})
                max_Q_new = np.max(Q_new, 1)

                target_Q = (r + gamma * max_Q_new)
                target_Q[done_] = r[done_]
  
                train_count, loss, _ = sess.run([agent.train_count, agent.loss, agent.train], 
                                                feed_dict={agent.input_s: s, 
                                                           agent.executed_actions: a, 
                                                           agent.next_Q_r: target_Q})

                if c % 3 == 0:
                    # update target network
                    target_update_count += 1
                    update_target(operation_holder, sess)
        
    
                                   
            break
            



0.116247427468 train_count 0 target_count 0 loss 0 eps 0.2714376441341128
0.0162883100961 train_count 0 target_count 0 loss 0 eps 0.2455946488435909
0.0599824019159 train_count 63 target_count 20 loss 770.631 eps 0.22221210964683
0.148132212666 train_count 163 target_count 43 loss 416.57 eps 0.20105577180202236
1.67631473098 train_count 263 target_count 48 loss 221.547 eps 0.18191368345835565


KeyboardInterrupt: 

In [161]:
agent.saver.save(sess, "/home/ritchie46/Downloads/model_anastruct/model_bridge_4_2_moment.ckpt")


'/home/ritchie46/Downloads/model_anastruct/model_bridge_4_2_moment.ckpt'

In [19]:
agent.saver.save(sess, r"G:\bridge_builder\model_bridge_4_2_axial\model.ckpt")

'G:\\bridge_builder\\model_bridge_4_2_axial\\model.ckpt'

In [20]:
env = Environment(3, 2)
s = env.reset()


"""    
    → 0
    ↗ 1
    ↑ 2
    ↖ 3
    ← 4
    ↙ 5
    ↓ 6
    ↘ 7
"""

bot = target

total_r = 0
j = 0
for a in [1, 0, 0, 0, 0]:
    j += 1

    print("\n", env.state)
    
    
    a_dst = sess.run(bot.predict_Q, {bot.input_s: [s]})
    a = np.argmax(a_dst)
    #a = np.random.choice(np.arange(8), p=a_dst[0])

    s, r, d = env.step(a)

    print(a_dst)
    print("action", a)
    print(r)
    total_r += r
    
#     if d == True:
#         j = 0
#         print(env.state)
#         break
#         #env.reset()
    
print("\r", total_r, end="")


 [[ 0.  0.  0.]
 [ 1.  0.  0.]]
[[ 0.0157699  -0.00472315 -0.00631001 -0.0119287  -0.01288951 -0.00707795
   0.01728024  0.0105312 ]]
action 6
-0.2

 [[ 0.  0.  0.]
 [ 1.  0.  0.]]
[[ 0.0157699  -0.00472315 -0.00631001 -0.0119287  -0.01288951 -0.00707795
   0.01728024  0.0105312 ]]
action 6
-0.2

 [[ 0.  0.  0.]
 [ 1.  0.  0.]]
[[ 0.0157699  -0.00472315 -0.00631001 -0.0119287  -0.01288951 -0.00707795
   0.01728024  0.0105312 ]]
action 6
-0.2

 [[ 0.  0.  0.]
 [ 1.  0.  0.]]
[[ 0.0157699  -0.00472315 -0.00631001 -0.0119287  -0.01288951 -0.00707795
   0.01728024  0.0105312 ]]
action 6
-0.2

 [[ 0.  0.  0.]
 [ 1.  0.  0.]]
[[ 0.0157699  -0.00472315 -0.00631001 -0.0119287  -0.01288951 -0.00707795
   0.01728024  0.0105312 ]]
action 6
-0.2
 -1.0

In [82]:
#update_target(operation_holder, sess)
sess.run(variables[0].value() - variables[6].value())

array([[-1.47237194, -0.45450115, -1.1099143 ,  0.40738916, -0.14500076,
        -1.27405345, -1.88907146, -1.09437132, -1.61892748, -1.89695144,
        -0.40908128, -0.8260721 , -1.37111664, -1.85722709, -0.28329837,
         0.00418878],
       [-0.82942849, -0.06093037, -0.98205984, -1.03273547, -0.73091006,
        -1.58550084,  0.10534322, -0.96237028, -0.77367294, -0.5560708 ,
        -0.57345325, -1.33160257, -0.61364865,  0.44076538, -0.6399169 ,
        -1.78533554],
       [-0.37469065, -1.39439011, -1.14717066, -0.95678383, -0.83726352,
        -0.88811439, -1.0354259 , -1.96661639, -0.09760374, -1.15479875,
        -0.92978543, -1.10216045, -0.7275157 , -0.92905378,  0.5746758 ,
        -0.68988132],
       [-1.44932294, -0.95880038, -0.84693551, -0.9915266 , -1.32087588,
        -1.57927155, -1.47370005, -0.30102974, -1.3647579 , -0.76708555,
        -1.69573951, -1.70829821, -1.13219559, -1.22856677, -0.95958227,
        -0.4713431 ],
       [-1.27410245, -0.45723015,  0

In [13]:
env = Environment(5, 4)
s = env.reset()
actions = s[-8:]
actions

array([ 1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.])

In [14]:
s = env.step(0)[0]
s = env.step(1)[0]
s = env.step(4)[0]
s = env.step(1)[0]
actions = s[-8:]
print(actions)
print(env.valid_actions)
env.state

[ 1.  1.  1.  1.  1.  0.  0.  1.]
[0, 1, 2, 3, 4, 7]


array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  5.,  0.,  0.],
       [ 0.,  4.,  3.,  0.,  0.],
       [ 1.,  2.,  0.,  0.,  0.]])

In [39]:
scores

[array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.2]),
 array([-1.1266901]),
 array([-1.2266901]),
 array([-1.2266901]),
 array([-1.2266901]),
 array([-1.2266901]),
 array([-1.2266901]),
 array([-1.2266901]),
 array([-1.2266901]),
 array([-1.2266901]),
 array([-0.96334002]),
 array([-1.19628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.29628939]),
 array([-1.296289

In [149]:

"""    
    → 0
    ↗ 1
    ↑ 2
    ↖ 3
    ← 4
    ↙ 5
    ↓ 6
    ↘ 7
"""
env.no_action

{5, 6, 7}