In [2]:
import numpy as np
import pickle
import tensorflow as tf
import sys

if sys.platform == "win32":
    sys.path.append(r"C:\Users\vik\Dropbox\Code\Python\structural_engineering")
else:
    sys.path.append("/home/ritchie46/Dropbox/Code/Python/structural_engineering")

from anastruct.fem.system import SystemElements
import matplotlib.pyplot as plt
%matplotlib inline
import math

In [37]:

class Environment:
    def __init__(self, length=3, height=2, optimize='moment'):
        self.length = length
        self.height = height
        self.state = None
        self.n = None
        self.actions_chosen = None
        self.no_action = None
        self.action_space = {0, 1, 2, 3, 4, 5, 6, 7}
        self.valid_actions = None
        self.optimize = optimize
        self.result_map = {}
        
        # actions
        right = 0
        left = 4
        up = 2
        down = 6
        up_right = 1
        up_left = 3
        down_right = 7
        down_left = 5
        
        # If the state is a flattened array. This maps to the index displacements.
        self.move_map = {right: 1,
                         left: -1,
                         up: -length,
                         down: length,
                         up_right: -length + 1,
                         up_left: -length - 1,
                         down_right: length + 1,
                         down_left: length -1}
            
    def reset(self):
        self.state = np.zeros((self.height, self.length))
        self.n = 1
        self.actions_chosen = 0
        self.state[-1][0] = self.n
        self.det_valid_actions()
        
        return self.state.ravel()
#         # valid action encoding
#         a = np.zeros(8)
#         a[np.array(self.valid_actions)] = 1
 
#         return np.concatenate((self.state.ravel(), a))
    
    def return_action(self, r):
        done = False
        
        # Bridge is build
        if self.state[-1][-1] != 0:
            r = r + 10 - self.structure()**2 # that is moment to the power 2
            done = True
            return self.state, r, done

        s = self.state / np.max(self.state)
        zero_mask = np.where(s == 0)
        s[s < 1] = 0.5
        s[zero_mask] = 0
        self.det_valid_actions()
        
        # valid action encoding
        a = np.zeros(8)
        
        try:
            a[np.array(self.valid_actions)] = 1
        except IndexError:
            done = True
            r -= 2
        
        return self.state.ravel(), r, done
            
#         return np.concatenate((self.state.ravel(), a)), r, done
    
    def det_valid_actions(self):
        no_action = set()
        right = 0
        left = 4
        top = 2
        down = 6
        top_right = 1
        top_left = 3
        down_right = 7
        down_left = 5
        
        # current location
        row, col = np.where(self.state == self.n)
                
        # right:
        try:
            if self.state[row, col + 1] != 0:
                no_action.add(right)
        except IndexError:
            no_action.add(right)
 
        if col - 1 < 0:
            no_action.add(left)
        elif self.state[row, col - 1] != 0:
            no_action.add(left)

        if row - 1 < 0:
            no_action.add(top)
        elif self.state[row - 1, col] != 0:
            no_action.add(top)
            
        try:
            if self.state[row + 1, col] != 0:
                no_action.add(down)
        except IndexError:
            no_action.add(down)
            
        if col -1 < 0 or row + 1 == self.height:
            no_action.add(down_left)
        elif self.state[row + 1, col - 1] != 0:
            no_action.add(down_left)

        try:
            if self.state[row + 1, col + 1] != 0:
                no_action.add(down_right)
        except IndexError:
            no_action.add(down_right)
            
        if row - 1 < 0 or col - 1 < 0:
            no_action.add(top_left)
        elif self.state[row -1, col - 1] != 0:
            no_action.add(top_left)

        if row - 1 < 0 or col + 1 == self.length:
            no_action.add(top_right)
        elif self.state[row - 1, col + 1] != 0:
                no_action.add(top_right)
            
        self.no_action = no_action
        self.valid_actions = list(self.action_space - no_action)
        
    
    def step(self, a):
        """
        :param a: (int) action direction
        
        → 0
        ↗ 1
        ↑ 2
        ↖ 3
        ← 4
        ↙ 5
        ↓ 6
        ↘ 7
        """
        self.actions_chosen += 1

        flat_location_index = np.argwhere(self.state.ravel() == self.n)
                
        if a in self.no_action:
            return self.return_action(-0.2)
        
        # there is a valid action
        self.n += 1
        
        move = self.move_map[a]
        self.state.ravel()[flat_location_index + move] = self.n
            
        return self.return_action(-0.1)
    
    def structure(self):
        ss = SystemElements()
        last_loc = [0, 0]
        for i in range(2, self.n + 1):
            row, col = np.where(i  == self.state)
            
            y = self.height - 1 - row[0]
            x = col[0] 

            current_loc = [x, y]
            ss.add_element([last_loc, [x, y]])
            last_loc = current_loc
        
        n_nodes = len(ss.node_map)
        forces = -5 / (n_nodes - 2)
        for i in range(2, n_nodes):
            ss.point_load(node_id=i, Fz=forces)
  
        ss.add_support_hinged(1)
        ss.add_support_hinged(len(ss.node_map))
        ss.solve()
        
        f_max = np.max(np.abs(ss.get_element_result_range(self.optimize)))
#         if f_max not in self.result_map:
#             self.result_map[f_max] = ss
        
        return f_max

    

        
def test_env():
    env = Environment(4, 2)
    s = env.reset()
    print(s, s.size)
    print(env.step(1))

    env.step(0)
    print(env.state, "\n")
    env.step(7)
    print(env.state)
    env.structure()
    
test_env()

[ 0.  0.  0.  0.  1.  0.  0.  0.] 8
(array([ 0.,  2.,  0.,  0.,  1.,  0.,  0.,  0.]), -0.1, False)
[[ 0.  2.  3.  0.]
 [ 1.  0.  0.  0.]] 

[[ 0.  2.  3.  0.]
 [ 1.  0.  0.  4.]]


In [158]:
# https://theneuralperspective.com/2016/11/25/reinforcement-learning-rl-policy-gradients-i/

class Agent:
    def __init__(self, data_size, hidden_size, action_space, learning_rate):
        """
        :param data_size: (int) Columns of the data vector.
        :param hidden_size: (int) No. of hidden nodes.
        :param action_space: (int) No. of outputs.
        :param learning_rate: (flt)
        """
        # Step 1: Feed forward
        # The argmax is the maximum Q-value.
        self.input_s = tf.placeholder(tf.float32, [None, data_size], name="input_s")
        self.w1 = tf.get_variable("w1", shape=[data_size, hidden_size[0]], initializer=tf.contrib.layers.xavier_initializer())
        self.b1 = tf.get_variable("b1", shape=(hidden_size[0], ), initializer=tf.zeros_initializer())
        self.layer_1 = tf.nn.relu(tf.matmul(self.input_s, self.w1) + self.b1)
        
        self.w2 = tf.get_variable("w2", shape=[hidden_size[0], hidden_size[1]], initializer=tf.contrib.layers.xavier_initializer())
        self.b2 = tf.get_variable("b2", shape=(hidden_size[1], ), initializer=tf.zeros_initializer())
        self.layer_2 = tf.nn.relu(tf.matmul(self.layer_1, self.w2) + self.b2)
                
        self.w_out = tf.get_variable("w_out", shape=[hidden_size[1], action_space], initializer=tf.contrib.layers.xavier_initializer())
        self.b_out = tf.get_variable("b_out", shape=(action_space, ), initializer=tf.zeros_initializer())
        
        # argmax(Q(s, a)) 
        self.predict_Q = tf.matmul(self.layer_2, self.w_out) + self.b_out # actual Q-value
        self.p = tf.nn.softmax(self.predict_Q)
        self.Q_a = tf.argmax(self.predict_Q, 1)
        self.saver = tf.train.Saver()

        
        # Step 2: Determine loss / gradients. 
        # One hot encoded actions
        self.executed_actions = tf.placeholder(tf.int32, name="executed_actions")
        
        self.one_hot = tf.one_hot(self.executed_actions, 8)
        self.Q = tf.reduce_sum(tf.multiply(self.predict_Q, self.one_hot), axis=1)
        self.next_Q_r = tf.placeholder(tf.float32, name="next_Q")

         # Loss
         # mse: (     target      -    prediction)^2
         #      r + max(Q(s', a') -    Q(s, a) )^2
        
        self.loss = tf.reduce_sum(tf.square(self.next_Q_r - self.Q))        
        optimizer = tf.train.AdamOptimizer(learning_rate)
        self.train_count = tf.Variable(0, trainable=False)
        self.train = optimizer.minimize(self.loss, self.train_count)


In [5]:
def discounted_reward(r, gamma):
    """
    The reward for a given state. Is the reward for that state + the discounted sum of future rewards.
    
    :param r: (array) Rewards.
    :param gamma: (flt) Discount factor
    """
    return np.cumsum(r * gamma**(np.arange(len(r)))[::-1])[::-1]


In [6]:
env = Environment(4, 2)
env.reset().size

8

In [None]:
from collections import deque
# %matplotlib inline
# %matplotlib notebook
# # %load_ext autoreload
# # %autoreload 2


# fig = plt.figure(figsize=(12, 6))
# ax = fig.add_subplot(111)
# fig.show()
# fig.canvas.draw()


env = Environment(3, 1, "moment")

H = [16, 16] # hidden neurons
D = 3 # input (state of the environment)
learning_rate = 1e-3
gamma = 0.99 # discount factor
epochs = 50000
max_frames = 50
action_space = 8

contin = 0

if not contin:
    tf.reset_default_graph()
    agent = Agent(D, H, action_space, learning_rate)
    init = tf.global_variables_initializer()
    #with tf.Session() as sess:
    sess = tf.Session()
    sess.run(init)
    buffer = deque()

scores= []
n_done = 0

last_ep = 0

#https://github.com/awjuliani/DeepRL-Agents/blob/master/Q-Network.ipynb
n_updates = 0
for ep in range(epochs):
    if (ep + 1) % 300 == 0:
        print(np.mean(scores[-500:]), "train_count", train_count, "loss", loss)
        
    s = env.reset()
    s = [s]
    for c in range(max_frames):
        
        p, Q = sess.run([agent.p, agent.predict_Q], {agent.input_s: s})
        
        a = np.random.choice(np.arange(8), p=p[0]) # choose an action index
        s_new, r, done = env.step(a)
        scores.append(r)

        buffer.append([s, a, r, s_new])
        
        if len(buffer) > 5000:
            buffer.pop()
        s = [s_new]
        
        if done:
                           
            if len(buffer) > 2000 and c % 5 == 0:
                batch = np.vstack(buffer)
                batch = batch[np.random.randint(0, 2000, size=1500)]

                s = np.vstack(batch[:, 0])
                s_new = np.vstack(batch[:, 3])
                r = batch[:, 2]
                a = batch[:, 1]  
                Q = sess.run(agent.predict_Q, {agent.input_s: s})
                Q_new = sess.run(agent.predict_Q, {agent.input_s: s_new})
                max_Q_new = np.max(Q_new, 1)

                target_Q = (r + gamma * max_Q_new)
                
                
                train_count, Q_, one_hot, loss, _ = sess.run([agent.train_count, agent.Q, agent.one_hot, agent.loss, agent.train], 
                                                feed_dict={agent.input_s: s, 
                                                           agent.executed_actions: a, 
                                                           agent.next_Q_r: target_Q})
                
#                 print(one_hot.shape)
#                 print(one_hot[0, :])
#                 print("predict_Q", Q[0,:])
#                 print(Q_.shape)
#                 print(Q_[0])
                

            break



0.3054 train_count 15 loss 3898.21
0.4478 train_count 70 loss 16990.0
0.7954 train_count 107 loss 48269.6
0.9927 train_count 130 loss 88762.8
1.2139 train_count 144 loss 129566.0
1.2376 train_count 145 loss 131087.0
1.2771 train_count 148 loss 135273.0
1.2852 train_count 152 loss 153039.0
1.3482 train_count 154 loss 163115.0
1.3247 train_count 158 loss 173414.0
1.4037 train_count 159 loss 180652.0


In [140]:
np.sum(Q * np.eye(8)[np.array(a, dtype=int)], 1)

array([ 0.26007631, -0.34215826,  0.26007631, ...,  0.52565402,
        0.80317271,  0.80317271])

In [138]:
x = Q * np.eye(8)[np.array(a, dtype=int)]
x

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.        ,  0.26007631],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.34215826,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.        ,  0.26007631],
       ..., 
       [ 0.        ,  0.52565402,  0.        , ...,  0.        ,
        -0.        ,  0.        ],
       [ 0.80317271,  0.        ,  0.        , ...,  0.        ,
        -0.        ,  0.        ],
       [ 0.80317271,  0.        ,  0.        , ...,  0.        ,
        -0.        ,  0.        ]])

In [136]:
a

7

In [161]:
agent.saver.save(sess, "/home/ritchie46/Downloads/model_anastruct/model_bridge_4_2_moment.ckpt")


'/home/ritchie46/Downloads/model_anastruct/model_bridge_4_2_moment.ckpt'

In [19]:
agent.saver.save(sess, r"G:\bridge_builder\model_bridge_4_2_axial\model.ckpt")

'G:\\bridge_builder\\model_bridge_4_2_axial\\model.ckpt'

In [124]:
s = env.reset()


"""    
    → 0
    ↗ 1
    ↑ 2
    ↖ 3
    ← 4
    ↙ 5
    ↓ 6
    ↘ 7
"""

total_r = 0
j = 0
for a in [0, 0, 1, 1, 0, 0, 0]:
    j += 1

    print("\n", env.state)
    a_dst = sess.run(agent.predict_Q, {agent.input_s: [s]})
    a = np.argmax(a_dst)
    #a = np.random.choice(np.arange(8), p=a_dst[0])

    s, r, d = env.step(a)
    print(a)
    print(a_dst)
    total_r += r
    
#     if d == True:
#         j = 0
#         print(env.state)
#         break
#         #env.reset()
    
print("\r", total_r, end="")


 [[ 1.  0.  0.]]
5
[[-0.10357568 -0.11130837  0.08943851 -0.03797921 -0.03640079  0.09984766
   0.0517213  -0.13007079]]

 [[ 1.  0.  0.]]
5
[[-0.10357568 -0.11130837  0.08943851 -0.03797921 -0.03640079  0.09984766
   0.0517213  -0.13007079]]

 [[ 1.  0.  0.]]
5
[[-0.10357568 -0.11130837  0.08943851 -0.03797921 -0.03640079  0.09984766
   0.0517213  -0.13007079]]

 [[ 1.  0.  0.]]
5
[[-0.10357568 -0.11130837  0.08943851 -0.03797921 -0.03640079  0.09984766
   0.0517213  -0.13007079]]

 [[ 1.  0.  0.]]
5
[[-0.10357568 -0.11130837  0.08943851 -0.03797921 -0.03640079  0.09984766
   0.0517213  -0.13007079]]

 [[ 1.  0.  0.]]
5
[[-0.10357568 -0.11130837  0.08943851 -0.03797921 -0.03640079  0.09984766
   0.0517213  -0.13007079]]

 [[ 1.  0.  0.]]
5
[[-0.10357568 -0.11130837  0.08943851 -0.03797921 -0.03640079  0.09984766
   0.0517213  -0.13007079]]
 -1.4

In [50]:
np.random.random(8)

array([ 0.76904085,  0.296608  ,  0.9456555 ,  0.73741889,  0.83164985,
        0.82735085,  0.58143395,  0.64800572])

In [13]:
env = Environment(5, 4)
s = env.reset()
actions = s[-8:]
actions

array([ 1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.])

In [14]:
s = env.step(0)[0]
s = env.step(1)[0]
s = env.step(4)[0]
s = env.step(1)[0]
actions = s[-8:]
print(actions)
print(env.valid_actions)
env.state

[ 1.  1.  1.  1.  1.  0.  0.  1.]
[0, 1, 2, 3, 4, 7]


array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  5.,  0.,  0.],
       [ 0.,  4.,  3.,  0.,  0.],
       [ 1.,  2.,  0.,  0.,  0.]])

In [55]:
noise = np.random.random(8)
noise /= noise.sum()
noise.sum()

1.0

In [149]:

"""    
    → 0
    ↗ 1
    ↑ 2
    ↖ 3
    ← 4
    ↙ 5
    ↓ 6
    ↘ 7
"""
env.no_action

{5, 6, 7}