In [10]:
import tensorflow as tf #deep learning lib
import numpy as np  #handle matrices
import robotEnvGen #robot sim environment
import pickle
from robot_sim_vm_vp_dynamic_drate_w import S1Wrapper as S1Wrapper # s1 classifier lib
import random
from collections import deque

In [2]:
def create_environment(sim_time):
    """Build the robot simulation environment and return it with its action set.

    For every (period, motor) combination an ``S1Wrapper`` is constructed from
    the discretised system matrices of that period and the period's pickled
    classifier/scaler pair. The wrappers are grouped per period and handed to
    ``robotEnvGen.robotEnv`` together with the paths of the safety ("sss")
    classifier and scaler.

    Parameters
    ----------
    sim_time :
        Currently unused by this function; kept so existing callers that pass
        it keep working.

    Returns
    -------
    tuple
        ``(robot_env, robot_env.periods)`` - the environment and the attribute
        the rest of the notebook uses as the list of selectable actions.
    """
    periods = [1, 2, 3, 5]  # available periods
    motors = [150, 250, 400, 500]  # available motors
    sss_clf = 'sk20_clf/sss_p1_e20_NNClf.p'  # sss classifier for safety assurance
    sss_scaler = 'sk20_clf/sss_p1_e20_ClfScaler.p'
    env = 20  # disturbance magnitude - maximum

    def load_pickle(path):
        # SECURITY: unpickling executes arbitrary code; only load classifier
        # files that come from a trusted source.
        # The context manager closes the file handle, which the previous
        # ``pickle.load(open(...))`` form never did.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    s1_clf = []  # one entry per period, each holding one wrapper per motor
    for period in periods:
        clf_p = {'period': period, 'clfs_m': []}
        for motor in motors:
            # Everything is (re)loaded per motor on purpose: each wrapper
            # receives its own matrix and classifier objects, exactly as
            # before, so no state can be shared between wrappers.
            A = np.genfromtxt('dis_para/dis-paraAp' + str(period) + '.csv', delimiter=',')
            B = np.genfromtxt('dis_para/dis-paraBp' + str(period) + '.csv', delimiter=',')
            clf = load_pickle('sk20_clf/sss_p' + str(period) + '_e' + str(env) + '_NNClf.p')
            clf_scaler = load_pickle('sk20_clf/sss_p' + str(period) + '_e' + str(env) + '_ClfScaler.p')
            s1clf = S1Wrapper(A, B, motor, env, clf, clf_scaler)
            clf_p['clfs_m'].append({'motor': motor, 'clf': s1clf})
        s1_clf.append(clf_p)

    h = 0.001  # step size in seconds
    initial_state = np.asarray([[-0.2, 0.1, -0.1, 4.0, -3.5, -4.0]])
    robot_env = robotEnvGen.robotEnv(h, sss_clf, sss_scaler, s1_clf, initial_state)
    return robot_env, robot_env.periods  # sim_env and action
    

In [3]:
# Value handed to create_environment; note that the function body shown in
# this notebook accepts the argument but never reads it.
sim_time = 2
# `sim` is the robot environment, `sim_action` is its `periods` attribute,
# which the following cells treat as the set of selectable actions.
sim, sim_action = create_environment(sim_time)

In [5]:
# ---- Model hyperparameters ----
state_size = [6] # network input shape: the current physical state (6 values)
action_size = len(sim_action) # one network output per selectable action
learning_rate = 0.0002 # alpha, passed to the RMSProp optimizer

# ---- Training parameters ----
total_episodes = 500 # total number of training episodes
batch_size = 64 # number of experiences sampled from memory per update

# ---- Exploration parameters for the epsilon-greedy strategy ----
# epsilon = explore_stop + (explore_start - explore_stop) * exp(-decay_rate * decay_step)
explore_start = 1.0
explore_stop = 0.01
decay_rate = 0.0001

# ---- Q-learning parameter ----
gamma = 0.95 # discount rate applied to the best next-state Q value

# ---- Replay memory parameters ----
# NOTE(review): `pertain_length` looks like a typo for "pretrain_length"; the
# name is kept because the memory pre-fill cell refers to it.
pertain_length = batch_size # number of experiences stored in memory when it is initialized
memory_size = 1000000 # maximum number of experiences kept

# Gate for the training cell: the training loop only runs when this is True.
training = True



In [6]:
class DQNetwork:
    """Fully connected Q-network built with the TensorFlow 1.x graph API.

    Graph layout (all created inside ``tf.variable_scope(name)``):

    * ``inputs_``   - float32 placeholder of shape ``[None, *state_size]``
    * ``actions_``  - float32 placeholder of shape ``[None, action_size]``
    * ``target_Q``  - float32 placeholder of shape ``[None]``
    * ``fc1``       - ReLU dense layer with ``state_size[0] * 2`` units
    * ``fc2``       - ReLU dense layer with ``action_size`` units
    * ``output``    - linear dense layer with ``action_size`` units (Q values)
    * ``Q``         - row-wise sum of ``output * actions_``; when ``actions_``
      is one-hot this is the Q value of the selected action
    * ``loss``      - mean squared difference between ``target_Q`` and ``Q``
    * ``optimizer`` - RMSProp minimisation step on ``loss``
    """

    def __init__(self, state_size, action_size, learning_rate, name='DQNetworks'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        def dense(layer_input, width, nonlinearity, **extra):
            # Every layer gets its own freshly created Xavier initializer.
            return tf.layers.dense(inputs=layer_input,
                                   units=width,
                                   activation=nonlinearity,
                                   kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                   **extra)

        with tf.variable_scope(name):
            # Placeholders fed at run time.
            self.inputs_ = tf.placeholder(tf.float32, [None] + list(state_size), name='inputs')
            self.actions_ = tf.placeholder(tf.float32, [None, action_size], name='actions_')
            self.target_Q = tf.placeholder(tf.float32, [None], name='target')

            # Two hidden ReLU layers followed by a linear Q-value head.
            self.fc1 = dense(self.inputs_, state_size[0] * 2, tf.nn.relu, name='fc1')
            self.fc2 = dense(self.fc1, action_size, tf.nn.relu, name='fc2')
            self.output = dense(self.fc2, action_size, None)

            # Q value attributed to the fed action mask.
            masked_q = tf.multiply(self.output, self.actions_)
            self.Q = tf.reduce_sum(masked_q, axis=1)

            # Mean squared TD error and its RMSProp training op.
            td_error = self.target_Q - self.Q
            self.loss = tf.reduce_mean(tf.square(td_error))
            rmsprop = tf.train.RMSPropOptimizer(self.learning_rate)
            self.optimizer = rmsprop.minimize(self.loss)

In [7]:
# Clear the default graph before the network's ops are added to it.
tf.reset_default_graph()
# NOTE(review): this rebinds the name `DQNetwork` from the class to an
# instance of it. Later cells rely on `DQNetwork` being the instance, but
# re-running this cell without first re-running the class definition cell
# will fail, because the instance is not callable.
DQNetwork = DQNetwork(state_size, action_size, learning_rate)

In [11]:
class Memory():
    """Fixed-capacity experience replay buffer.

    Experiences are kept in a ``deque`` with ``maxlen=max_size``, so once the
    buffer is full the oldest experience is dropped for every new one added.
    """

    def __init__(self, max_size):
        """Create an empty buffer holding at most ``max_size`` experiences."""
        self.buffer = deque(maxlen = max_size)

    def add(self, experience):
        """Append one experience (any object) to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences chosen uniformly at random.

        Raises
        ------
        ValueError
            Raised by ``np.random.choice`` when ``batch_size`` is larger than
            the number of stored experiences (sampling is without replacement).
        """
        buffer_size = len(self.buffer)
        # Fixed: the original called the non-existent `np.arrange` and used the
        # undefined name `false`, so this method could never run.
        index = np.random.choice(np.arange(buffer_size), size = batch_size, replace = False)
        return [self.buffer[i] for i in index]

In [14]:
# Create the replay memory and pre-fill it with `pertain_length` experiences
# gathered by taking uniformly random actions, so that the first training
# update already has a full batch to sample from.
memory = Memory(max_size = memory_size)

# Start state used for every (re)start of the simulation in this cell.
initial_state = np.asarray([[-0.2, 0.1, -0.1, 4.0, -3.5, -4.0]])

# A fresh copy is passed on every reset, as the original code built a new
# array each time, so the simulator never holds a reference to our constant.
sim.reset(initial_state.copy())
state = sim.robot.state
for i in range(pertain_length):
    action = random.choice(sim_action)
    next_state, reward, done = sim.period_run(1, action)
    memory.add((state, action, reward, next_state, done))
    if done:
        # Fixed: the original called the misspelt `sim.reste(...)`. The state
        # is read the same way as at the start of the cell (`sim.robot.state`)
        # instead of through `sim.get_state()`, which nothing else here uses.
        sim.reset(initial_state.copy())
        state = sim.robot.state
    else:
        state = next_state

In [16]:
# Inspect the second stored experience: (state, action, reward, next_state, done).
memory.buffer[1]

(array([-0.196    ,  0.0965   , -0.104    ,  3.9881067, -3.4676091,
        -4.0279389]),
 3,
 0.21961252776465076,
 array([-0.19201189,  0.09303239, -0.10802794,  3.97652669, -3.43591778,
        -4.05562767]),
 0.0)

In [17]:
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    """Choose an action with an exponentially decaying epsilon-greedy policy.

    The exploration probability is

        explore_stop + (explore_start - explore_stop) * exp(-decay_rate * decay_step)

    With that probability a uniformly random element of ``actions`` is
    returned; otherwise the Q-network is evaluated on ``state`` and the action
    with the highest Q value is returned.

    Parameters
    ----------
    explore_start, explore_stop : float
        Exploration probability at ``decay_step == 0`` and its asymptotic value.
    decay_rate : float
        Exponential decay rate of the exploration probability.
    decay_step : int
        Number of decisions taken so far.
    state : numpy.ndarray
        Current state; a leading batch axis of size 1 is added before it is
        fed to the network.
    actions : sequence
        Selectable actions; position ``k`` corresponds to network output ``k``.

    Returns
    -------
    tuple
        ``(action, explore_probability)``.

    Notes
    -----
    The greedy branch reads the module-level names ``sess`` (an open TensorFlow
    session) and ``DQNetwork`` (the network instance); both must exist when
    that branch is taken.
    """
    # Draw the random number first, then compute the current epsilon.
    exp_exp_tradeoff = np.random.rand()
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    if explore_probability > exp_exp_tradeoff:
        # Exploration: random action.
        # Fixed: the original ignored the `actions` argument and read an
        # undefined global `possible_actions` instead.
        action = random.choice(actions)
    else:
        # Exploitation: evaluate the Q values of this state and take the best.
        Qs = sess.run(DQNetwork.output,
                      feed_dict={DQNetwork.inputs_: state.reshape((1, *state.shape))})
        choice = np.argmax(Qs)
        action = actions[int(choice)]

    return action, explore_probability

In [None]:
# Deep Q-learning training loop for the robot simulator.
#
# This cell was originally copied from a frame-based game example and referred
# to names that are never defined in this notebook (`game`, `stack_frames`,
# `stacked_frames`, `max_steps`, `possible_actions`, `write_op`, `writer`).
# It now drives `sim` through the same calls the memory pre-fill cell uses:
# `sim.reset(...)`, `sim.robot.state` and `sim.period_run(1, action)`.
# The TensorBoard summary step was removed because no summary op or writer is
# created anywhere in the notebook; add it back once those are defined.
saver = tf.train.Saver()

# Upper bound on the number of decisions per episode.
# NOTE(review): the notebook never defined `max_steps`; 100 is a placeholder -
# confirm the intended episode length.
max_steps = 100

# Start state for every episode (same values as the memory pre-fill cell).
initial_state = np.asarray([[-0.2, 0.1, -0.1, 4.0, -3.5, -4.0]])

# Action list in a fixed order: position k corresponds to network output k.
possible_actions = list(sim_action)
action_to_index = {act: idx for idx, act in enumerate(possible_actions)}

if training:
    with tf.Session() as sess:
        # Initialize the variables
        sess.run(tf.global_variables_initializer())

        # Number of decisions taken so far (drives the epsilon decay)
        decay_step = 0

        for episode in range(total_episodes):
            # Rewards collected during this episode
            episode_rewards = []

            # Start a new episode and observe the first state. The state is
            # copied and flattened so stored experiences cannot alias simulator
            # internals and always match the network's [None, 6] input.
            sim.reset(initial_state.copy())
            state = np.array(sim.robot.state).ravel()

            for step in range(max_steps):
                decay_step += 1

                # Predict the action to take and take it
                action, explore_probability = predict_action(
                    explore_start, explore_stop, decay_rate,
                    decay_step, state, possible_actions)
                next_state, reward, done = sim.period_run(1, action)
                next_state = np.array(next_state).ravel()

                episode_rewards.append(reward)

                # Add experience to memory; st+1 is now our current state
                memory.add((state, action, reward, next_state, done))
                state = next_state

                ### LEARNING PART
                # Obtain random mini-batch from memory
                batch = memory.sample(batch_size)
                states_mb = np.array([np.ravel(each[0]) for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([np.ravel(each[3]) for each in batch])
                dones_mb = np.array([each[4] for each in batch], dtype=bool)

                # The network expects actions as one-hot rows, while memory
                # stores the raw action value, so translate value -> index.
                actions_mb = np.zeros((len(batch), DQNetwork.action_size), dtype=np.float32)
                taken = [action_to_index[each[1]] for each in batch]
                actions_mb[np.arange(len(batch)), taken] = 1.0

                # Get Q values for next_state
                Qs_next_state = sess.run(DQNetwork.output,
                                         feed_dict={DQNetwork.inputs_: next_states_mb})

                # Q_target = r if the episode ends at s+1,
                # otherwise Q_target = r + gamma * max_a' Q(s', a')
                targets_mb = np.where(dones_mb,
                                      rewards_mb,
                                      rewards_mb + gamma * np.max(Qs_next_state, axis=1))

                loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer],
                                   feed_dict={DQNetwork.inputs_: states_mb,
                                              DQNetwork.target_Q: targets_mb,
                                              DQNetwork.actions_: actions_mb})

                # The episode is over once the simulator reports `done`
                if done:
                    break

            # Per-episode report. It is printed after the loop so `loss` is
            # always defined and episodes that hit `max_steps` are reported too.
            total_reward = np.sum(episode_rewards)
            print('Episode: {}'.format(episode),
                  'Total reward: {}'.format(total_reward),
                  'Training loss: {:.4f}'.format(loss),
                  'Explore P: {:.4f}'.format(explore_probability))

            # Save model every 5 episodes
            # NOTE(review): the ./models directory presumably has to exist
            # before saving - confirm, or create it beforehand.
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")