# Dagger RL Simple Implementation  
Dagger is a reinforcement learning algorithm for imitation learning/behaviour cloning, introduced in the paper https://www.cs.cmu.edu/~sross1/publications/Ross-AIStats11-NoRegret.pdf  
It uses initial expert knowledge (usually human-labeled data) to perform supervised learning of the agent's policy (a mapping from observations to actions). After the agent learns an initial policy from the expert data, it uses that policy to act in the real environment and stores those experiences (observations). These real observations are then passed to the expert to be labeled (i.e. the expert's actions are added), and the labeled observations are added to the training set. The agent's policy is then trained again, this time on the new augmented data set, and the cycle is repeated.  
  
The main trick in Dagger is dataset augmentation from the agent's own experience. The initial expert dataset is limited, and it is very likely that the agent will diverge from the expert's path and encounter new states. The initial policy is almost useless in those new situations. By obtaining expert labels for those new observations and retraining the policy, the agent becomes more robust to path perturbations.  
  
This implementation was made for the UC Berkeley course CS 294 Deep Reinforcement Learning. It is a naive implementation (currently still unfinished) that extends an earlier ordinary imitation learning technique, uses the provided expert policy for gathering the expert's dataset, and acts in a MuJoCo environment.  

In [None]:
import cPickle as pickle
import numpy as np
import random
import math
import gym
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def data_preprocessing(train_data):
    """Center the observations and pair each one with its action.

    Only mean subtraction is performed; no scaling/normalization is applied.

    Args:
        train_data: dict with NumPy arrays under the keys 'observations'
            (2-D, one row per sample) and 'actions' (the action size is read
            from the last axis, so both (N, action_dim) and (N, 1, action_dim)
            layouts are accepted).

    Returns:
        Tuple ``(data_mean, train_data, input_dim, action_dim, data_combined)``:
        the per-feature observation mean, the same dict with its observations
        centered in place, the observation width, the action width, and a list
        of ``(observation, action)`` pairs.
    """

    data_mean = np.mean(train_data['observations'], axis=0)
    # In-place on purpose: the returned dict (and the pairs built below) must
    # hold the centered observations.
    train_data['observations'] -= data_mean
    input_dim = train_data['observations'].shape[1]
    # Last axis rather than a fixed index 2, so 2-D action arrays also work.
    action_dim = train_data['actions'].shape[-1]
    # Materialize the pairs: training calls len() and random.sample() on them,
    # which a lazy Python 3 zip object does not support.
    data_combined = list(zip(train_data['observations'], train_data['actions']))
    return data_mean, train_data, input_dim, action_dim, data_combined

In [None]:
# Network definition

class Network(object):
    """Fully connected policy network with two ReLU hidden layers.

    Maps a batch of observations to a batch of actions and is trained by
    minimizing the mean squared error against expert actions.

    The class relies on names defined at module level elsewhere in this file:
    ``learning_rate`` (optimizer step size, read in ``__init__``) and
    ``path`` / ``environment`` (checkpoint directory and file stem, read in
    ``train`` and ``run``).
    """

    def __init__(self, input_dim, action_dim, hidden1_units, hidden2_units, regularization = False, beta = 0.01):
        """Build the TensorFlow graph: placeholders, layers, loss and optimizer.

        Args:
            input_dim: width of one observation vector.
            action_dim: width of one action vector.
            hidden1_units: number of units in the first hidden layer.
            hidden2_units: number of units in the second hidden layer.
            regularization: when True, add an L2 penalty on the three weight
                matrices to the loss.
            beta: weight of the L2 penalty (only used when ``regularization``).
        """

        self.input_dim = input_dim
        self.action_dim = action_dim
        self.h1_num = hidden1_units
        self.h2_num = hidden2_units
        self.input_observations = tf.placeholder(tf.float32, shape=(None, self.input_dim))
        self.action_labels = tf.placeholder(tf.float32, shape=(None, self.action_dim))

        # Weights are drawn with stddev 1/sqrt(fan_in); biases start at zero.
        self.w1 = tf.Variable(tf.truncated_normal([self.input_dim, self.h1_num],
                                                  stddev=1.0 / math.sqrt(float(self.input_dim))), name='w1')
        self.b1 = tf.Variable(tf.zeros([self.h1_num]), name='b1')
        self.h1 = tf.nn.relu(tf.matmul(self.input_observations, self.w1) + self.b1)

        self.w2 = tf.Variable(tf.truncated_normal([self.h1_num, self.h2_num],
                                                  stddev=1.0 / math.sqrt(float(self.h1_num))), name='w2')
        self.b2 = tf.Variable(tf.zeros([self.h2_num]), name='b2')
        self.h2 = tf.nn.relu(tf.matmul(self.h1, self.w2) + self.b2)

        self.w3 = tf.Variable(tf.truncated_normal([self.h2_num, self.action_dim],
                                                  stddev=1.0 / math.sqrt(float(self.h2_num))), name='w3')
        self.b3 = tf.Variable(tf.zeros([self.action_dim]), name='b3')
        # Linear output layer: actions are unbounded regression targets here.
        self.output = tf.matmul(self.h2, self.w3) + self.b3

        self.error = tf.reduce_mean(tf.pow(tf.subtract(self.output, self.action_labels), 2))
        if regularization:
            self.regularizers = tf.nn.l2_loss(self.w1) + tf.nn.l2_loss(self.w2) + tf.nn.l2_loss(self.w3)
            self.error = self.error + beta * self.regularizers

        # `learning_rate` is the module-level hyper-parameter of this file.
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.error)

    def train(self, sess, saver, train_data, training_epochs, batch_size):
        """Supervised training of the agent network, then save a checkpoint.

        Args:
            sess: TensorFlow session in which the variables are initialized.
            saver: ``tf.train.Saver`` used to write the checkpoint.
            train_data: iterable of ``(observation, action)`` pairs.
            training_epochs: number of passes to run.
            batch_size: number of pairs drawn (without replacement) per step.

        Each epoch runs ``len(train_data) // batch_size`` steps on randomly
        sampled batches; if the data set is smaller than ``batch_size`` no
        optimization step is executed.
        """

        # random.sample and len need a real sequence (a Python 3 zip is not one).
        if not isinstance(train_data, (list, tuple)):
            train_data = list(train_data)

        total_batch = len(train_data) // batch_size
        for epoch in range(training_epochs):
            avg_cost = 0.

            # Loop over all batches
            for i in range(total_batch):
                next_batch = random.sample(train_data, batch_size)
                # Unpack directly instead of indexing zip(), which is lazy on Python 3.
                batch_x, batch_y = zip(*next_batch)
                batch_x = np.asarray(batch_x)
                # Flatten any singleton axis so labels match the placeholder shape.
                batch_y = np.asarray(batch_y).reshape((batch_size, self.action_dim))

                # Run optimization op (backprop) and cost op (to get loss value)
                _, c = sess.run([self.optimizer, self.error],
                                feed_dict={self.input_observations: batch_x, self.action_labels: batch_y})

                # Compute average loss
                avg_cost += c / total_batch
                if i % 10000 == 0:
                    print("Batch number {:d}".format(i))

            # Display logs per epoch step
            print("Epoch: {:04d}, cost = {:.9f}".format(epoch+1, avg_cost))

        print("Optimization Finished!")
        saver.save(sess, path + '/' + environment + '.cptk')
        print("Model Saved")

    def run(self, sess, saver, env, num_rollouts, render = False, load_model = True, data_mean = None):
        """Run the policy in the environment and record the visited observations.

        Args:
            sess: TensorFlow session used to evaluate the policy.
            saver: ``tf.train.Saver`` used to restore the checkpoint.
            env: Gym environment; ``env.spec.timestep_limit`` caps each rollout.
            num_rollouts: number of episodes to play.
            render: when True, render the environment after every step.
            load_model: when True, restore the latest checkpoint found in the
                module-level ``path`` before acting; when False, act with the
                variables currently held by ``sess``.
            data_mean: observation mean subtracted before each policy query.
                When omitted, a module-level ``data_mean`` is used if present.

        Raises:
            IOError: ``load_model`` is True but no checkpoint exists in ``path``.
            ValueError: no observation mean is available.

        The raw (un-centered) observations of all rollouts are stored in
        ``self.agent_data`` and pickled to 'Hopper-v1_agent_data.pickle'.
        """

        if data_mean is None:
            # Backward compatibility: older callers expose the mean as a module global.
            data_mean = globals().get('data_mean')
            if data_mean is None:
                raise ValueError("data_mean is required to center observations before querying the policy")

        returns = []
        observations = []
        max_step = env.spec.timestep_limit

        if load_model:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(path)
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise IOError("No checkpoint found in {}".format(path))
            saver.restore(sess, ckpt.model_checkpoint_path)
        # With load_model=False the session is left open and its current
        # weights are used; closing it here would make every later
        # sess.run call fail.

        for i in range(num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                obs = np.asarray(obs)
                observations.append(obs)
                # Subtract into a new array: an in-place update on the reshaped
                # view would also alter the raw observation stored above.
                centered = obs.reshape((1, self.input_dim)) - data_mean
                action = sess.run(self.output, feed_dict={self.input_observations: centered})
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps >= max_step:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        self.agent_data = {'observations': np.array(observations)}
        # NOTE(review): the file name is fixed to Hopper-v1 regardless of `env`.
        with open("Hopper-v1" + '_agent_data.pickle', 'wb') as handle:
            pickle.dump(self.agent_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
            print("Agent data pickled successfully")

In [None]:
def main(num_cycles, environment):
    """Train the agent on the expert data and roll it out in the environment.

    Args:
        num_cycles: number of train/rollout cycles requested. Only the first
            cycle is executed for now because the expert-relabelling step is
            not implemented yet (see the TODO in the loop).
        environment: Gym environment object the agent acts in.

    Reads the expert data set from 'Hopper-v1_expert_data.pickle' and uses the
    module-level hyper-parameters (``hidden1_units``, ``hidden2_units``,
    ``training_epochs``, ``batch_size``, ``num_rollouts``).
    """
    # Network.run looks up `data_mean` as a module-level name, so the mean
    # computed here has to be published globally for the rollout to work.
    global data_mean

    # NOTE: pickle.load must only be used on trusted files.
    with open("Hopper-v1" + '_expert_data.pickle', 'rb') as handle:
        train_data = pickle.load(handle)
    data_mean, train_data, input_dim, action_dim, data_combined = data_preprocessing(train_data)

    with tf.Session() as sess:
        agent = Network(input_dim, action_dim, hidden1_units, hidden2_units)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()

        for cycle in range(num_cycles):
            agent.train(sess, saver, data_combined, training_epochs, batch_size)
            # num_rollouts is a required argument of Network.run.
            agent.run(sess, saver, environment, num_rollouts)
            # TODO: DAgger aggregation step is still missing - have the expert
            # label the observations gathered by the agent and add the labelled
            # samples to the training set before the next cycle. Until then
            # stop after the first cycle.
            break
        

In [None]:
# Training hyper-parameters (read as module-level names by the code above)
learning_rate = 1e-3
training_epochs = 1
batch_size = 100
display_step = 1   # not referenced by the code visible in this file
num_rollouts = 20
beta = 1e-3        # not passed to Network by main in the code visible here
path = './dagger_policy'   # checkpoint directory

# Network parameters: width of the 1st and 2nd hidden layers
hidden1_units = hidden2_units = 128

# Available environments, keyed 1..6; key 3 is the one used below
environments = dict(enumerate(
    ["Ant-v1", "HalfCheetah-v1", "Hopper-v1",
     "Humanoid-v1", "Reacher-v1", "Walker2d-v1"],
    1))
environment = environments[3]
env = gym.make(environment)

# Start from an empty default graph, then run a single cycle
tf.reset_default_graph()

main(1, env)