In [1]:
% matplotlib inline

import keras
import pickle
import tensorflow as tf
import numpy as np
import gym
import seaborn as sns
import matplotlib.pyplot as plt

import load_policy

Using TensorFlow backend.


In [2]:
# Create an interactive TensorFlow session and hand it to Keras so that the
# Keras backend uses this same session.
session = tf.InteractiveSession()
keras.backend.set_session(session)

In [3]:
def run(envname, policy, num_rollouts=20, render=False, max_timesteps=None, debug=False):
    """Roll out `policy` in a Gym environment and record what it visited.

    Args:
        envname: Environment id passed to `gym.make`.
        policy: Callable that receives one observation with a leading batch
            axis (`obs[None, :]`) and returns the action as an array.
        num_rollouts: Number of episodes to simulate.
        render: If true, render the environment after every step.
        max_timesteps: Per-episode step cap. When falsy (None or 0) the
            environment's `spec.timestep_limit` is used instead.
        debug: If true, print the rollout index and progress every 100 steps.

    Returns:
        `(observations, actions)` as NumPy arrays with one row per simulated
        step across all rollouts; each action is flattened before storing.

    Side effects:
        Prints the mean and standard deviation of the episode returns.
    """
    # Deliberately no tf.global_variables_initializer() here: building that op
    # without running it in a session has no effect, and it would only add a
    # new node to the graph on every call.
    env = gym.make(envname)
    try:
        max_steps = max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []

        for i in range(num_rollouts):
            if debug:
                print('iter', i)

            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0

            while not done:
                action = policy(obs[None, :])
                # Store the observation the action was computed from, so the
                # two lists stay aligned row by row.
                observations.append(obs)
                actions.append(action.flatten())
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1

                if render:
                    env.render()
                if debug and steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break

            returns.append(totalr)
    finally:
        # Release the simulator (and any render window) even if a rollout fails.
        env.close()

    print('mean return', np.mean(returns))
    print('std of return', np.std(returns))

    return np.array(observations), np.array(actions)
    
def run_expert(envname, **args):
    """Load the pickled expert policy for `envname` and roll it out.

    The expert is read from `./experts/<envname>.pkl`; every extra keyword
    argument is forwarded unchanged to `run`, whose result is returned.
    """
    expert_path = './experts/{}.pkl'.format(envname)
    expert = load_policy.load_policy(expert_path)
    return run(envname, expert, **args)

In [4]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization

def create_model(num_inputs, num_outputs):
    """Build and compile the fully connected network used as the learned policy.

    Layer stack: Dense(128, relu) -> BatchNormalization -> Dense(256, relu)
    -> BatchNormalization -> Dense(num_outputs) with no activation. The model
    is compiled with mean-squared-error loss and the Adam optimizer.

    Args:
        num_inputs: Size of the input vector.
        num_outputs: Size of the output vector.

    Returns:
        The compiled Keras `Sequential` model.
    """
    layer_stack = [
        Dense(128, input_shape=(num_inputs,), activation='relu'),
        BatchNormalization(),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dense(num_outputs),
    ]
    network = Sequential(layer_stack)
    network.compile(loss='mean_squared_error', optimizer='Adam')
    return network

In [7]:
def simple_imitation_learning(envname, num_simulation_rollouts=50, plot_loss=False):
    """Behavioral cloning: fit a network to expert rollouts, then evaluate it.

    Steps:
        1. Roll out the expert for `num_simulation_rollouts` episodes.
        2. Fit a fresh model from `create_model` on the recorded
           (observation, action) pairs for 10 epochs with batch size 128.
        3. Run the fitted model for 10 rendered rollouts.
        4. If `plot_loss` is true, plot the per-epoch training loss.

    Returns:
        None.
    """
    print('Running expert simulation')
    expert_obs, expert_acts = run_expert(envname, num_rollouts=num_simulation_rollouts)

    print('Building model')
    n_inputs, n_outputs = expert_obs.shape[1], expert_acts.shape[1]
    cloned = create_model(n_inputs, n_outputs)
    fit_log = cloned.fit(expert_obs, expert_acts, batch_size=128, epochs=10, verbose=0)

    print('Running the policy')
    run(envname, cloned.predict, render=True, num_rollouts=10)

    if not plot_loss:
        return
    plt.plot(fit_log.history['loss'])

In [8]:
# Behavioral cloning on Hopper-v1: collect expert rollouts, fit the network, then run it.
simple_imitation_learning('Hopper-v1')

Running expert simulation
obs (1, 11) (1, 11)
mean return 3777.98512749
std of return 3.72076657895
Building model
Running the policy
mean return 1346.4365135
std of return 503.577265202


In [9]:
# Same behavioral-cloning procedure on Humanoid-v1.
simple_imitation_learning('Humanoid-v1')

Running expert simulation
obs (1, 376) (1, 376)
mean return 10399.9780269
std of return 55.0311672545
Building model
Running the policy
mean return 935.561437185
std of return 725.377516977


In [44]:
def dagger(envname, num_iter=10):
    """Train a policy with DAgger (dataset aggregation) and return it.

    Procedure:
        1. Load the expert once and record 20 expert rollouts as the seed
           dataset.
        2. For each of `num_iter` iterations: fit a fresh model on the
           aggregated dataset, roll that model out for 50 episodes, relabel
           the observations it visited with the expert's actions, and append
           them to the dataset. Iteration `i` (0-based) trains for
           `num_iter - i` epochs, so later iterations train for fewer epochs.
        3. Fit a final fresh model on the full dataset for 10 epochs with a
           20% validation split and run it for 30 rendered rollouts.

    Args:
        envname: Gym environment id; the expert is read from
            `./experts/<envname>.pkl`.
        num_iter: Number of aggregation iterations.

    Returns:
        The final fitted Keras model.
    """
    print('Running expert simulation')
    # Load the expert a single time and reuse it both for the seed rollouts
    # and for relabelling, instead of reading the pickle twice.
    expert_policy = load_policy.load_policy('./experts/{}.pkl'.format(envname))
    observations, actions = run(envname, expert_policy, num_rollouts=20)

    for i in range(num_iter):
        print('Iteration #', i + 1)
        # Fit a fresh model on everything aggregated so far.
        model = create_model(observations.shape[1], actions.shape[1])
        model.fit(observations, actions, batch_size=128, epochs=num_iter - i, verbose=0)

        # Visit states under the learner's own policy; its actions are discarded.
        new_observations, _ = run(envname, model.predict, num_rollouts=50)

        # Ask the expert what it would have done in each of those states.
        new_actions = np.array([expert_policy(obs[None, :]).flatten() for obs in new_observations])

        observations = np.concatenate((observations, new_observations))
        actions = np.concatenate((actions, new_actions))

    # Final model trained on the fully aggregated dataset, then evaluated.
    model = create_model(observations.shape[1], actions.shape[1])
    model.fit(observations, actions, batch_size=128, epochs=10, validation_split=0.2)
    run(envname, model.predict, render=True, num_rollouts=30)

    return model

In [48]:
# Train a DAgger policy on Humanoid-v1 (default 10 iterations) and keep the final model.
model = dagger('Humanoid-v1')

Running expert simulation
mean return 10386.7186736
std of return 109.253803277
Iteration # 1
mean return 454.091469673
std of return 97.281529556
Iteration # 2
mean return 632.852397323
std of return 362.913139849
Iteration # 3
mean return 575.401110447
std of return 243.882086636
Iteration # 4
mean return 1807.5480905
std of return 1029.39536148
Iteration # 5
mean return 1299.58968908
std of return 580.708028161
Iteration # 6
mean return 1295.13438941
std of return 595.231154411
Iteration # 7
mean return 1779.62857762
std of return 719.328774714
Iteration # 8
mean return 1847.4165782
std of return 1436.05799611
Iteration # 9
mean return 1132.35650505
std of return 591.357102148
Iteration # 10
mean return 661.784774391
std of return 207.905808332
Train on 78244 samples, validate on 19561 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
mean return 9390.28353699
std of return 2148.16993137
