# Unity ML Agents
## Proximal Policy Optimization (PPO)
Contains an implementation of PPO as described [here](https://arxiv.org/abs/1707.06347).

In [1]:
import numpy as np
import os
import tensorflow as tf

from ppo.history import *
from ppo.models import *
from ppo.trainer import Trainer
from unityagents import *

### Hyperparameters

In [2]:
### General parameters
max_steps = 3e5 # Set maximum number of steps to run environment.
run_path = "ppo" # The sub-directory name for model and summary statistics
load_model = False # Whether to load a saved model.
train_model = True # Whether to train the model.
summary_freq = 10000 # Frequency at which to save training statistics.
save_freq = 50000 # Frequency at which to save model.
env_name = "mytest" # Name of the training environment file.
curriculum_file = "curricula/race.json" # Curriculum file handed to UnityEnvironment; the string "None" means no curriculum.

### Algorithm-specific parameters for tuning
gamma = 0.99 # Reward discount rate.
lambd = 0.95 # Lambda parameter for GAE.
time_horizon = 2048 # How many steps to collect per agent before adding to buffer.
beta = 1e-3 # Strength of entropy regularization
num_epoch = 5 # Number of gradient descent steps per batch of experiences.
num_layers = 2 # Number of hidden layers between state/observation encoding and value/policy layers.
epsilon = 0.2 # Acceptable threshold around ratio of old and new policy probabilities.
buffer_size = 2048 # How large the experience buffer should be before gradient descent.
learning_rate = 1e-4 # Model learning rate.
hidden_units = 64 # Number of units in hidden layer.
batch_size = 64 # How many experiences per gradient descent update step.
normalize = False # Forwarded to the model and to trainer.take_action; presumably toggles input normalization (TODO confirm).

### Logging dictionary for hyperparameters
# Each key is the label under which the value is logged, so every key is
# spelled exactly like the variable it reports.
hyperparameter_dict = {
    'max_steps': max_steps,
    'run_path': run_path,
    'env_name': env_name,
    'curriculum_file': curriculum_file,
    'gamma': gamma,
    'lambd': lambd,
    'time_horizon': time_horizon,
    'beta': beta,
    'num_epoch': num_epoch,
    'num_layers': num_layers,
    'epsilon': epsilon,
    'buffer_size': buffer_size,
    'learning_rate': learning_rate,
    'hidden_units': hidden_units,
    'batch_size': batch_size,
    'normalize': normalize,
}

### Load the environment

In [3]:
# Launch the Unity environment, passing it the configured curriculum.
env = UnityEnvironment(
    file_name=env_name,
    curriculum=curriculum_file,
)
# Show the environment's own description.
print(env)
# The first external brain is the one used in the rest of the notebook.
brain_name = env.external_brain_names[0]

INFO:unityagents:
'Academy' started successfully!


Unity Academy name: Academy
        Number of brains: 1
        Reset Parameters :
		road_width -> 4.0
Unity brain name: Brain
        Number of observations (per agent): 0
        State space type: continuous
        State space size (per agent): 7
        Action space type: continuous
        Action space size (per agent): 2
        Memory space size (per agent): 0
        Action descriptions: , 


### Train the Agent(s)

In [4]:
# Clear TensorFlow's default graph before the model is built below.
tf.reset_default_graph()

# The string "None" is the sentinel for "no curriculum"; map it to a real None.
# NOTE(review): the environment cell already received curriculum_file before
# this conversion runs -- confirm UnityEnvironment copes with the raw string.
curriculum_file = None if curriculum_file == "None" else curriculum_file


def get_progress():
    """Return the value passed as ``progress`` to ``env.reset``.

    Reads the notebook globals ``curriculum_file``, ``env``, ``steps``,
    ``max_steps`` and ``last_reward``.

    Returns:
        ``steps / max_steps`` when the curriculum's measure type is
        "progress", ``last_reward`` when it is "reward", and ``None`` when
        no curriculum is configured or the measure type is anything else.
    """
    # Without a curriculum there is nothing to measure.
    if curriculum_file is None:
        return None

    measure = env._curriculum.measure_type
    if measure == "progress":
        return steps / max_steps
    if measure == "reward":
        return last_reward
    return None

# Create the Tensorflow model graph
ppo_model = create_agent_model(env, lr=learning_rate,
                               h_size=hidden_units, epsilon=epsilon,
                               beta=beta, max_step=max_steps,
                               normalize=normalize, num_layers=num_layers)

# Properties of the selected brain that are handed to the Trainer.
is_continuous = (env.brains[brain_name].action_space_type == "continuous")
use_observations = (env.brains[brain_name].number_observations > 0)
use_states = (env.brains[brain_name].state_space_size > 0)

# Output locations for checkpoints and summary statistics of this run.
model_path = './models/{}'.format(run_path)
summary_path = './summaries/{}'.format(run_path)

if not os.path.exists(model_path):
    os.makedirs(model_path)

if not os.path.exists(summary_path):
    os.makedirs(summary_path)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# The try/finally guarantees the Unity environment is closed even when
# training raises or is interrupted; otherwise it would be left open.
try:
    with tf.Session() as sess:
        # Instantiate model parameters
        if load_model:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(model_path)
            # get_checkpoint_state returns None when the directory holds no
            # checkpoint; fail with a clear message instead of an AttributeError.
            if ckpt is None:
                raise IOError(
                    "No checkpoint found in '{}'. Set load_model = False to "
                    "train from scratch.".format(model_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(init)
        # Read the step counter and last reward stored in the graph; both are
        # read by get_progress() as notebook globals.
        steps, last_reward = sess.run([ppo_model.global_step, ppo_model.last_reward])
        summary_writer = tf.summary.FileWriter(summary_path)
        info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
        trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations, use_states, train_model)
        if train_model:
            trainer.write_text(summary_writer, 'Hyperparameters', hyperparameter_dict, steps)
        while steps <= max_steps:
            if env.global_done:
                # Reset the environment, reporting the current progress measure.
                prog = get_progress()
                info = env.reset(train_mode=train_model, progress=prog)[brain_name]
            # Decide and take an action
            new_info = trainer.take_action(info, env, brain_name, steps, normalize)
            info = new_info
            trainer.process_experiences(info, time_horizon, gamma, lambd)
            if len(trainer.training_buffer['actions']) > buffer_size and train_model:
                # Perform gradient descent with experience buffer
                trainer.update_model(batch_size, num_epoch)
            if steps % summary_freq == 0 and steps != 0 and train_model:
                # Write training statistics to tensorboard.
                trainer.write_summary(summary_writer, steps, env._curriculum.lesson_number)
            if steps % save_freq == 0 and steps != 0 and train_model:
                # Save Tensorflow model
                save_model(sess, model_path=model_path, steps=steps, saver=saver)
            # Keep the Python-side counter and the graph's step counter in sync.
            steps += 1
            sess.run(ppo_model.increment_step)
            if len(trainer.stats['cumulative_reward']) > 0:
                # Store the running mean reward in the graph and read it back
                # into last_reward, which get_progress() returns for
                # reward-measured curricula.
                mean_reward = np.mean(trainer.stats['cumulative_reward'])
                sess.run(ppo_model.update_reward, feed_dict={ppo_model.new_reward: mean_reward})
                last_reward = sess.run(ppo_model.last_reward)
        # Final save Tensorflow model
        if steps != 0 and train_model:
            save_model(sess, model_path=model_path, steps=steps, saver=saver)
finally:
    env.close()
# Only reached when training finished without an error.
export_graph(model_path, env_name)

Step: 10000. Mean Reward: -196.20000000000113. Std of Reward: 314.2999999999865.


INFO:unityagents:
Lesson changed. Now in Lesson 1 : 	road_width -> 7.5


Step: 20000. Mean Reward: 1162.3999999999892. Std of Reward: 262.39999999990425.
Step: 30000. Mean Reward: 1455.8999999998305. Std of Reward: 72.90000000004648.
Step: 40000. Mean Reward: 1347.5999999997885. Std of Reward: 45.49999999998636.


INFO:unityagents:
Lesson changed. Now in Lesson 2 : 	road_width -> 7.0


Step: 50000. Mean Reward: 943.2999999998895. Std of Reward: 451.1999999998859.
Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Saved Model
Step: 60000. Mean Reward: 1446.0499999999004. Std of Reward: 177.45000000009372.
Step: 70000. Mean Reward: 1370.149999999785. Std of Reward: 47.24999999999239.


INFO:unityagents:
Lesson changed. Now in Lesson 3 : 	road_width -> 6.5


Step: 80000. Mean Reward: 1342.5499999997876. Std of Reward: 8.94999999999709.
Step: 90000. Mean Reward: 1434.099999999985. Std of Reward: 186.80000000000882.
Step: 100000. Mean Reward: 1438.999999999777. Std of Reward: 119.59999999997444.
Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Saved Model


INFO:unityagents:
Lesson changed. Now in Lesson 4 : 	road_width -> 6.0


Step: 110000. Mean Reward: 1053.9499999999196. Std of Reward: 148.24999999990928.
Step: 120000. Mean Reward: 1824.0499999999283. Std of Reward: 32.649999999932675.
Step: 130000. Mean Reward: 1702.1999999998793. Std of Reward: 37.900000000022715.


INFO:unityagents:
Lesson changed. Now in Lesson 5 : 	road_width -> 5.5


Step: 140000. Mean Reward: 895.3499999999328. Std of Reward: 318.7499999999117.
Step: 150000. Mean Reward: 1242.2499999998458. Std of Reward: 71.44999999994741.
Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Saved Model
Step: 160000. Mean Reward: 1238.3499999998314. Std of Reward: 16.949999999987995.


INFO:unityagents:
Lesson changed. Now in Lesson 6 : 	road_width -> 5.0


Step: 170000. Mean Reward: 1683.2499999998242. Std of Reward: 357.9500000000339.
Step: 180000. Mean Reward: 763.6499999999579. Std of Reward: 1001.149999999957.
Step: 190000. Mean Reward: 576.8000000000637. Std of Reward: 197.50000000000716.


INFO:unityagents:
Lesson changed. Now in Lesson 7 : 	road_width -> 4.5


Step: 200000. Mean Reward: 1525.7499999998495. Std of Reward: 334.0499999999746.
Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Saved Model
Step: 210000. Mean Reward: 1292.5999999999583. Std of Reward: 559.2999999998863.
Step: 220000. Mean Reward: 1146.9499999999389. Std of Reward: 108.44999999986692.


INFO:unityagents:
Lesson changed. Now in Lesson 8 : 	road_width -> 4.0


Step: 230000. Mean Reward: 2323.849999999904. Std of Reward: 184.74999999999568.
Step: 240000. Mean Reward: 1197.5999999998749. Std of Reward: 25.79999999996903.
Step: 250000. Mean Reward: 846.900000000078. Std of Reward: 131.60000000000537.
Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Saved Model
Step: 260000. Mean Reward: 1015.8000000000409. Std of Reward: 17.800000000026614.
Step: 270000. Mean Reward: 1326.5999999998635. Std of Reward: 249.9999999998812.
Step: 280000. Mean Reward: 1220.0999999998503. Std of Reward: 35.99999999996203.
Step: 290000. Mean Reward: 1608.3500000000004. Std of Reward: 635.5499999999142.
Step: 300000. Mean Reward: 967.3000000000309. Std of Reward: 126.59999999995256.
Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Saved Model
Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Type is unsupported, or the types of the items don't match field type in CollectionDef.
'TextSummaryPluginAsset' object has no attribute 'name'


Saved Model
INFO:tensorflow:Restoring parameters from ./models/ppo\model-300001.cptk


INFO:tensorflow:Restoring parameters from ./models/ppo\model-300001.cptk


INFO:tensorflow:Froze 7 variables.


INFO:tensorflow:Froze 7 variables.


Converted 7 variables to const ops.
50 ops in the final graph.


### Export the trained Tensorflow graph
Once the model has been trained and saved, we can export it as a .bytes file which Unity can embed.

In [5]:
export_graph(model_path, env_name)

INFO:tensorflow:Restoring parameters from ./models/ppo\model-300001.cptk


INFO:tensorflow:Restoring parameters from ./models/ppo\model-300001.cptk


INFO:tensorflow:Froze 7 variables.


INFO:tensorflow:Froze 7 variables.


Converted 7 variables to const ops.
50 ops in the final graph.
