![SSD VGG NET](https://cdn-images-1.medium.com/max/1200/1*pPxrkm4Urz04Ez65mwWE9Q.png)

In [1]:
%matplotlib inline

import tensorflow as tf
import numpy as np
import cv2
import os

tf.__version__

'1.12.0'

In [2]:
def get_sample_img():
    imgMean = np.array([104, 117, 124], np.float)
    img = cv2.imread("TestImage.jpg")
    img = cv2.resize(img.astype(float), (300, 300)) #resize
    img -= imgMean #subtract image mean
    img = img.reshape((1, 300, 300, 3))
    return img

In [3]:
def freeze_graph(model_dir, output_node_names):
    """Extract the sub graph defined by the output nodes and convert 
    all its variables into constant 
    Args:
        model_dir: the root folder containing the checkpoint state file
        output_node_names: a string, containing all the output node's names, 
                            comma separated
    """
    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            "directory: %s" % model_dir)

    if not output_node_names:
        print("You need to supply the name of a node to --output_node_names.")
        return -1

    # We retrieve our checkpoint fullpath
    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path
    
    # We precise the file fullname of our freezed graph
    absolute_model_dir = "/".join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + "/frozen_model.pb"

    # We clear devices to allow TensorFlow to control on which device it will load operations
    clear_devices = True

    # We start a session using a temporary fresh Graph
    with tf.Session(graph=tf.Graph()) as sess:
        # We import the meta graph in the current default Graph
        saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)

        # We restore the weights
        saver.restore(sess, input_checkpoint)

        # We use a built-in TF helper to export variables to constants
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess, # The session is used to retrieve the weights
            tf.get_default_graph().as_graph_def(), # The graph_def is used to retrieve the nodes 
            output_node_names.split(",") # The output node names are used to select the usefull nodes
        ) 

        # Finally we serialize and dump the output graph to the filesystem
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph." % len(output_graph_def.node))

    return output_graph_def

## Creating the model class

In [4]:
class SSDVGG300(object):
    """SSD VGG 300 model"""
    def __init__(self, num_classes):
        dropout_keep_prob=0.5
        is_training=False
        paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]
        feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
        anchor_size_bounds=[0.15, 0.90]
        anchor_sizes=[(21., 45.),
                      (45., 99.),
                      (99., 153.),
                      (153., 207.),
                      (207., 261.),
                      (261., 315.)]
        anchor_ratios=[[2, .5],
                       [2, .5, 3, 1./3],
                       [2, .5, 3, 1./3],
                       [2, .5, 3, 1./3],
                       [2, .5],
                       [2, .5]]
        anchor_steps=[8, 16, 32, 64, 100, 300]
        anchor_offset=0.5
        normalizations=[20, -1, -1, -1, -1, -1]
        prior_scaling=[0.1, 0.1, 0.2, 0.2]
        features = []
        
        self.x = tf.placeholder(tf.float32, [None, 300, 300, 3])
        net = self.x
        
        net = tf.layers.conv2d(net, filters=64, kernel_size=(3, 3), strides=(1, 1), padding='SAME', name='conv1_1')
        #net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.nn.relu(net)

        net = tf.layers.conv2d(net, filters=64, kernel_size=(3, 3), strides=(1, 1), padding='SAME', name='conv1_2')
        #net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.nn.relu(net)

        net = tf.layers.max_pooling2d(net, pool_size=(2, 2), strides=(1, 1), padding='SAME', name='pool1')

        # conv2
        net = tf.layers.conv2d(net, filters=128, kernel_size=(3, 3), strides=(1, 1), padding='SAME', name='conv2_1')
        #net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.nn.relu(net)

        net = tf.layers.conv2d(net, filters=128, kernel_size=(3, 3), strides=(1, 1), padding='SAME', name='conv2_2')
        #net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.nn.relu(net)

        net = tf.layers.max_pooling2d(net, pool_size=(2, 2), strides=(1, 1), padding='SAME', name='pool2')

        # conv3
        net = tf.layers.conv2d(net, filters=256, kernel_size=(3, 3), strides=(1, 1), padding='SAME', name='conv3_1')
        #net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.nn.relu(net)

        net = tf.layers.conv2d(net, filters=256, kernel_size=(3, 3), strides=(1, 1), padding='SAME', name='conv3_2')
        #net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.nn.relu(net)

        net = tf.layers.conv2d(net, filters=256, kernel_size=(3, 3), strides=(1, 1), padding='SAME', name='conv3_3')
        #net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.nn.relu(net)

        net = tf.layers.max_pooling2d(net, pool_size=(2, 2), strides=(1, 1), padding='SAME', name='pool3')

        # conv4
        net = tf.layers.conv2d(net, filters=512, kernel_size=(3, 3), strides=(1, 1), padding='SAME', name='conv4_1')
        #net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.nn.relu(net)

        net = tf.layers.conv2d(net, filters=512, kernel_size=(3, 3), strides=(1, 1), padding='SAME', name='conv4_2')
        #net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.nn.relu(net)

        net = tf.layers.conv2d(net, filters=512, kernel_size=(3, 3), strides=(1, 1), padding='SAME', name='conv4_3')
        #net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.nn.relu(net)
        features.append(net)
        
        net = tf.layers.max_pooling2d(net, pool_size=(2, 2), strides=(1, 1), padding='SAME', name='pool4')

        # conv5
        net = tf.layers.conv2d(net, filters=512, kernel_size=(3, 3), strides=(1, 1), padding='SAME', name='conv5_1')
        #net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.nn.relu(net)

        net = tf.layers.conv2d(net, filters=512, kernel_size=(3, 3), strides=(1, 1), padding='SAME', name='conv5_2')
        #net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.nn.relu(net)

        net = tf.layers.conv2d(net, filters=512, kernel_size=(3, 3), strides=(1, 1), padding='SAME', name='conv5_3')
        #net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.nn.relu(net)
        
        net = tf.layers.max_pooling2d(net, pool_size=(3, 3), strides=1, name='pool5')
        
        # Additional SSD blocks.
        # Block 6: let's dilate the hell out of it!
        #net = tf.layers.conv2d(net, filters=1024, kernel_size=(3, 3), dilation_rate=6, name='conv6')
        net = tf.layers.conv2d(net, filters=1024, kernel_size=(3, 3), name='conv6')
        net = tf.layers.dropout(inputs=net, rate=dropout_keep_prob, training=is_training)
        
        # Block 7: 1x1 conv
        net = tf.layers.conv2d(net, filters=1024, kernel_size=(1, 1), name='conv7')
        features.append(net)
        net = tf.layers.dropout(inputs=net, rate=dropout_keep_prob, training=is_training)
        
        # Block 8/9/10/11: 1x1 and 3x3 convolutions stride 2 (except lasts).
        net = tf.layers.conv2d(net, filters=256, kernel_size=(1, 1), name='conv1x1_1')
        net = tf.pad(net, paddings, "CONSTANT")       
        net = tf.layers.conv2d(net, filters=512, kernel_size=(3, 3), strides=2, padding='VALID', name='conv3x3_1')
        features.append(net)
        
        net = tf.layers.conv2d(net, filters=128, kernel_size=(1, 1), name='conv1x1_2')
        net = tf.pad(net, paddings, "CONSTANT")
        net = tf.layers.conv2d(net, filters=256, kernel_size=(3, 3), strides=2, padding='VALID', name='conv3x3_2')
        features.append(net)
        
        
        net = tf.layers.conv2d(net, filters=128, kernel_size=(1, 1), name='conv1x1_3') 
        net = tf.layers.conv2d(net, filters=256, kernel_size=(3, 3), padding='VALID', name='conv3x3_3')
        features.append(net)
        
        net = tf.layers.conv2d(net, filters=128, kernel_size=(1, 1), name='conv1x1_4') 
        net = tf.layers.conv2d(net, filters=256, kernel_size=(3, 3), padding='VALID', name='conv3x3_4')
        features.append(net)
        
        self.predictions, self.logits, self.localisations = [],[],[]
        for i, layer in enumerate(features):
            num_anchors = len(anchor_sizes[i]) + len(anchor_ratios[i])

            # Location.
            num_loc_pred = num_anchors * 4
            loc_pred = tf.layers.conv2d(net, filters=num_loc_pred, kernel_size=(3, 3), name='conv_loc_%d' % i) 
            loc_new_shape = loc_pred.get_shape().as_list()[:-1]+[num_anchors, 4]
            loc_new_shape[0] = -1
            loc_pred = tf.reshape(loc_pred, loc_new_shape, name='location_%d' %i )
            
            # Class prediction.
            num_cls_pred = num_anchors * num_classes
            cls_pred = tf.layers.conv2d(net, filters=num_cls_pred, kernel_size=(3, 3), name='conv_cls_%d' % i)
            cls_new_shape = cls_pred.get_shape().as_list()[:-1]+[num_anchors, num_classes]
            cls_new_shape[0] = -1
            cls_pred = tf.reshape(cls_pred, cls_new_shape, name='conf_%d' %i)
        
            self.predictions.append(tf.nn.softmax(cls_pred, name='class_%d' % i))
            self.logits.append(cls_pred)
            self.localisations.append(loc_pred)

In [5]:
net = SSDVGG300(17)

In [6]:
outputs = {}
names = []
for i in range(len(net.predictions)):
    outputs[net.predictions[i].name] = net.predictions[i]
    outputs[net.logits[i].name] = net.logits[i]
    outputs[net.localisations[i].name] = net.localisations[i]
    names.append(net.predictions[i].name[:-2])
    names.append(net.logits[i].name[:-2])
    names.append(net.localisations[i].name[:-2])

## Saving/exporting the model

In [6]:
!rm -rf model/
exporter = tf.saved_model.builder.SavedModelBuilder('model')
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
 
    exporter.add_meta_graph_and_variables(
        sess, 
        tags=[tf.saved_model.tag_constants.SERVING], 
        signature_def_map={
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            tf.saved_model.signature_def_utils.predict_signature_def(
                inputs={"inputs": net.x}, 
                outputs=outputs
            )
        },
        strip_default_attrs=True)
    #exporter.save()
    saver.save(sess, '/home/ec2-user/SageMaker/GTC2019/model/model')

INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.


In [7]:
_ = freeze_graph('/home/ec2-user/SageMaker/GTC2019/model', ','.join(names))

INFO:tensorflow:Restoring parameters from /home/ec2-user/SageMaker/GTC2019/model/model
INFO:tensorflow:Froze 70 variables.
INFO:tensorflow:Converted 70 variables to const ops.
265 ops in the final graph.


In [8]:
!rm -f model.tar.gz && cd model && tar -czvf ../model.tar.gz frozen_model.pb

frozen_model.pb


In [17]:
import tensorflow as tf

img = get_sample_img()

tf.reset_default_graph()
with tf.gfile.GFile('model/frozen_model.pb', "rb") as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

with tf.Graph().as_default() as graph:
    # The name var will prefix every op/nodes in your graph
    # Since we load everything in a new graph, this is not needed
    tf.import_graph_def(graph_def, name="prefix")
    
    graph=tf.get_default_graph()

In [20]:
%%time
with tf.Session(graph=graph) as sess:
    x = graph.get_tensor_by_name("prefix/Placeholder:0")
    y = [graph.get_tensor_by_name("prefix/%s:0" % n) for n in names]
    out = sess.run(y, feed_dict = {x: img})


CPU times: user 1.95 s, sys: 683 ms, total: 2.64 s
Wall time: 2.38 s


# Compiling with NEO

In [9]:
import time
import sagemaker
import os
import json
import boto3

# Retrieve the default bucket
sagemaker_session = sagemaker.Session()
default_bucket = sagemaker_session.default_bucket()

INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-715445047862


In [13]:
role='arn:aws:iam::715445047862:role/MachineLearningHost'
job_prefix='SSDVGG300'
path='neo/%s' % job_prefix

sm = boto3.client('sagemaker')
!aws s3 cp model.tar.gz s3://$default_bucket/$path/model.tar.gz

upload: ./model.tar.gz to s3://sagemaker-us-east-1-715445047862/neo/SSDVGG300/model.tar.gz


In [15]:
job_name="%s-%d" % (job_prefix, int(time.time()))
sm.create_compilation_job(
    CompilationJobName=job_name,
    RoleArn=role,
    InputConfig={
        'S3Uri': "s3://%s/%s/model.tar.gz" % (default_bucket, path),
        'DataInputConfig': '{"data":[1,300,300,3]}',
        'Framework': 'TENSORFLOW'
    },
    OutputConfig={
        'S3OutputLocation': "s3://%s/%s/" % (default_bucket, path),
        'TargetDevice': 'ml_c5' #'ml_m4'|'ml_m5'|'ml_c4'|'ml_c5'|'ml_p2'|'ml_p3'|'jetson_tx1'|'jetson_tx2'|'rasp3b'|'deeplens'
    },
    StoppingCondition={
        'MaxRuntimeInSeconds': 300
    }
)

{u'CompilationJobArn': u'arn:aws:sagemaker:us-east-1:715445047862:compilation-job/SSDVGG300-1552519318',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '101',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Wed, 13 Mar 2019 23:21:58 GMT',
   'x-amzn-requestid': '92ae9cc1-4a0c-4857-a860-d43095ba9368'},
  'HTTPStatusCode': 200,
  'RequestId': '92ae9cc1-4a0c-4857-a860-d43095ba9368',
  'RetryAttempts': 0}}

# NEO

### Install DLR if needed

In [None]:
%%bash 

git clone --recursive https://github.com/neo-ai/neo-ai-dlr
cd neo-ai-dlr 
mkdir -p build && cd build && cmake3 .. -DUSE_CUDA=ON -DUSE_CUDNN=ON && make -j
cd ../python && python3 setup.py install --user && python setup.py install --user

## Download the compiled model and run it

In [7]:
%%bash

aws s3 cp s3://sagemaker-us-east-1-715445047862/neo/SSDVGG300/model-ml_c5.tar.gz .
rm -rf neo_test && mkdir neo_test
tar -xzvf model-ml_c5.tar.gz -C neo_test
rm -f model-ml_c5.tar.gz

Completed 256.0 KiB/80.2 MiB (1.5 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/80.2 MiB (3.0 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/80.2 MiB (4.4 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/80.2 MiB (5.6 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/80.2 MiB (6.7 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/80.2 MiB (7.9 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/80.2 MiB (9.2 MiB/s) with 1 file(s) remaining  Completed 2.0 MiB/80.2 MiB (10.4 MiB/s) with 1 file(s) remaining Completed 2.2 MiB/80.2 MiB (11.7 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/80.2 MiB (12.8 MiB/s) with 1 file(s) remaining Completed 2.8 MiB/80.2 MiB (13.9 MiB/s) with 1 file(s) remaining Completed 3.0 MiB/80.2 MiB (15.1 MiB/s) with 1 file(s) remaining Completed 3.2 MiB/80.2 MiB (16.0 MiB/s) with 1 file(s) remaining Completed 3.5 MiB/80.2 MiB (17.3 MiB/s) with 1 file(s) remaining Completed 3.8 MiB/80.2 MiB (18.4 MiB/s) with 1 file(s) remaining Completed 

## Finally, load the compiled model using DLR and do the prediction

In [8]:
from dlr import DLRModel

img = get_sample_img()

device = 'cpu'                           # Go, Raspberry Pi, go!
model = DLRModel('neo_test', dev_type=device)
print(model.get_input_names())

[u'Placeholder']


In [None]:
%%time

input_data = {'Placeholder': img}
out = model.run(input_data)