##### Copyright 2019 The TensorFlow Authors.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Train and serve a TensorFlow model with TensorFlow Serving

> Warning: **This notebook is designed to be run in a Google Colab only**.  It installs packages on the system and requires root access.  If you want to run it in a local Jupyter notebook, please proceed with caution.

Note: You can run this example right now in a Jupyter-style notebook, no setup required!  Just click "Run in Google Colab"

<div class="devsite-table-wrapper"><table class="tfo-notebook-buttons" align="left">
<tr><td><a target="_blank" href="https://www.tensorflow.org/tfx/tutorials/serving/rest_simple">
<img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a></td>
<td><a target="_blank" href="https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/serving/rest_simple.ipynb">
<img src="https://www.tensorflow.org/images/colab_logo_32px.png">Run in Google Colab</a></td>
<td><a target="_blank" href="https://github.com/tensorflow/tfx/blob/master/docs/tutorials/serving/rest_simple.ipynb">
<img width=32px src="https://www.tensorflow.org/images/GitHub-Mark-32px.png">View source on GitHub</a></td>
</tr></table></div>

This guide trains a neural network model to classify [images of clothing, like sneakers and shirts](https://github.com/zalandoresearch/fashion-mnist), saves the trained model, and then serves it with [TensorFlow Serving](https://www.tensorflow.org/serving/).  The focus is on TensorFlow Serving, rather than the modeling and training in TensorFlow, so for a complete example which focuses on the modeling and training see the [Basic Classification example](https://github.com/tensorflow/docs/blob/master/site/en/r1/tutorials/keras/basic_classification.ipynb).

This guide uses [tf.keras](https://github.com/tensorflow/docs/blob/master/site/en/r1/guide/keras.ipynb), a high-level API to build and train models in TensorFlow.

In [None]:
# Standard-library helpers
import logging
import os
import subprocess

# Quiet the notebook output:
#  * logging.disable(logging.ERROR) suppresses Python `logging` records at
#    ERROR severity and below.
#  * TF_CPP_MIN_LOG_LEVEL="3" asks TensorFlow's native runtime to filter out
#    its INFO, WARNING and ERROR messages.  It is set *before* TensorFlow is
#    imported so that it is already in the environment when the native
#    library is loaded; setting it afterwards may come too late.
logging.disable(logging.ERROR)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# Third-party helper libraries
import numpy as np
import matplotlib.pyplot as plt

# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

print(tf.__version__)

In [None]:
# Enable memory growth on every GPU that TensorFlow can see.
# When no GPU is present the list is empty and nothing happens.
visible_gpus = tf.config.list_physical_devices('GPU')
for device in visible_gpus:
    tf.config.experimental.set_memory_growth(device, True)

## Create your model

### Import the Fashion MNIST dataset

This guide uses the [Fashion MNIST](https://github.com/zalandoresearch/fashion-mnist) dataset which contains 70,000 grayscale images in 10 categories. The images show individual articles of clothing at low resolution (28 by 28 pixels), as seen here:

<table>
  <tr><td>
    <img src="https://tensorflow.org/images/fashion-mnist-sprite.png"
         alt="Fashion MNIST sprite"  width="600">
  </td></tr>
  <tr><td align="center">
    <b>Figure 1.</b> <a href="https://github.com/zalandoresearch/fashion-mnist">Fashion-MNIST samples</a> (by Zalando, MIT License).<br/>&nbsp;
  </td></tr>
</table>

Fashion MNIST is intended as a drop-in replacement for the classic [MNIST](http://yann.lecun.com/exdb/mnist/) dataset—often used as the "Hello, World" of machine learning programs for computer vision. You can access the Fashion MNIST directly from TensorFlow, just import and load the data.

Note: Although these are really images, they are loaded as NumPy arrays and not binary image objects.

In [None]:
# Load the Fashion MNIST train/test splits as NumPy arrays.
fashion_mnist = keras.datasets.fashion_mnist
train_split, test_split = fashion_mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split

# Scale pixel values to the range 0.0 - 1.0, then add a trailing channel axis
# so that each image has the (28, 28, 1) shape the model is fed with.
train_images = (train_images / 255.0).reshape(train_images.shape[0], 28, 28, 1)
test_images = (test_images / 255.0).reshape(test_images.shape[0], 28, 28, 1)

# Human-readable name for each of the ten integer labels (index == label).
class_names = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]

print('\ntrain_images.shape: {}, of {}'.format(train_images.shape, train_images.dtype))
print('test_images.shape: {}, of {}'.format(test_images.shape, test_images.dtype))

### Train and evaluate your model

Let's use the simplest possible CNN, since we're not focused on the modeling part.

In [None]:
# Run settings.
# NOTE(review): `testing` is not read anywhere in the visible notebook; it is
# kept in case another cell relies on it.
testing = False
epochs = 5

# A deliberately small CNN: one strided convolution, flattened, then a
# 10-way softmax (one output per clothing class).
conv_layer = keras.layers.Conv2D(
    input_shape=(28, 28, 1),
    filters=8,
    kernel_size=3,
    strides=2,
    activation='relu',
    name='Conv1',
)
flatten_layer = keras.layers.Flatten()
softmax_layer = keras.layers.Dense(10, activation=tf.nn.softmax, name='Softmax')

model = keras.Sequential([conv_layer, flatten_layer, softmax_layer])
model.summary()

# Labels are plain integers, hence the *sparse* categorical cross-entropy.
model.compile(
    optimizer=tf.optimizers.Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(train_images, train_labels, epochs=epochs)

# Report accuracy on the held-out test split.
test_loss, test_acc = model.evaluate(test_images, test_labels)
print('\nTest accuracy: {}'.format(test_acc))

## Save your model

To load our trained model into TensorFlow Serving we first need to save it in [SavedModel](https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/saved_model) format.  This will create a protobuf file in a well-defined directory hierarchy, and will include a version number.  [TensorFlow Serving](https://www.tensorflow.org/tfx/guide/serving) allows us to select which version of a model, or "servable" we want to use when we make inference requests.  Each version will be exported to a different sub-directory under the given path.

In [None]:
# Fetch the Keras session and save the model
# The signature definition is defined by the input and output tensors,
# and stored with the default serving key
import tempfile
import shutil

MODEL_DIR = tempfile.gettempdir()
MODEL_NAME = "my_mnist_model"
VERSION = "0002"
export_path = os.path.join(MODEL_DIR, MODEL_NAME, VERSION)
print('export_path = {}\n'.format(export_path))

if os.path.isdir(export_path):
  print('\nAlready saved a model, cleaning up\n')
  shutil.rmtree(export_path, ignore_errors=True)

model.save(export_path)

print('\nSaved model:')
!ls -l {export_path}

## Examine your saved model

We'll use the command line utility `saved_model_cli` to look at the [MetaGraphDefs](https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/MetaGraphDef) (the models) and [SignatureDefs](../signature_defs) (the methods you can call) in our SavedModel.  See [this discussion of the SavedModel CLI](https://github.com/tensorflow/docs/blob/master/site/en/r1/guide/saved_model.md#cli-to-inspect-and-execute-savedmodel) in the TensorFlow Guide.

In [None]:
#!saved_model_cli show --dir {export_path} --all

That tells us a lot about our model!  In this case we just trained our model, so we already know the inputs and outputs, but if we didn't this would be important information.  It doesn't tell us everything, like the fact that this is grayscale image data for example, but it's a great start.

## Serve your model with TensorFlow Serving



### Run a TensorFlow Serving container


Run the following commands in a terminal, where `my_model_path` is the directory that contains the model folder (`MODEL_DIR` above) and `my_model_name` is the name of the model (`MODEL_NAME` above). Port 8500 is used for the gRPC API and port 8501 for the REST API.

```BASH
export MODEL_DIR=my_model_path
export MODEL_NAME=my_model_name


docker run -d --rm -p 8500:8500 -p 8501:8501 -v $MODEL_DIR/$MODEL_NAME:/models/$MODEL_NAME   \
   -e MODEL_NAME=$MODEL_NAME tensorflow/serving


```
Don't use the `%%bash` magic cell!

## Make a request to your model in TensorFlow Serving

First, let's take a look at a random example from our test data.

In [None]:
def show(idx, title):
  """Plot one test image in a new figure with a title above it.

  Args:
    idx: Index into `test_images` of the image to draw.
    title: Text placed above the image (two blank lines are prepended).
  """
  heading = '\n\n{}'.format(title)
  plt.figure()
  image_2d = test_images[idx].reshape(28, 28)  # drop the channel axis for imshow
  plt.imshow(image_2d)
  plt.axis('off')
  plt.title(heading, fontdict={'size': 16})

import random

# Pick one test example uniformly at random and display it together with the
# name of its true class.
rando = random.randrange(len(test_images))
true_class_name = class_names[test_labels[rando]]
show(rando, 'An Example Image: {}'.format(true_class_name))

### Using the REST (Representational state transfer) protocol for requests

#### Newest version of the servable

We'll send a predict request as a POST to our server's REST endpoint, and pass it three examples.  We'll ask our server to give us the latest version of our servable by not specifying a particular version.

In [None]:
#!pip install -q requests
import requests

idx = 17 # the index of our first picture, we'll send this and the next two to the server for prediction

# Predict request for the latest version of the servable (no version in the URL).
headers = {"content-type": "application/json"}
url = f'http://localhost:8501/v1/models/{MODEL_NAME}:predict'
data = {"signature_name": "serving_default", "instances": test_images[idx:idx+3].tolist()}

response = requests.post(url=url, json=data, headers=headers)  # a requests.models.Response object
# Surface HTTP errors (e.g. unknown model name) directly, instead of failing
# later with a KeyError on 'predictions'.
response.raise_for_status()
predictions = response.json()['predictions']    # response.json() is equivalent to json.loads(response.text)

# Compare the prediction for the first image with its true label.
# The predicted class comes from the arg-max of the returned probabilities;
# the actual class comes from `test_labels`.
predicted_class = int(np.argmax(predictions[0]))
actual_class = test_labels[idx]
show(idx, 'The model thought this was a {} (class {}), and it was actually a {} (class {})'.format(
  class_names[predicted_class], predicted_class, class_names[actual_class], actual_class))

#### A particular version of the servable

Now let's specify a particular version of our servable.  Since we only have one, let's select the version we exported above (`VERSION`).  We'll also look at all three results. We will also make use of the "json" library as an alternative to the (simpler) method above.

In [None]:
import json

# Same request as before, but addressed to one explicit version of the servable.
url = f'http://localhost:8501/v1/models/{MODEL_NAME}/versions/{VERSION}:predict'
json_data = json.dumps(data)

response = requests.post(url, data=json_data, headers=headers)
# Surface HTTP errors (e.g. unknown version) directly, instead of failing
# later with a KeyError on 'predictions'.
response.raise_for_status()
predictions = json.loads(response.text)['predictions']

# Show all three images, each titled with the model's prediction (arg-max of
# the returned probabilities) next to the true label from `test_labels`.
for i in range(3):
  predicted_class = int(np.argmax(predictions[i]))
  actual_class = test_labels[idx+i]
  show(idx+i, 'The model thought this was a {} (class {}), and it was actually a {} (class {})'.format(
    class_names[predicted_class], predicted_class, class_names[actual_class], actual_class))

## Using the gRPC (google Remote Procedure Calls) protocol for requests

In [None]:
# Installing the Python tensorflow-serving-api if necessary
#! pip install tensorflow-serving-api

We'll make extensive use of protocol buffers, which are represented by their access classes. The most important for us are the PredictRequest and the PredictResponse protos, but there are also Classification(Request/Response) and Regression(Request/Response) protos. Have a look at https://github.com/tensorflow/serving/tree/master/tensorflow_serving/apis. We'll also need the Tensor proto whose description can be found here https://github.com/tensorflow/tensorflow/blob/v2.2.0/tensorflow/core/framework/tensor.proto.


```Python

# PredictRequest specifies which TensorFlow model to run, as well as
# how inputs are mapped to tensors and how outputs are filtered before
# returning to user.
message PredictRequest {
  
    # Model Specification. If version is not specified, will use the latest
    # (numerical) version.
    ModelSpec model_spec = 1;

    # Input tensors.
    # Names of input tensor are alias names. The mapping from aliases to real
    # input tensor names is stored in the SavedModel export as a prediction
    # SignatureDef under the 'inputs' field.
    map<string, TensorProto> inputs = 2;

    # Output filter.
    # Names specified are alias names. The mapping from aliases to real output
    # tensor names is stored in the SavedModel export as a prediction
    # SignatureDef under the 'outputs' field.
    # Only tensors specified here will be run/fetched and returned, with the
    # exception that when none is specified, all tensors specified in the
    # named signature will be run/fetched and returned.
    repeated string output_filter = 3;
}

# Response for PredictRequest on successful run.
message PredictResponse {
  
    # Effective Model Specification used to process PredictRequest.
    ModelSpec model_spec = 2;

    # Output tensors.
    map<string, TensorProto> outputs = 1;
}


```



Here is the definition of the ModelSpec protocol buffer.

```Python
# Metadata for an inference request such as the model name and version.
message ModelSpec {
  
    # Required servable name.
    string name = 1;

    # Optional choice of which version of the model to use.
    # Recommended to be left unset in the common case. Should be specified only
    # when there is a strong version consistency requirement.
    # When left unspecified, the system will serve the best available version.
    # This is typically the latest version, though during version transitions,
    # notably when serving on a fleet of instances, may be either the previous or
    # new version.
    oneof version_choice {
     
        # Use this specific version number.
        google.protobuf.Int64Value version = 2;

        # Use the version associated with the given label.
        string version_label = 4;
    }

    # A named signature to evaluate. If unspecified, the default signature
    # "serving_default" will be used.
    string signature_name = 3;
}
```

In [None]:
import grpc
from tensorflow_serving.apis.predict_pb2 import PredictRequest
from tensorflow_serving.apis.prediction_service_pb2_grpc import PredictionServiceStub

# ---------------------------------------------------------------------------
# 1. Build the PredictRequest
# ---------------------------------------------------------------------------
request = PredictRequest()

# model_spec: which servable and which signature to call
request.model_spec.name = MODEL_NAME
request.model_spec.signature_name = "serving_default"

# inputs: a map from input name to Tensor proto.  The slice [0:1] keeps the
# batch dimension, and the data is cast to float32 before being converted.
# tf.make_tensor_proto is given a NumPy array here, not a tf.Tensor.
input_key = model.input_names[0]
first_image_batch = test_images[0:1].astype(np.float32)
request.inputs[input_key].CopyFrom(tf.make_tensor_proto(first_image_batch))

print("The PredictRequest protocol buffer:\n")
print(request)

# ---------------------------------------------------------------------------
# 2. Send it over gRPC
# ---------------------------------------------------------------------------
# Unencrypted channel to the serving container's gRPC port
channel = grpc.insecure_channel('localhost:8500')

# Client stub exposing the prediction service's methods
predict_service = PredictionServiceStub(channel)

# Call Predict with a 10 second timeout; the result is a PredictResponse
response = predict_service.Predict(request, timeout=10.0)

print("The PredictResponse protocol buffer:\n")
print(response)

# ---------------------------------------------------------------------------
# 3. Decode the response
# ---------------------------------------------------------------------------
# Tensor proto of the model's first output
output_key = model.output_names[0]
output_tensor_proto = response.outputs[output_key]

# Convert the Tensor proto into a NumPy ndarray
y_proba = tf.make_ndarray(output_tensor_proto)

# Display the result, rounded to two decimals, as the cell output
print("The predicted probabilities of the classes:")
y_proba.round(2)