# Hello Image Data

This tutorial demonstrates how to train an image classifier using TensorFlow and the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html).

You should be familiar with TensorFlow before starting this tutorial. If you need a refresher, read TensorFlow's [Convolutional Neural Network](https://www.tensorflow.org/tutorials/images/cnn) tutorial.

## Before you begin

* Install the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html). You'll need Ray 1.13 later to run this example.

```
pip instsall 'ray[data,tune]'
```

* Install `tensorflow` and `tensorflow-datasets`

```
pip install tensorflow tensorflow-datasets
```


# Load and normalize CIFAR-10

In [150]:
import ray
from ray.data.datasource import SimpleTensorFlowDatasource
import tensorflow as tf

from tensorflow.keras import layers, models
import tensorflow_datasets as tfds


def train_dataset_factory():
    return tfds.load("cifar10", split=["train"], as_supervised=True)[0]

def test_dataset_factory():
    return tfds.load("cifar10", split=["test"], as_supervised=True)[0]

train_dataset = ray.data.read_datasource(  
    SimpleTensorFlowDatasource(), dataset_factory=train_dataset_factory
)
test_dataset = ray.data.read_datasource(SimpleTensorFlowDatasource(), dataset_factory=test_dataset_factory)




In [158]:
def normalize_images(batch):
    return [(tf.cast(image, tf.float32) / 255.0, label) for image, label in batch]

# train_dataset = train_dataset.map_batches(normalize_images)
# test_dataset = test_dataset.map_batches(normalize_images)

from timeit import default_timer as timer

def f(dataset):
    print("Starting")
    start = timer()
    dataset.map_batches(normalize_images)
    print(timer() - start)

f(train_dataset)  # 1.8523165209999206 / 15.950762251000015

Starting


Read->Map_Batches: 100%|██████████| 1/1 [00:15<00:00, 15.94s/it]

15.950762251000015





In [160]:
asdf

8.611250868933269

In [51]:
import pandas as pd
from ray.data.extensions import TensorArray


```
train_dataset = torchvision.datasets.cifar10(download=True, transform=transform)
train_dataset = ray.data.read_torch(train_dataset, column_names=["image", "label"])
train_dataset

Dataset(num_blocks=1000, num_rows=10000, schema={image: TensorDtype, label: int64})
````

```
def train_dataset_factory():
    return tfds.load("cifar10", split=["train"], as_supervised=True)[0]

train_dataset = ray.data.read_datasource(  
    SimpleTensorFlowDatasource(), dataset_factory=train_dataset_factory
)

def convert_batch_to_pandas(batch):
    images = TensorArray([image.numpy() for image, _ in batch])
    labels = [label.numpy() for _, label in batch]

    df = pd.DataFrame({"image": images, "label": labels})

    return df
    

train_dataset = train_dataset.map_batches(convert_batch_to_pandas)
test_dataset = test_dataset.map_batches(convert_batch_to_pandas)

test_dataset

Dataset(num_blocks=1, num_rows=10000, schema={image: TensorDtype, label: int64})

```

def convert_batch_to_pandas(batch):
    images = TensorArray([image.numpy() for image, _ in batch])
    labels = [label.numpy() for _, label in batch]

    df = pd.DataFrame({"image": images, "label": labels})

    return df
    

train_dataset = train_dataset.map_batches(convert_batch_to_pandas)
test_dataset = test_dataset.map_batches(convert_batch_to_pandas)

test_dataset

Map_Batches:   0%|          | 0/1 [00:04<?, ?it/s][2m[36m(raylet)[0m Spilled 4263 MiB, 32 objects, write throughput 813 MiB/s.
Map_Batches: 100%|██████████| 1/1 [00:06<00:00,  6.30s/it]
Map_Batches: 100%|██████████| 1/1 [00:00<00:00,  1.21it/s]


Dataset(num_blocks=1, num_rows=10000, schema={image: TensorDtype, label: int64})

## Train a convolutional neural network

In [52]:
def build_model():
    model = models.Sequential()
    def squeeze(input):
        print(input.shape)
        return tf.squeeze(input, axis=1)
    model.add(layers.Lambda(squeeze))
    model.add(layers.Conv2D(6, (5, 5), activation='relu', input_shape=(32, 32, 3)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(16, (5, 5), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Flatten())
    model.add(layers.Dense(120, activation='relu'))
    model.add(layers.Dense(84, activation='relu'))
    model.add(layers.Dense(10))
    return model

In [58]:
from ray import train
from ray.train.tensorflow import prepare_dataset_shard


# Slower than Torch?

def train_loop_per_worker(config):
    dataset_shard = train.get_dataset_shard("train")
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    with strategy.scope():
        model = build_model()
        model.compile(optimizer='adam',
                    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                    metrics=['accuracy'])
    
    for epoch in range(2):  # TODO: Change to 2 epochs
        tf_dataset = prepare_dataset_shard(
            dataset_shard.to_tf(
                feature_columns=["image"],
                label_column="label",
                output_signature=(
                    tf.TensorSpec(shape=(None, 1, 32, 32, 3), dtype=tf.float32),
                    tf.TensorSpec(shape=(None, 1), dtype=tf.uint8),
                ),
                batch_size=config["batch_size"],
                unsqueeze_label_tensor=True,
            )
        )
        model.fit(tf_dataset)
        train.save_checkpoint(epoch=epoch, model=model.get_weights())  # model_weights vs model confusing, in example

In [59]:
from ray.ml.train.integrations.tensorflow import TensorflowTrainer

trainer = TensorflowTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={"batch_size": 2},
    datasets={"train": train_dataset},
    scaling_config={"num_workers": 2}
)
result = trainer.fit()
latest_checkpoint = result.checkpoint

Trial name,status,loc
TensorflowTrainer_25727_00000,TERMINATED,127.0.0.1:18850


[2m[33m(raylet)[0m 2022-05-21 18:33:14,184	INFO context.py:70 -- Exec'ing worker with command: exec /Users/balaji/GitHub/ray/.venv/bin/python /Users/balaji/GitHub/ray/.venv/lib/python3.8/site-packages/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=52605 --object-store-name=/tmp/ray/session_2022-05-21_16-05-51_439235_13824/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-21_16-05-51_439235_13824/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=64222 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:56352 --redis-password=5241590000000000 --startup-token=59 --runtime-env-hash=1215741992
[2m[33m(raylet)[0m 2022-05-21 18:33:19,263	INFO context.py:70 -- Exec'ing worker with command: exec /Users/balaji/GitHub/ray/.venv/bin/python /Users/balaji/GitHub/ray/.venv/lib/python3.8/site-packages/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port

[2m[36m(BaseWorkerMixin pid=18854)[0m (None, 1, 32, 32, 3)
[2m[36m(BaseWorkerMixin pid=18855)[0m (None, 1, 32, 32, 3)
[2m[36m(BaseWorkerMixin pid=18854)[0m (None, 1, 32, 32, 3)
[2m[36m(BaseWorkerMixin pid=18855)[0m (None, 1, 32, 32, 3)
[2m[36m(BaseWorkerMixin pid=18855)[0m (None, 1, 32, 32, 3)
[2m[36m(BaseWorkerMixin pid=18854)[0m (None, 1, 32, 32, 3)
      1/Unknown - 2s 2s/step - loss: 2.6434 - accuracy: 0.0000e+00
      1/Unknown - 2s 2s/step - loss: 2.6434 - accuracy: 0.0000e+00
     17/Unknown - 2s 6ms/step - loss: 2.3814 - accuracy: 0.0882  
     17/Unknown - 2s 6ms/step - loss: 2.3814 - accuracy: 0.0882  
     33/Unknown - 2s 6ms/step - loss: 2.3450 - accuracy: 0.0909
     33/Unknown - 2s 6ms/step - loss: 2.3450 - accuracy: 0.0909
     41/Unknown - 2s 6ms/step - loss: 2.3465 - accuracy: 0.0976
     41/Unknown - 2s 6ms/step - loss: 2.3465 - accuracy: 0.0976
     56/Unknown - 2s 7ms/step - loss: 2.3370 - accuracy: 0.1071
     56/Unknown - 2s 7ms/step - loss: 2.33

[2m[36m(BaseWorkerMixin pid=18854)[0m 2022-05-21 18:36:29.559724: W tensorflow/core/framework/dataset.cc:768] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.
[2m[36m(BaseWorkerMixin pid=18855)[0m 2022-05-21 18:36:29.551967: W tensorflow/core/framework/dataset.cc:768] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.


     19/Unknown - 0s 9ms/step - loss: 1.7495 - accuracy: 0.3947
     19/Unknown - 0s 9ms/step - loss: 1.7495 - accuracy: 0.3947
     33/Unknown - 0s 8ms/step - loss: 1.6583 - accuracy: 0.4394
     33/Unknown - 0s 8ms/step - loss: 1.6583 - accuracy: 0.4394
     50/Unknown - 0s 8ms/step - loss: 1.6463 - accuracy: 0.4500
     50/Unknown - 0s 8ms/step - loss: 1.6463 - accuracy: 0.4500
     67/Unknown - 1s 7ms/step - loss: 1.6312 - accuracy: 0.4478
     67/Unknown - 1s 7ms/step - loss: 1.6312 - accuracy: 0.4478
     85/Unknown - 1s 7ms/step - loss: 1.6334 - accuracy: 0.4471
     85/Unknown - 1s 7ms/step - loss: 1.6334 - accuracy: 0.4471
    100/Unknown - 1s 7ms/step - loss: 1.6021 - accuracy: 0.4500
    100/Unknown - 1s 7ms/step - loss: 1.6021 - accuracy: 0.4500
    117/Unknown - 1s 7ms/step - loss: 1.5979 - accuracy: 0.4359
    117/Unknown - 1s 7ms/step - loss: 1.5979 - accuracy: 0.4359
    141/Unknown - 1s 7ms/step - loss: 1.6000 - accuracy: 0.4291
    141/Unknown - 1s 7ms/step - loss: 1.

2022-05-21 18:39:36,920	ERROR checkpoint_manager.py:189 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']


Trial TensorflowTrainer_25727_00000 completed. Last result: 


[2m[36m(BaseWorkerMixin pid=18855)[0m E0521 18:39:36.911871000 123145438601216 chttp2_transport.cc:1132]     Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"
2022-05-21 18:39:37,034	INFO tune.py:752 -- Total run time: 383.61 seconds (383.47 seconds for the tuning loop).
[2m[36m(BaseWorkerMixin pid=18855)[0m Exception ignored in: <function Pool.__del__ at 0x1c5fdfd30>
[2m[36m(BaseWorkerMixin pid=18855)[0m Traceback (most recent call last):
[2m[36m(BaseWorkerMixin pid=18855)[0m   File "/Users/balaji/.pyenv/versions/3.8.12/lib/python3.8/multiprocessing/pool.py", line 268, in __del__
[2m[36m(BaseWorkerMixin pid=18855)[0m     self._change_notifier.put(None)
[2m[36m(BaseWorkerMixin pid=18855)[0m   File "/Users/balaji/.pyenv/versions/3.8.12/lib/python3.8/multiprocessing/queues.py", line 368, in put
[2m[36m(BaseWorkerMixin pid=18855)[0m     self._writer.send_bytes(obj)
[2m[36m(BaseWorkerMixin pid=18855)[0m   File "/Users/balaj

## Test the network on the test data

In [111]:
from ray.ml.preprocessor import Preprocessor
from ray.ml.predictors.integrations.tensorflow import TensorflowPredictor
from ray.ml.batch_predictor import BatchPredictor
batch_predictor = BatchPredictor.from_checkpoint(
    checkpoint=latest_checkpoint,
    predictor_cls=TensorflowPredictor,
    model_definition=build_model,
)

    
outputs: ray.data.Dataset = batch_predictor.predict(
    data=test_dataset, feature_columns=["image"])
outputs.show(1)

# Save checkpoint to file?


mydf = None



Map Progress (1 actors 1 pending):   0%|          | 0/1 [00:04<?, ?it/s][2m[36m(BlockWorker pid=25353)[0m 2022-05-21 19:26:47.045642: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
[2m[36m(BlockWorker pid=25353)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2m[36m(BlockWorker pid=25353)[0m (None, 1, 32, 32, 3)
[2m[36m(BlockWorker pid=25353)[0m (4096, 1, 32, 32, 3)
[2m[36m(BlockWorker pid=25353)[0m (None, 1, 32, 32, 3)
[2m[36m(BlockWorker pid=25353)[0m (4096, 1, 32, 32, 3)


Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:05<00:00,  5.05s/it]

[2m[36m(BlockWorker pid=25353)[0m (None, 1, 32, 32, 3)
{'predictions': array([-0.78461206, -5.066856  , -0.20851843,  0.3337581 ,  0.7118397 ,
        0.2990406 , -1.5815643 ,  0.49869645, -2.6954265 , -3.519926  ],
      dtype=float32)}





[2m[36m(BlockWorker pid=25353)[0m (1808, 1, 32, 32, 3)


In [112]:
import numpy as np
def convert_logits_to_classes(df):
    best_class = df["predictions"].map(lambda x: x.argmax())
    df["prediction"] = best_class
    return df[["prediction"]]
predictions = outputs.map_batches(
    convert_logits_to_classes, batch_format="pandas"
)

Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 23.27it/s]


In [113]:
def calculate_prediction_scores(df):
    df["correct"] = df["prediction"] == df["label"]
    return df[["prediction", "label", "correct"]]
scores = test_dataset.zip(predictions).map_batches(calculate_prediction_scores)
scores.show(1)

Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 16.36it/s]

{'prediction': 4, 'label': 7, 'correct': False}





In [114]:
scores.sum(on="correct") / scores.count()


Shuffle Map: 100%|██████████| 1/1 [00:00<00:00, 47.96it/s]
Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 93.84it/s]


0.4722

In [110]:
test_dataset.count()

10000

In [79]:
df = next(train_dataset.iter_batches(batch_size=4096, batch_format="pandas"))

In [86]:
import numpy as np
values = np.stack([col.to_numpy() for _, col in df.items()], axis=1)

ValueError: all input arrays must have the same shape

In [99]:
model = build_model()

2022-05-21 19:16:41.695390: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [106]:
type(x)

NoneType

In [148]:
from ray import serve
from ray.serve.model_wrappers import ModelWrapperDeployment
from ray.serve.http_adapters import json_to_ndarray, NdArray

def my_adapter(data: NdArray):
    print(data)
    assert False
    return np.array(data.array)
serve.start(detached=True)
deployment = ModelWrapperDeployment.options(name="my-deployment2")
deployment.deploy(TensorflowPredictor, latest_checkpoint, http_adapter=my_adapter, batching_params=False, model_definition=build_model)

[2m[36m(ServeController pid=26527)[0m INFO 2022-05-21 19:54:15,056 controller 26527 deployment_state.py:1175 - Stopping 1 replicas of deployment 'my-deployment2' with outdated versions.
[2m[36m(ServeController pid=26527)[0m INFO 2022-05-21 19:54:17,236 controller 26527 deployment_state.py:1216 - Adding 1 replicas to deployment 'my-deployment2'.


In [139]:
batch = test_dataset.take(1)
array = np.expand_dims(np.array(batch[0]["image"]), axis=[0, 1])
array.dtype

dtype('float32')

In [149]:
import requests
payload = {"array": array.tolist()}
response = requests.post(deployment.url, json=payload)
response.json()

[2m[36m(my-deployment2 pid=28394)[0m INFO 2022-05-21 19:54:24,374 my-deployment2 my-deployment2#SHGdCB replica.py:478 - HANDLE __call__ OK 0.3ms
[2m[36m(HTTPProxyActor pid=26529)[0m INFO 2022-05-21 19:54:24,375 http_proxy 127.0.0.1 http_proxy.py:315 - POST /my-deployment2 307 4.4ms
[2m[36m(HTTPProxyActor pid=26529)[0m INFO 2022-05-21 19:54:24,432 http_proxy 127.0.0.1 http_proxy.py:315 - POST /my-deployment2 422 50.7ms
[2m[36m(my-deployment2 pid=28394)[0m INFO 2022-05-21 19:54:24,430 my-deployment2 my-deployment2#SHGdCB replica.py:478 - HANDLE __call__ OK 47.2ms


{'detail': [{'loc': ['body', 'array', 0],
   'msg': 'value is not a valid float',
   'type': 'type_error.float'},
  {'loc': ['body', 'array', 0, 0],
   'msg': 'value is not a valid float',
   'type': 'type_error.float'},
  {'loc': ['body', 'array', 0, 0, 0],
   'msg': 'value is not a valid float',
   'type': 'type_error.float'},
  {'loc': ['body', 'array', 0, 0, 1],
   'msg': 'value is not a valid float',
   'type': 'type_error.float'},
  {'loc': ['body', 'array', 0, 0, 2],
   'msg': 'value is not a valid float',
   'type': 'type_error.float'},
  {'loc': ['body', 'array', 0, 0, 3],
   'msg': 'value is not a valid float',
   'type': 'type_error.float'},
  {'loc': ['body', 'array', 0, 0, 4],
   'msg': 'value is not a valid float',
   'type': 'type_error.float'},
  {'loc': ['body', 'array', 0, 0, 5],
   'msg': 'value is not a valid float',
   'type': 'type_error.float'},
  {'loc': ['body', 'array', 0, 0, 6],
   'msg': 'value is not a valid float',
   'type': 'type_error.float'},
  {'loc':

## What's next

TODO