# Hello Image Data

This tutorial demonstrates how to train an image classifier using TensorFlow and the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html).

You should be familiar with TensorFlow before starting this tutorial. If you need a refresher, read TensorFlow's [Convolutional Neural Network](https://www.tensorflow.org/tutorials/images/cnn) tutorial.

## Before you begin

* Install the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html). You'll need Ray 1.13 or later to run this example.

```
pip install 'ray[data,tune]'
```

* Install `tensorflow` and `tensorflow-datasets`

```
pip install tensorflow tensorflow-datasets
```


## Load and normalize CIFAR-10

In [49]:
import ray
from ray.data.datasource import SimpleTensorFlowDatasource
import tensorflow as tf

from tensorflow.keras import layers, models
import tensorflow_datasets as tfds

def train_dataset_factory():
    """Load the CIFAR-10 "train" split through tensorflow-datasets.

    The split is requested as a one-element list, so the loader hands back a
    list of datasets; the single entry is returned.
    """
    splits = tfds.load("cifar10", split=["train"], as_supervised=True)
    return splits[0]

def test_dataset_factory():
    """Load the CIFAR-10 "test" split through tensorflow-datasets.

    The split is requested as a one-element list, so the loader hands back a
    list of datasets; the single entry is returned.
    """
    splits = tfds.load("cifar10", split=["test"], as_supervised=True)
    return splits[0]

# Create one Ray Dataset per split. Each factory function is passed to the
# datasource through the `dataset_factory` keyword.
train_dataset = ray.data.read_datasource(
    SimpleTensorFlowDatasource(),
    dataset_factory=train_dataset_factory,
)
test_dataset = ray.data.read_datasource(
    SimpleTensorFlowDatasource(),
    dataset_factory=test_dataset_factory,
)




In [50]:
def normalize_images(batch):
    """Cast each image in ``batch`` to float32 and divide it by 255.

    Args:
        batch: An iterable of ``(image, label)`` pairs.

    Returns:
        A list of ``(scaled_image, label)`` tuples; labels are passed through
        untouched.
    """
    normalized = []
    for image, label in batch:
        scaled_image = tf.cast(image, tf.float32) / 255.0
        normalized.append((scaled_image, label))
    return normalized

# Apply the same pixel scaling to both splits (train first, then test).
train_dataset, test_dataset = [
    split.map_batches(normalize_images)
    for split in (train_dataset, test_dataset)
]

Read->Map_Batches: 100%|██████████| 1/1 [00:13<00:00, 13.54s/it]
Read->Map_Batches: 100%|██████████| 1/1 [00:02<00:00,  2.73s/it]


In [51]:
import pandas as pd
from ray.data.extensions import TensorArray


def convert_batch_to_pandas(batch):
    """Pack a batch of ``(image, label)`` pairs into a two-column DataFrame.

    Args:
        batch: A re-iterable collection of ``(image, label)`` pairs whose
            elements expose ``.numpy()``. It is traversed twice, once per
            column.

    Returns:
        A ``pandas.DataFrame`` with an ``"image"`` column backed by a
        ``TensorArray`` and a ``"label"`` column holding the label values.
    """
    columns = {
        "image": TensorArray([img.numpy() for img, _label in batch]),
        "label": [lbl.numpy() for _img, lbl in batch],
    }
    return pd.DataFrame(columns)
    

# Convert both splits to pandas-backed blocks (train first, then test).
train_dataset, test_dataset = [
    split.map_batches(convert_batch_to_pandas)
    for split in (train_dataset, test_dataset)
]

# Leave the test split as the cell's last expression so its summary is shown.
test_dataset

Map_Batches:   0%|          | 0/1 [00:04<?, ?it/s][2m[36m(raylet)[0m Spilled 4263 MiB, 32 objects, write throughput 813 MiB/s.
Map_Batches: 100%|██████████| 1/1 [00:06<00:00,  6.30s/it]
Map_Batches: 100%|██████████| 1/1 [00:00<00:00,  1.21it/s]


Dataset(num_blocks=1, num_rows=10000, schema={image: TensorDtype, label: int64})

## Train a convolutional neural network

In [43]:
def build_model():
    """Build the (uncompiled) convolutional network used in this tutorial.

    The first layer removes axis 1 from its input; the training loop in this
    notebook feeds images with signature ``(None, 1, 32, 32, 3)``, so the
    convolutional stack receives ``(None, 32, 32, 3)`` tensors. The final
    ``Dense(10)`` layer has no activation, so the model emits raw logits.

    Returns:
        A ``tf.keras`` ``Sequential`` model. Compiling is left to the caller.
    """

    def squeeze(images):
        # Drop the extra axis at position 1 before the first convolution.
        return tf.squeeze(images, axis=1)

    model = models.Sequential()
    model.add(layers.Lambda(squeeze))
    # NOTE(review): `input_shape` is given on the second layer rather than the
    # first; Keras Sequential appears to honour it only on the first layer, so
    # it is likely informational here. Confirm before relying on it.
    model.add(layers.Conv2D(6, (5, 5), activation='relu', input_shape=(32, 32, 3)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(16, (5, 5), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Flatten())
    model.add(layers.Dense(120, activation='relu'))
    model.add(layers.Dense(84, activation='relu'))
    model.add(layers.Dense(10))
    return model

In [44]:
from ray import train
from ray.train.tensorflow import prepare_dataset_shard


# Slower than Torch?

def train_loop_per_worker(config):
    """Training function executed by each Ray Train worker.

    Builds and compiles the model under a multi-worker mirrored strategy,
    then for each epoch converts this worker's dataset shard to a TensorFlow
    dataset, fits the model on it once, and saves a checkpoint containing
    the epoch index and the current model weights.

    Args:
        config: Dict of hyperparameters. Reads ``config["batch_size"]``
            (required) and ``config.get("epochs", 2)`` (optional; defaults
            to 2 epochs).
    """
    dataset_shard = train.get_dataset_shard("train")
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    with strategy.scope():
        # The model must be created and compiled inside the strategy scope.
        model = build_model()
        model.compile(
            optimizer='adam',
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy'],
        )

    num_epochs = config.get("epochs", 2)
    for epoch in range(num_epochs):
        # A fresh TensorFlow dataset is created from the shard every epoch.
        tf_dataset = prepare_dataset_shard(
            dataset_shard.to_tf(
                feature_columns=["image"],
                label_column="label",
                output_signature=(
                    tf.TensorSpec(shape=(None, 1, 32, 32, 3), dtype=tf.float32),
                    tf.TensorSpec(shape=(None, 1), dtype=tf.uint8),
                ),
                batch_size=config["batch_size"],
                unsqueeze_label_tensor=True,
            )
        )
        model.fit(tf_dataset)
        train.save_checkpoint(epoch=epoch, model_weights=model.get_weights())

In [45]:
from ray.ml.train.integrations.tensorflow import TensorflowTrainer

# Run `train_loop_per_worker` on two workers, feeding it the training split
# under the "train" key and a batch size of 2.
trainer = TensorflowTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config=dict(batch_size=2),
    datasets=dict(train=train_dataset),
    scaling_config=dict(num_workers=2),
)

# Keep the full result object and pull out the checkpoint for later cells.
result = trainer.fit()
latest_checkpoint = result.checkpoint

Trial name,status,loc
TensorflowTrainer_26259_00000,TERMINATED,127.0.0.1:16385


[2m[33m(raylet)[0m 2022-05-21 16:24:24,475	INFO context.py:70 -- Exec'ing worker with command: exec /Users/balaji/GitHub/ray/.venv/bin/python /Users/balaji/GitHub/ray/.venv/lib/python3.8/site-packages/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=52605 --object-store-name=/tmp/ray/session_2022-05-21_16-05-51_439235_13824/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-21_16-05-51_439235_13824/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=64222 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:56352 --redis-password=5241590000000000 --startup-token=46 --runtime-env-hash=1215741992
[2m[33m(raylet)[0m 2022-05-21 16:24:29,576	INFO context.py:70 -- Exec'ing worker with command: exec /Users/balaji/GitHub/ray/.venv/bin/python /Users/balaji/GitHub/ray/.venv/lib/python3.8/site-packages/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port

      1/Unknown - 3s 3s/step - loss: 2.4705 - accuracy: 0.0000e+00
      1/Unknown - 3s 3s/step - loss: 2.4705 - accuracy: 0.0000e+00
     18/Unknown - 3s 6ms/step - loss: 2.3254 - accuracy: 0.0556  
     18/Unknown - 3s 6ms/step - loss: 2.3254 - accuracy: 0.0556  
     35/Unknown - 3s 6ms/step - loss: 2.3116 - accuracy: 0.0571
     35/Unknown - 3s 6ms/step - loss: 2.3116 - accuracy: 0.0571
     50/Unknown - 4s 7ms/step - loss: 2.3165 - accuracy: 0.1000
     50/Unknown - 4s 7ms/step - loss: 2.3165 - accuracy: 0.1000
     67/Unknown - 4s 6ms/step - loss: 2.3178 - accuracy: 0.0896
     67/Unknown - 4s 6ms/step - loss: 2.3178 - accuracy: 0.0896
     82/Unknown - 4s 7ms/step - loss: 2.3154 - accuracy: 0.0854
     82/Unknown - 4s 7ms/step - loss: 2.3154 - accuracy: 0.0854
     99/Unknown - 4s 7ms/step - loss: 2.3130 - accuracy: 0.0909
     99/Unknown - 4s 7ms/step - loss: 2.3130 - accuracy: 0.0909
    122/Unknown - 4s 7ms/step - loss: 2.3131 - accuracy: 0.1066
    122/Unknown - 4s 7ms/step 

[2m[36m(BaseWorkerMixin pid=16398)[0m 2022-05-21 16:27:23.801448: W tensorflow/core/framework/dataset.cc:768] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.
[2m[36m(BaseWorkerMixin pid=16399)[0m 2022-05-21 16:27:23.803250: W tensorflow/core/framework/dataset.cc:768] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.


     16/Unknown - 0s 7ms/step - loss: 1.6129 - accuracy: 0.4062 
     16/Unknown - 0s 7ms/step - loss: 1.6129 - accuracy: 0.4062 
     29/Unknown - 0s 7ms/step - loss: 1.5779 - accuracy: 0.4138
     29/Unknown - 0s 7ms/step - loss: 1.5779 - accuracy: 0.4138
     44/Unknown - 0s 7ms/step - loss: 1.5980 - accuracy: 0.4205
     44/Unknown - 0s 7ms/step - loss: 1.5980 - accuracy: 0.4205
     59/Unknown - 1s 7ms/step - loss: 1.5493 - accuracy: 0.4492
     59/Unknown - 1s 7ms/step - loss: 1.5493 - accuracy: 0.4492
     76/Unknown - 1s 7ms/step - loss: 1.5420 - accuracy: 0.4671
     76/Unknown - 1s 7ms/step - loss: 1.5420 - accuracy: 0.4671
     91/Unknown - 1s 7ms/step - loss: 1.5401 - accuracy: 0.4560
     91/Unknown - 1s 7ms/step - loss: 1.5401 - accuracy: 0.4560
    108/Unknown - 1s 7ms/step - loss: 1.5472 - accuracy: 0.4444
    108/Unknown - 1s 7ms/step - loss: 1.5472 - accuracy: 0.4444
    123/Unknown - 1s 7ms/step - loss: 1.5641 - accuracy: 0.4350
    123/Unknown - 1s 7ms/step - loss: 

2022-05-21 16:30:20,885	ERROR checkpoint_manager.py:189 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']


Trial TensorflowTrainer_26259_00000 completed. Last result: 


2022-05-21 16:30:20,998	INFO tune.py:752 -- Total run time: 357.34 seconds (357.20 seconds for the tuning loop).
[2m[36m(BaseWorkerMixin pid=16398)[0m Exception ignored in: <function Pool.__del__ at 0x1baf6dd30>
[2m[36m(BaseWorkerMixin pid=16398)[0m Traceback (most recent call last):
[2m[36m(BaseWorkerMixin pid=16398)[0m   File "/Users/balaji/.pyenv/versions/3.8.12/lib/python3.8/multiprocessing/pool.py", line 268, in __del__
[2m[36m(BaseWorkerMixin pid=16398)[0m     self._change_notifier.put(None)
[2m[36m(BaseWorkerMixin pid=16398)[0m   File "/Users/balaji/.pyenv/versions/3.8.12/lib/python3.8/multiprocessing/queues.py", line 368, in put
[2m[36m(BaseWorkerMixin pid=16398)[0m     self._writer.send_bytes(obj)
[2m[36m(BaseWorkerMixin pid=16398)[0m   File "/Users/balaji/.pyenv/versions/3.8.12/lib/python3.8/multiprocessing/connection.py", line 200, in send_bytes
[2m[36m(BaseWorkerMixin pid=16398)[0m     self._send_bytes(m[offset:offset + size])
[2m[36m(BaseWorkerMix

## Test the network on the test data

In [None]:
from ray.ml.predictors.integrations.tensorflow import TensorflowPredictor
from ray.ml.batch_predictor import BatchPredictor
# Build a batch predictor from the checkpoint produced by training. The
# predictor needs a way to re-create the network, so pass the same
# `build_model` function that the training loop used (this notebook defines
# no `Net` class).
batch_predictor = BatchPredictor.from_checkpoint(
    checkpoint=latest_checkpoint,
    predictor_cls=TensorflowPredictor,
    model_definition=build_model,
)

# NOTE(review): `unsqueeze` is forwarded to the predictor's `predict`; confirm
# that TensorflowPredictor accepts it in the installed Ray version.
outputs: ray.data.Dataset = batch_predictor.predict(
    data=test_dataset, feature_columns=["image"], unsqueeze=False
)
# Show a single prediction as a sanity check.
outputs.show(1)

# Save checkpoint to file?

## What's next

TODO