# Hello Image Data

This tutorial illustrates how to train an image classifier using the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html).

You should be familiar with [PyTorch](https://pytorch.org/) before starting the tutorial. If you need a refresher, read PyTorch's [training a classifier](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html) tutorial.

## Before you begin

* Install the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html). You'll need Ray 1.13 or later to run this example.

```
pip install 'ray[data,tune]'
```

* Install `torch` and `torchvision`

```
pip install torch torchvision
```


## Load and normalize CIFAR-10

In [2]:

import ray
from ray.data.datasource import SimpleTorchDatasource
import torchvision
import torchvision.transforms as transforms

# Preprocessing applied to every image: convert it to a tensor, then normalize
# each of the three channels with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)

def train_dataset_factory():
    """Build the CIFAR-10 training split.

    The data is stored under ``./data`` (downloaded if needed) and every image
    is passed through the module-level ``transform``.
    """
    return torchvision.datasets.CIFAR10(
        root="./data",
        train=True,
        download=True,
        transform=transform,
    )

def test_dataset_factory():
    """Build the CIFAR-10 test split.

    The data is stored under ``./data`` (downloaded if needed) and every image
    is passed through the module-level ``transform``.
    """
    return torchvision.datasets.CIFAR10(
        root="./data",
        train=False,
        download=True,
        transform=transform,
    )

# Create one Ray Dataset per split. Each factory is handed to
# SimpleTorchDatasource through the `dataset_factory` argument.
train_dataset = ray.data.read_datasource(SimpleTorchDatasource(), dataset_factory=train_dataset_factory)
test_dataset = ray.data.read_datasource(SimpleTorchDatasource(), dataset_factory=test_dataset_factory)

# Display the training dataset summary as the cell output.
train_dataset

2022-05-09 14:02:17,195	INFO services.py:1478 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(_execute_read_task pid=14274)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


0.0%[36m(_execute_read_task pid=14274)[0m 
0.0%[36m(_execute_read_task pid=14274)[0m 
0.3%[36m(_execute_read_task pid=14274)[0m 
0.5%[36m(_execute_read_task pid=14274)[0m 
1.2%[36m(_execute_read_task pid=14274)[0m 
2.5%[36m(_execute_read_task pid=14274)[0m 
2.5%[36m(_execute_read_task pid=14274)[0m 
2.5%[36m(_execute_read_task pid=14274)[0m 
2.6%[36m(_execute_read_task pid=14274)[0m 
4.3%[36m(_execute_read_task pid=14274)[0m 
6.2%[36m(_execute_read_task pid=14274)[0m 
7.3%[36m(_execute_read_task pid=14274)[0m 
7.3%[36m(_execute_read_task pid=14274)[0m 
7.4%[36m(_execute_read_task pid=14274)[0m 
7.4%[36m(_execute_read_task pid=14274)[0m 
9.4%[36m(_execute_read_task pid=14274)[0m 
9.4%[36m(_execute_read_task pid=14274)[0m 
10.9%[36m(_execute_read_task pid=14274)[0m 
10.9%[36m(_execute_read_task pid=14274)[0m 
10.9%[36m(_execute_read_task pid=14274)[0m 
11.0%[36m(_execute_read_task pid=14274)[0m 
11.0%[36m(_execute_read_task pid=14274)[0m 
11.1%[36m(

[2m[36m(_execute_read_task pid=14274)[0m Extracting ./data/cifar-10-python.tar.gz to ./data




[2m[36m(_execute_read_task pid=14274)[0m Files already downloaded and verified


Dataset(num_blocks=1, num_rows=50000, schema=<class 'tuple'>)

In [11]:
import pandas as pd


def convert_batch_to_pandas(batch):
    """Turn a batch of ``(image, label)`` pairs into a two-column DataFrame.

    Args:
        batch: A list of ``(image, label)`` tuples.

    Returns:
        A ``pandas.DataFrame`` with an ``"image"`` column and a ``"label"``
        column, one row per pair, in the same order as ``batch``.
    """
    columns = {"image": [], "label": []}
    for image, label in batch:
        columns["image"].append(image)
        columns["label"].append(label)

    return pd.DataFrame(columns)
    

# Convert both splits from (image, label) tuples into pandas batches.
# NOTE(review): this rebinds `train_dataset` / `test_dataset`, so re-running
# this cell without re-running the loading cell applies the conversion twice.
train_dataset = train_dataset.map_batches(convert_batch_to_pandas)
test_dataset = test_dataset.map_batches(convert_batch_to_pandas)

# Display the converted training dataset summary as the cell output.
train_dataset

Files already downloaded and verified


[2m[36m(raylet)[0m Spilled 2237 MiB, 75 objects, write throughput 651 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.
[2m[36m(_split_block pid=14559)[0m E0509 14:10:32.872928000 123145488416768 chttp2_transport.cc:1132]     Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"


Files already downloaded and verified


Dataset(num_blocks=200, num_rows=50000, schema={image: object, label: int64})

## Train a convolutional neural network

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small convolutional classifier with ten output classes.

    Two 5x5 convolutions, each followed by ReLU and 2x2 max pooling, feed
    three fully connected layers. The first linear layer expects
    ``16 * 5 * 5`` features, which is what a 3-channel 32x32 input produces
    after the two convolution/pooling stages.
    """

    def __init__(self):
        super().__init__()
        # Layers are registered in the same order and under the same names so
        # that ``state_dict`` keys stay stable.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the raw class scores (logits) for a batch of images."""
        feature_maps = self.pool(F.relu(self.conv1(x)))
        feature_maps = self.pool(F.relu(self.conv2(feature_maps)))

        # Keep the batch dimension and collapse everything else.
        flat = feature_maps.flatten(start_dim=1)

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [5]:
from ray import train
import torch.optim as optim


def train_loop_per_worker(config):
    """Train ``Net`` on this worker's shard of the ``"train"`` dataset.

    A checkpoint holding the model's ``state_dict`` is saved at the end of
    every epoch.

    Args:
        config: Training configuration dict. ``"batch_size"`` is required.
            Optional keys and their defaults: ``"num_epochs"`` (2),
            ``"lr"`` (0.001) and ``"momentum"`` (0.9).
    """
    model = train.torch.prepare_model(Net())

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.001),
        momentum=config.get("momentum", 0.9),
    )

    train_dataset_shard = train.get_dataset_shard("train").to_torch(
        feature_columns=["image"],
        label_column="label",
        batch_size=config["batch_size"],
        unsqueeze_feature_tensors=False,
        unsqueeze_label_tensor=False
    )

    for epoch in range(config.get("num_epochs", 2)):
        running_loss = 0.0
        for i, data in enumerate(train_dataset_shard):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
                running_loss = 0.0

        # `.module` exists only when prepare_model wrapped the network in
        # DistributedDataParallel; fall back to the model itself otherwise so
        # the loop also works when no wrapping took place.
        unwrapped_model = getattr(model, "module", model)
        train.save_checkpoint(model=unwrapped_model.state_dict())

In [6]:
from ray.ml.train.integrations.torch import TorchTrainer

# Configure training: two workers, each running `train_loop_per_worker` with a
# batch size of 2 and the dataset registered under the "train" key.
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={"batch_size": 2},
    datasets={"train": train_dataset},
    scaling_config={"num_workers": 2}
)
# Run training and keep the checkpoint from the result for later use.
result = trainer.fit()
latest_checkpoint = result.checkpoint

Trial name,status,loc
TorchTrainer_7fa02_00000,TERMINATED,127.0.0.1:14492


[2m[36m(BaseWorkerMixin pid=14521)[0m 2022-05-09 14:03:43,510	INFO torch.py:346 -- Setting up process group for: env:// [rank=0, world_size=2]
[2m[36m(BaseWorkerMixin pid=14522)[0m 2022-05-09 14:03:43,510	INFO torch.py:346 -- Setting up process group for: env:// [rank=1, world_size=2]
[2m[36m(BaseWorkerMixin pid=14521)[0m 2022-05-09 14:03:57,559	INFO torch.py:98 -- Moving model to device: cpu
[2m[36m(BaseWorkerMixin pid=14521)[0m 2022-05-09 14:03:57,559	INFO torch.py:132 -- Wrapping provided model in DDP.
[2m[36m(BaseWorkerMixin pid=14522)[0m 2022-05-09 14:03:57,559	INFO torch.py:98 -- Moving model to device: cpu
[2m[36m(BaseWorkerMixin pid=14522)[0m 2022-05-09 14:03:57,559	INFO torch.py:132 -- Wrapping provided model in DDP.


[2m[36m(BaseWorkerMixin pid=14521)[0m [1,  2000] loss: 2.259
[2m[36m(BaseWorkerMixin pid=14522)[0m [1,  2000] loss: 2.261
[2m[36m(BaseWorkerMixin pid=14521)[0m [1,  4000] loss: 1.870
[2m[36m(BaseWorkerMixin pid=14522)[0m [1,  4000] loss: 1.898
[2m[36m(BaseWorkerMixin pid=14521)[0m [1,  6000] loss: 1.706
[2m[36m(BaseWorkerMixin pid=14522)[0m [1,  6000] loss: 1.712
[2m[36m(BaseWorkerMixin pid=14521)[0m [1,  8000] loss: 1.626
[2m[36m(BaseWorkerMixin pid=14522)[0m [1,  8000] loss: 1.614
[2m[36m(BaseWorkerMixin pid=14521)[0m [1, 10000] loss: 1.527
[2m[36m(BaseWorkerMixin pid=14522)[0m [1, 10000] loss: 1.570
[2m[36m(BaseWorkerMixin pid=14521)[0m [1, 12000] loss: 1.470
[2m[36m(BaseWorkerMixin pid=14522)[0m [1, 12000] loss: 1.476
[2m[36m(BaseWorkerMixin pid=14521)[0m [2,  2000] loss: 1.437
[2m[36m(BaseWorkerMixin pid=14522)[0m [2,  2000] loss: 1.411
[2m[36m(BaseWorkerMixin pid=14521)[0m [2,  4000] loss: 1.408
[2m[36m(BaseWorkerMixin pid=14522)[0m

2022-05-09 14:06:04,518	ERROR checkpoint_manager.py:189 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']


Trial TorchTrainer_7fa02_00000 completed. Last result: 


2022-05-09 14:06:04,632	INFO tune.py:752 -- Total run time: 145.88 seconds (145.73 seconds for the tuning loop).


## Test the network on the test data

In [12]:
from ray.ml.predictors.integrations.torch import TorchPredictor
from ray.ml.batch_predictor import BatchPredictor

# Build a batch predictor from the training checkpoint. A fresh `Net()` is
# passed alongside the checkpoint as the model to use.
batch_predictor = BatchPredictor.from_checkpoint(
    checkpoint=latest_checkpoint,
    predictor_cls=TorchPredictor,
    model=Net(),
)
    
# Run inference over the test split using only the "image" column.
outputs = batch_predictor.predict(
    data=test_dataset, feature_columns=["image"], unsqueeze=False
)

# Show one record of the predictions.
outputs.show(1)


{'predictions': [-0.751593291759491, -2.1459906101226807, 0.5943943858146667, 1.7881542444229126, 0.2634425461292267, 0.37997013330459595, 1.4923861026763916, -1.0648168325424194, 0.024911552667617798, -1.5318701267242432]}


In [13]:
import numpy as np

def convert_logits_to_classes(df):
    """Map each row's logits to the index of its highest-scoring class.

    Args:
        df: A ``pandas.DataFrame`` with a ``"predictions"`` column whose
            entries are sequences of per-class scores.

    Returns:
        A new single-column DataFrame named ``"prediction"`` that shares
        ``df``'s index. The input frame is left unmodified.
    """
    # Build the result from a derived Series instead of assigning a new column
    # on `df`, so the caller's batch is not mutated as a side effect.
    best_class = df["predictions"].map(lambda x: np.asarray(x).argmax())
    return best_class.rename("prediction").to_frame()

# Apply the logits-to-class conversion to every batch, as pandas DataFrames.
predictions = outputs.map_batches(
    convert_logits_to_classes, batch_format="pandas"
)

# Show one record of the predicted classes.
predictions.show(1)

{'prediction': 3}


In [14]:
def calculate_prediction_scores(df):
    """Flag, row by row, whether the predicted class equals the true label.

    Args:
        df: A ``pandas.DataFrame`` containing at least ``"prediction"`` and
            ``"label"`` columns.

    Returns:
        A new DataFrame with the columns ``"prediction"``, ``"label"`` and a
        boolean ``"correct"`` column. The input frame is left unmodified.
    """
    # `assign` returns a copy, so the caller's batch does not gain a "correct"
    # column as a side effect.
    scored = df.assign(correct=df["prediction"] == df["label"])
    return scored[["prediction", "label", "correct"]]

# Combine the test dataset with the predictions, then mark each row as
# correct or incorrect.
scores = test_dataset.zip(predictions).map_batches(calculate_prediction_scores)

# Show one scored record.
scores.show(1)

{'prediction': 3, 'label': 3, 'correct': True}


In [15]:
# Accuracy: the sum of the boolean "correct" column divided by the row count.
scores.sum(on="correct") / scores.count()

0.5606

## What's next

TODO