# Hello Image Data

## Before you begin

## Load and normalize CIFAR-10

In [None]:

import ray
from ray.data.datasource import SimpleTorchDatasource
import torchvision
import torchvision.transforms as transforms

# Preprocessing applied to every image: convert to a tensor, then normalize
# each of the three channels with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

def train_dataset_factory():
    """Build the CIFAR-10 dataset with ``train=True``.

    The dataset is rooted at ``./data``, is created with ``download=True``,
    and applies the module-level ``transform`` to each image.
    """
    cifar10_train = torchvision.datasets.CIFAR10(
        root="./data",
        train=True,
        transform=transform,
        download=True,
    )
    return cifar10_train

def test_dataset_factory():
    """Build the CIFAR-10 dataset with ``train=False``.

    The dataset is rooted at ``./data``, is created with ``download=True``,
    and applies the module-level ``transform`` to each image.
    """
    cifar10_test = torchvision.datasets.CIFAR10(
        root="./data",
        train=False,
        transform=transform,
        download=True,
    )
    return cifar10_test

# Read both splits through SimpleTorchDatasource, passing the matching
# factory function as ``dataset_factory``.
train_dataset = ray.data.read_datasource(
    SimpleTorchDatasource(),
    dataset_factory=train_dataset_factory,
)
test_dataset = ray.data.read_datasource(
    SimpleTorchDatasource(),
    dataset_factory=test_dataset_factory,
)

train_dataset

In [None]:
import pandas as pd


def convert_batch_to_pandas(batch):
    """Turn a batch of ``(image, label)`` pairs into a two-column DataFrame.

    Args:
        batch: Re-iterable collection of 2-item ``(image, label)`` pairs.

    Returns:
        A ``pandas.DataFrame`` with an "image" column and a "label" column,
        one row per pair, in the order the pairs appear in ``batch``.
    """
    # One pass over the batch per output column.
    image_column = [img for img, _lbl in batch]
    label_column = [lbl for _img, lbl in batch]
    return pd.DataFrame({"image": image_column, "label": label_column})
    

# Convert every batch of both datasets to the image/label DataFrame layout.
# NOTE: both names are rebound to the converted datasets, so re-running this
# cell without first re-running the loading cell applies the conversion to
# already-converted data.
train_dataset = train_dataset.map_batches(
    convert_batch_to_pandas,
)
test_dataset = test_dataset.map_batches(
    convert_batch_to_pandas,
)

train_dataset

## Train a convolutional neural network

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small convolutional classifier: two conv/pool stages and three linear layers.

    The first linear layer takes ``16 * 5 * 5`` flattened features and the
    last one emits 10 values per sample. With 5x5 convolutions and 2x2
    pooling, that feature count corresponds to 3-channel 32x32 inputs.
    """

    def __init__(self):
        super().__init__()
        # Layers are registered in the same order they are used in forward().
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the 10 raw output scores for each sample in ``x``."""
        # Two conv -> ReLU -> max-pool stages; the pooling layer is shared.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))

        # Keep the batch dimension, collapse everything else.
        features = x.flatten(1)

        hidden = F.relu(self.fc1(features))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [None]:
from ray import train
import torch.optim as optim


def train_loop_per_worker(config):
    """Train ``Net`` on this worker's shard of the "train" dataset.

    Args:
        config: Dict of hyperparameters.
            ``config["batch_size"]`` (required) is the mini-batch size passed
            to ``to_torch``.
            ``config["num_epochs"]`` (optional, default 2) is the number of
            passes made over the shard.

    Side effects:
        Prints the mean loss of every 2000 mini-batches and saves a checkpoint
        containing the model's ``state_dict`` at the end of every epoch.
    """
    model = train.torch.prepare_model(Net())

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    train_dataset_shard = train.get_dataset_shard("train").to_torch(
        feature_columns=["image"],
        label_column="label",
        batch_size=config["batch_size"],
        unsqueeze_feature_tensors=False,
        unsqueeze_label_tensor=False
    )

    for epoch in range(config.get("num_epochs", 2)):
        running_loss = 0.0
        for i, data in enumerate(train_dataset_shard):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
                running_loss = 0.0

        # ``.module`` only exists when the prepared model is a wrapper (such as
        # DistributedDataParallel). Fall back to the model itself so the loop
        # also works when prepare_model returns it unwrapped.
        unwrapped_model = getattr(model, "module", model)
        train.save_checkpoint(model=unwrapped_model.state_dict())

In [None]:
from ray.ml.train.integrations.torch import TorchTrainer

# Run train_loop_per_worker on two workers, feeding it the "train" dataset
# and a mini-batch size of 2.
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=dict(num_workers=2),
    datasets=dict(train=train_dataset),
    train_loop_config=dict(batch_size=2),
)

# Keep the checkpoint from the fit result for use in later cells.
result = trainer.fit()
latest_checkpoint = result.checkpoint

## Test the network on the test data

In [None]:
from ray.ml.predictors.integrations.torch import TorchPredictor
from ray.ml.batch_predictor import BatchPredictor

# Build a batch predictor from the training checkpoint, supplying a fresh
# Net() instance as the model.
batch_predictor = BatchPredictor.from_checkpoint(
    predictor_cls=TorchPredictor,
    checkpoint=latest_checkpoint,
    model=Net(),
)

# Predict on the "image" column of the test dataset.
outputs = batch_predictor.predict(
    data=test_dataset,
    feature_columns=["image"],
    unsqueeze=False,
)

outputs.show(1)


In [None]:
import numpy as np

def convert_logits_to_classes(df):
    """Reduce each row's logits to the index of its largest value.

    Args:
        df: DataFrame with a "predictions" column whose entries are
            array-like sequences of scores.

    Returns:
        A new single-column DataFrame named "prediction", sharing ``df``'s
        index, holding the argmax of each row's scores. ``df`` itself is left
        unmodified (no column is added to the caller's frame).
    """
    best_class = df["predictions"].map(lambda logits: np.array(logits).argmax())
    # Build the result from the Series rather than writing into ``df``.
    return best_class.to_frame(name="prediction")

# Replace each batch of raw model outputs with its predicted class index.
predictions = outputs.map_batches(convert_logits_to_classes, batch_format="pandas")

predictions.show(1)

In [None]:
def calculate_prediction_scores(df):
    """Flag each row whose predicted class equals its label.

    Args:
        df: DataFrame with "prediction" and "label" columns.

    Returns:
        A new DataFrame with columns "prediction", "label" and "correct", in
        that order, where "correct" is ``prediction == label`` per row. ``df``
        itself is left unmodified (no column is added to the caller's frame).
    """
    # Select first, then attach the comparison, so nothing is written to ``df``.
    return df[["prediction", "label"]].assign(
        correct=df["prediction"] == df["label"]
    )

# Pair every test row with its prediction, then score each pair.
# calculate_prediction_scores indexes DataFrame columns, so pandas batches are
# requested explicitly, matching the map_batches call that builds `predictions`.
scores = test_dataset.zip(predictions).map_batches(
    calculate_prediction_scores, batch_format="pandas"
)

scores.show(1)

In [None]:
# Accuracy: the sum of the "correct" column divided by the dataset's count.
scores.sum(on="correct") / scores.count()

## Deploying the network and making a prediction

In [None]:
from ray import serve
from ray.serve.model_wrappers import ModelWrapperDeployment

# TODO

## What's next?