In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

In [2]:
# Build the FashionMNIST training and test splits. Both are stored under
# ./data (downloaded if requested files are missing) and every image is
# converted with ToTensor(). The only difference between them is `train=`.
training_data, test_data = (
    datasets.FashionMNIST(
        root="data",
        train=is_train_split,
        download=True,
        transform=ToTensor(),
    )
    for is_train_split in (True, False)
)

In [3]:
# Number of samples per mini-batch, shared by both loaders.
batch_size = 64

# Wrap each split in a DataLoader that yields batches of `batch_size` samples.
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size)
    for split in (training_data, test_data)
)

In [4]:
# Training device. It is hard-coded to the CPU here (no GPU detection is done).
device = "cpu"
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: 784 inputs -> 512 -> 512 -> 10 logits."""

    def __init__(self):
        super().__init__()
        # Collapses everything after the batch dimension into one axis.
        self.flatten = nn.Flatten()
        hidden_width = 512
        layers = [
            nn.Linear(28 * 28, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for the batch ``x``."""
        return self.linear_relu_stack(self.flatten(x))

# Instantiate the network, place it on `device`, and print its layer summary.
model = NeuralNetwork().to(device)
print(model)

Using cpu device
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [5]:
# Cross-entropy loss on the model's logits, optimised with plain SGD
# (learning rate 1e-3) over all of the model's parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [6]:
def train_epoch(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Every batch is moved to the notebook-level ``device``, its loss is
    back-propagated and ``optimizer`` takes one step. For every 100th batch
    the current loss and the number of samples consumed so far are printed.
    """
    total_samples = len(dataloader.dataset)
    model.train()
    for step, (inputs, targets) in enumerate(dataloader):
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass and loss for this batch.
        batch_loss = loss_fn(model(inputs), targets)

        # Clear stale gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if step % 100 != 0:
            continue
        seen = step * len(inputs)
        print(f"loss: {batch_loss.item():>7f}  [{seen:>5d}/{total_samples:>5d}]")

In [7]:
def test_epoch(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Prints the accuracy (correct predictions over the dataset size) and the
    loss averaged over batches, then returns that average loss.
    """
    sample_count = len(dataloader.dataset)
    batch_count = len(dataloader)
    model.eval()
    loss_total = 0
    hits = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            scores = model(inputs)
            loss_total += loss_fn(scores, targets).item()
            # A prediction is correct when the highest-scoring class equals the label.
            hits += (scores.argmax(1) == targets).type(torch.float).sum().item()
    mean_loss = loss_total / batch_count
    accuracy = hits / sample_count
    print(f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {mean_loss:>8f} \n")
    return mean_loss

In [8]:
# Alternate one training pass and one evaluation pass per epoch.
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_epoch(train_dataloader, model, loss_fn, optimizer)
    # The loss returned by test_epoch is not used here; its printed summary is the output.
    test_epoch(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------


loss: 2.295725  [    0/60000]
loss: 2.288331  [ 6400/60000]
loss: 2.269881  [12800/60000]
loss: 2.267963  [19200/60000]
loss: 2.246048  [25600/60000]
loss: 2.210479  [32000/60000]
loss: 2.222671  [38400/60000]
loss: 2.175079  [44800/60000]
loss: 2.179878  [51200/60000]
loss: 2.139994  [57600/60000]
Test Error: 
 Accuracy: 40.2%, Avg loss: 2.138721 

Epoch 2
-------------------------------
loss: 2.147133  [    0/60000]
loss: 2.136176  [ 6400/60000]
loss: 2.081316  [12800/60000]
loss: 2.098356  [19200/60000]
loss: 2.039676  [25600/60000]
loss: 1.977533  [32000/60000]
loss: 2.004722  [38400/60000]
loss: 1.917204  [44800/60000]
loss: 1.934661  [51200/60000]
loss: 1.839513  [57600/60000]
Test Error: 
 Accuracy: 59.8%, Avg loss: 1.847508 

Epoch 3
-------------------------------
loss: 1.884295  [    0/60000]
loss: 1.844624  [ 6400/60000]
loss: 1.736727  [12800/60000]
loss: 1.773564  [19200/60000]
loss: 1.658612  [25600/60000]
loss: 1.621730  [32000/60000]
loss: 1.634687  [38400/60000]
loss: 

In [9]:
def train_func():
    """Train a fresh ``NeuralNetwork`` for five epochs in the current process.

    Bundles the loader, model, loss and optimizer set-up into one callable.
    The datasets are still read from the notebook-level ``training_data`` and
    ``test_data`` variables.
    """
    samples_per_batch = 64
    learning_rate = 1e-3
    num_epochs = 5

    # One loader per split, both with the same batch size.
    train_loader = DataLoader(training_data, batch_size=samples_per_batch)
    eval_loader = DataLoader(test_data, batch_size=samples_per_batch)

    # Used here to place the model; it is not passed on to the epoch helpers.
    device = "cpu"
    print(f"Using {device} device")

    model = NeuralNetwork().to(device)
    print(model)

    criterion = nn.CrossEntropyLoss()
    sgd = torch.optim.SGD(model.parameters(), lr=learning_rate)

    for epoch_number in range(1, num_epochs + 1):
        print(f"Epoch {epoch_number}\n-------------------------------")
        train_epoch(train_loader, model, criterion, sgd)
        test_epoch(eval_loader, model, criterion)

    print("Done!")

In [10]:
# Run the self-contained, single-process training function.
train_func()

Using cpu device
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)
Epoch 1
-------------------------------
loss: 2.304617  [    0/60000]
loss: 2.285371  [ 6400/60000]
loss: 2.271165  [12800/60000]
loss: 2.259961  [19200/60000]
loss: 2.240443  [25600/60000]
loss: 2.215926  [32000/60000]
loss: 2.221008  [38400/60000]
loss: 2.191331  [44800/60000]
loss: 2.185502  [51200/60000]
loss: 2.148566  [57600/60000]
Test Error: 
 Accuracy: 46.2%, Avg loss: 2.147187 

Epoch 2
-------------------------------
loss: 2.163097  [    0/60000]
loss: 2.146952  [ 6400/60000]
loss: 2.094553  [12800/60000]
loss: 2.103191  [19200/60000]
loss: 2.054648  [25600/60000]
loss: 1.997787  [32000/60000]
loss: 2.013405  [38400/60000]
loss: 1.942909  [44800

In [11]:
import ray.train as train
from ray.air import session

In [12]:
def train_epoch(dataloader, model, loss_fn, optimizer):
    """Run one silent optimisation pass over ``dataloader``.

    Batches are fed to the model exactly as the loader yields them, so the
    loader is expected to deliver tensors already on the model's device.
    Nothing is printed and nothing is returned; ``model`` is left in
    training mode with its parameters updated in place.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` batches.
        model: Module to optimise.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar loss.
        optimizer: Optimizer bound to ``model``'s parameters.
    """
    model.train()
    for X, y in dataloader:
        pred = model(X)
        loss = loss_fn(pred, y)

        # Clear stale gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [13]:
def test_epoch(dataloader, model, loss_fn):
    size = len(dataloader.dataset) // session.get_world_size()  # Divide by word size
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    # print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss

In [14]:
import ray.train as train
from ray.air import session

def train_func(config: dict):
    """Per-worker training loop executed by Ray Train.

    Args:
        config: Hyper-parameters with the keys ``"batch_size"`` (global batch
            size, divided evenly between the workers), ``"lr"`` and
            ``"epochs"``.

    After every epoch the evaluation loss is reported together with a
    checkpoint holding the epoch index and the model ``state_dict``. The
    datasets are read from the notebook-level ``training_data`` and
    ``test_data`` variables.
    """
    # Function-scope import, executed once per call instead of once per epoch.
    from ray.air import Checkpoint

    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["epochs"]

    # Each worker processes an equal share of the global batch.
    batch_size_per_worker = batch_size // session.get_world_size()

    # Create data loaders.
    train_dataloader = DataLoader(training_data, batch_size=batch_size_per_worker)
    test_dataloader = DataLoader(test_data, batch_size=batch_size_per_worker)

    # Let Ray Train adapt the loaders for distributed execution.
    train_dataloader = train.torch.prepare_data_loader(train_dataloader)
    test_dataloader = train.torch.prepare_data_loader(test_dataloader)

    model = NeuralNetwork()
    # Let Ray Train place and wrap the model for distributed training.
    model = train.torch.prepare_model(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for t in range(epochs):
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        test_loss = test_epoch(test_dataloader, model, loss_fn)
        checkpoint = Checkpoint.from_dict(
            dict(epoch=t, model=model.state_dict())
        )
        session.report(dict(loss=test_loss), checkpoint=checkpoint)
    print("Done!")

In [15]:
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig


# Run train_func on two CPU-only Ray Train workers. `train_loop_config` is the
# dict that train_func receives as its `config` argument.
trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
    scaling_config=ScalingConfig(num_workers=2, use_gpu=False),
)
# Execute the training run and show the metrics attached to its result.
result = trainer.fit()
print(f"Last result: {result.metrics}")

2023-07-03 19:27:46,045	INFO worker.py:1627 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2023-07-03 19:27:48,703	INFO tune.py:226 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Trainer(...)`.


0,1
Current time:,2023-07-03 19:29:02
Running for:,00:01:14.19
Memory:,3.6/7.7 GiB

Trial name,status,loc,iter,total time (s),loss
TorchTrainer_41ed3_00000,TERMINATED,172.26.215.93:882230,4,66.8926,1.20098


[2m[36m(TorchTrainer pid=882230)[0m 2023-07-03 19:27:56,665	INFO backend_executor.py:137 -- Starting distributed worker processes: ['882300 (172.26.215.93)', '882301 (172.26.215.93)']
[2m[36m(RayTrainWorker pid=882300)[0m 2023-07-03 19:27:58,741	INFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=2]
[2m[36m(RayTrainWorker pid=882300)[0m 2023-07-03 19:27:59,299	INFO train_loop_utils.py:286 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=882300)[0m 2023-07-03 19:27:59,300	INFO train_loop_utils.py:346 -- Wrapping provided model in DistributedDataParallel.


Trial name,date,done,experiment_tag,hostname,iterations_since_restore,loss,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id
TorchTrainer_41ed3_00000,2023-07-03_19-29-00,True,0,DESKTOP-0P789CI,4,1.20098,172.26.215.93,882230,66.8926,15.2581,66.8926,1688380140,4,41ed3_00000


[2m[36m(RayTrainWorker pid=882301)[0m Done!


2023-07-03 19:29:02,969	INFO tune.py:1111 -- Total run time: 74.26 seconds (74.18 seconds for the tuning loop).


Last result: {'loss': 1.2009836940249061, 'timestamp': 1688380140, 'time_this_iter_s': 15.25805377960205, 'done': True, 'training_iteration': 4, 'trial_id': '41ed3_00000', 'date': '2023-07-03_19-29-00', 'time_total_s': 66.8925530910492, 'pid': 882230, 'hostname': 'DESKTOP-0P789CI', 'node_ip': '172.26.215.93', 'config': {'train_loop_config': {'lr': 0.001, 'batch_size': 64, 'epochs': 4}}, 'time_since_restore': 66.8925530910492, 'iterations_since_restore': 4, 'experiment_tag': '0'}


In [16]:
from ray.air import Checkpoint

def load_data():
    """Return ``(training_data, test_data)`` for FashionMNIST.

    Both splits are stored under ``data`` (with ``download=True``) and use
    ``ToTensor()`` as their transform.
    """
    def _split(train):
        # One FashionMNIST split; `train` selects the training or the test set.
        return datasets.FashionMNIST(
            root="data",
            train=train,
            download=True,
            transform=ToTensor(),
        )

    training_data = _split(True)
    test_data = _split(False)
    return training_data, test_data


def train_func(config: dict):
    """Per-worker Ray Train loop that loads its own copy of the data.

    ``config`` supplies ``"batch_size"`` (global, split between the workers),
    ``"lr"`` and ``"epochs"``. After each epoch the evaluation loss and a
    checkpoint (epoch index plus model ``state_dict``) are reported.
    """
    global_batch_size, learning_rate, num_epochs = (
        config["batch_size"],
        config["lr"],
        config["epochs"],
    )

    per_worker_batch = global_batch_size // session.get_world_size()

    # Fetch the datasets inside the worker instead of relying on notebook globals.
    train_split, eval_split = load_data()

    train_loader = DataLoader(train_split, batch_size=per_worker_batch)
    eval_loader = DataLoader(eval_split, batch_size=per_worker_batch)

    # Let Ray Train adapt the loaders for distributed execution.
    train_loader = train.torch.prepare_data_loader(train_loader)
    eval_loader = train.torch.prepare_data_loader(eval_loader)

    net = train.torch.prepare_model(NeuralNetwork())

    criterion = nn.CrossEntropyLoss()
    sgd = torch.optim.SGD(net.parameters(), lr=learning_rate)

    for epoch_index in range(num_epochs):
        train_epoch(train_loader, net, criterion, sgd)
        eval_loss = test_epoch(eval_loader, net, criterion)
        snapshot = Checkpoint.from_dict(
            {"epoch": epoch_index, "model": net.state_dict()}
        )
        session.report({"loss": eval_loss}, checkpoint=snapshot)

    print("Done!")

In [17]:
# Same trainer configuration as the previous run; `train_func` now refers to
# the redefinition that calls load_data() inside each worker.
trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
    scaling_config=ScalingConfig(num_workers=2, use_gpu=False),
)
result = trainer.fit()

0,1
Current time:,2023-07-03 19:39:05
Running for:,00:01:37.70
Memory:,3.6/7.7 GiB

Trial name,status,loc,iter,total time (s),loss
TorchTrainer_9b5e4_00000,TERMINATED,172.26.215.93:882712,4,92.1065,1.25864


[2m[36m(TorchTrainer pid=882712)[0m 2023-07-03 19:37:33,985	INFO backend_executor.py:137 -- Starting distributed worker processes: ['882758 (172.26.215.93)', '882759 (172.26.215.93)']
[2m[36m(RayTrainWorker pid=882758)[0m 2023-07-03 19:37:35,431	INFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=2]


[2m[36m(RayTrainWorker pid=882759)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
[2m[36m(RayTrainWorker pid=882300)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
[2m[36m(RayTrainWorker pid=882759)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/26421880 [00:00<?, ?it/s]
  0%|          | 32768/26421880 [00:00<03:20, 131516.75it/s]
  0%|          | 65536/26421880 [00:00<03:14, 135401.48it/s]
  2%|▏         | 425984/26421880 [00:05<03:35, 120786.28it/s]
 62%|██████▏   | 16252928/26421880 [00:05<00:03, 3136056.27it/s][32m [repeated 41x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
 91%|█████████▏| 24150016/26421880 [00:07<00:00, 3161171.50it/s]
 93%|█████████▎| 24576000/26421880 [00:07<00:00, 3422442.83it/s]
 94%|█████████▍| 24936448/26421880 [00:08<00:00, 3194646.45it/s]
 96%|█████████▌| 25329664/26421880 [00:08<00:00, 3354793.09it/s]
 97%|█████████▋| 25722880/26421880 [00:08<00:00, 3167942.36it/s]
100%|██████████| 26421880/26421880 [00:08<00:00, 3130820.34it/s]


[2m[36m(RayTrainWorker pid=882758)[0m Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=882758)[0m Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=882758)[0m Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=882758)[0m 
[2m[36m(RayTrainWorker pid=882758)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
[2m[36m(RayTrainWorker pid=882758)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/29515 [00:00<?, ?it/s]
  5%|▍         | 1277952/26421880 [00:10<01:49, 230057.44it/s][32m [repeated 43x across cluster][0m
100%|██████████| 29515/29515 [00:00<00:00, 59517.37it/s]


[2m[36m(RayTrainWorker pid=882758)[0m Extracting data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=882758)[0m 
[2m[36m(RayTrainWorker pid=882758)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
[2m[36m(RayTrainWorker pid=882758)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/4422102 [00:00<?, ?it/s]
 21%|██        | 5537792/26421880 [00:15<00:12, 1713041.40it/s][32m [repeated 47x across cluster][0m
 54%|█████▍    | 14286848/26421880 [00:20<00:06, 1961187.41it/s][32m [repeated 62x across cluster][0m
 92%|█████████▏| 4063232/4422102 [00:09<00:00, 1338036.13it/s]
 99%|█████████▊| 4358144/4422102 [00:09<00:00, 1627569.28it/s]


[2m[36m(RayTrainWorker pid=882758)[0m Extracting data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=882758)[0m 
[2m[36m(RayTrainWorker pid=882758)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz


100%|██████████| 4422102/4422102 [00:09<00:00, 464521.46it/s] 


[2m[36m(RayTrainWorker pid=882758)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz
[2m[36m(RayTrainWorker pid=882758)[0m Extracting data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=882758)[0m 


100%|██████████| 5148/5148 [00:00<00:00, 12096513.72it/s]
[2m[36m(RayTrainWorker pid=882758)[0m 2023-07-03 19:37:58,896	INFO train_loop_utils.py:286 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=882758)[0m 2023-07-03 19:37:58,897	INFO train_loop_utils.py:346 -- Wrapping provided model in DistributedDataParallel.
 84%|████████▍ | 22282240/26421880 [00:25<00:02, 1594864.60it/s][32m [repeated 37x across cluster][0m
 92%|█████████▏| 24182784/26421880 [00:26<00:01, 1368264.77it/s]
 92%|█████████▏| 24412160/26421880 [00:27<00:01, 1473449.02it/s]
 93%|█████████▎| 24576000/26421880 [00:27<00:01, 1439784.03it/s]
 94%|█████████▍| 24805376/26421880 [00:27<00:01, 1552220.03it/s]
 95%|█████████▍| 24969216/26421880 [00:27<00:00, 1478060.73it/s]
 95%|█████████▌| 25165824/26421880 [00:27<00:00, 1558619.38it/s]
 96%|█████████▌| 25329664/26421880 [00:27<00:00, 1475959.70it/s]
 97%|█████████▋| 25559040/26421880 [00:27<00:00, 1591292.05it/s]
 97%|█████████▋| 25722880/26421880 [00:27<00

[2m[36m(RayTrainWorker pid=882759)[0m Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=882759)[0m 
[2m[36m(RayTrainWorker pid=882759)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
[2m[36m(RayTrainWorker pid=882759)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/29515 [00:00<?, ?it/s]
100%|██████████| 29515/29515 [00:00<00:00, 114225.74it/s]


[2m[36m(RayTrainWorker pid=882759)[0m Extracting data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=882759)[0m 
[2m[36m(RayTrainWorker pid=882759)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
[2m[36m(RayTrainWorker pid=882759)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/4422102 [00:00<?, ?it/s]
 91%|█████████ | 24018944/26421880 [00:26<00:01, 1486832.38it/s][32m [repeated 9x across cluster][0m
100%|██████████| 4422102/4422102 [00:02<00:00, 1984761.74it/s]


[2m[36m(RayTrainWorker pid=882759)[0m Extracting data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=882759)[0m 
[2m[36m(RayTrainWorker pid=882759)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
[2m[36m(RayTrainWorker pid=882759)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5148/5148 [00:00<00:00, 19522854.42it/s]


[2m[36m(RayTrainWorker pid=882759)[0m Extracting data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=882759)[0m 


Trial name,date,done,experiment_tag,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id
TorchTrainer_9b5e4_00000,2023-07-03_19-39-03,True,0,DESKTOP-0P789CI,4,1.25864,172.26.215.93,882712,True,92.1065,14.8636,92.1065,1688380743,4,9b5e4_00000


[2m[36m(RayTrainWorker pid=882759)[0m Done!


2023-07-03 19:39:05,998	INFO tune.py:1111 -- Total run time: 97.73 seconds (97.70 seconds for the tuning loop).


In [18]:
# Show the metrics and the checkpoint attached to the training result.
print(f"Last result: {result.metrics}")
print(f"Checkpoint: {result.checkpoint}")

Last result: {'loss': 1.258639681491123, 'timestamp': 1688380743, 'time_this_iter_s': 14.863616228103638, 'should_checkpoint': True, 'done': True, 'training_iteration': 4, 'trial_id': '9b5e4_00000', 'date': '2023-07-03_19-39-03', 'time_total_s': 92.10650444030762, 'pid': 882712, 'hostname': 'DESKTOP-0P789CI', 'node_ip': '172.26.215.93', 'config': {'train_loop_config': {'lr': 0.001, 'batch_size': 64, 'epochs': 4}}, 'time_since_restore': 92.10650444030762, 'iterations_since_restore': 4, 'experiment_tag': '0'}
Checkpoint: TorchCheckpoint(local_path=/home/seokj/ray_results/TorchTrainer_2023-07-03_19-37-28/TorchTrainer_9b5e4_00000_0_2023-07-03_19-37-28/checkpoint_000003)


In [19]:
def predict_from_model(model):
    """Classify the first test sample and print predicted vs. actual label.

    Reads the notebook-level ``test_data``; the model is put into evaluation
    mode and run without gradient tracking.
    """
    # Class names, indexed by the integer label.
    classes = [
        "T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
        "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot",
    ]

    model.eval()
    sample, label = test_data[0]
    with torch.no_grad():
        scores = model(sample)
    # Row 0 of the output holds the class scores for the single input sample.
    predicted = classes[scores[0].argmax(0)]
    actual = classes[label]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')

In [20]:
from ray.train.torch import TorchCheckpoint

# Rebuild the trained network from the training result's checkpoint. A fresh
# NeuralNetwork() is handed to get_model (presumably as the module that
# receives the saved weights; confirm against the TorchCheckpoint docs).
model = TorchCheckpoint.from_checkpoint(result.checkpoint).get_model(NeuralNetwork())

predict_from_model(model)

Predicted: "Ankle boot", Actual: "Ankle boot"


In [21]:
# Class names, indexed by the integer label.
classes = [
    "T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
    "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot",
]


def predict_from_model(model, data):
    """Print the predicted and actual class name for every ``(x, y)`` in ``data``.

    The model is put into evaluation mode and run without gradient tracking;
    row 0 of each output is taken as the class scores of the sample.
    """
    model.eval()
    with torch.no_grad():
        for sample, label in data:
            top_class = model(sample)[0].argmax(0)
            print(f'Predicted: "{classes[top_class]}", Actual: "{classes[label]}"')

In [22]:
# Classify the first ten test samples.
predict_from_model(model, [test_data[i] for i in range(10)])

Predicted: "Ankle boot", Actual: "Ankle boot"
Predicted: "Pullover", Actual: "Pullover"
Predicted: "Trouser", Actual: "Trouser"
Predicted: "Trouser", Actual: "Trouser"
Predicted: "Pullover", Actual: "Shirt"
Predicted: "Trouser", Actual: "Trouser"
Predicted: "Pullover", Actual: "Coat"
Predicted: "Coat", Actual: "Shirt"
Predicted: "Sneaker", Actual: "Sandal"
Predicted: "Sneaker", Actual: "Sneaker"


In [23]:
from ray.train.batch_predictor import BatchPredictor
from ray.train.torch import TorchPredictor

# Batch predictor built from the training checkpoint, with a fresh
# NeuralNetwork() instance supplied as the `model` argument.
batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, TorchPredictor, model=NeuralNetwork())

In [29]:
# Display the summary of the test split.
# NOTE(review): this cell's execution count (29) is out of sequence with its
# neighbours; re-run the notebook top to bottom before sharing it.
test_data

Dataset FashionMNIST
    Number of datapoints: 10000
    Root location: data
    Split: Test
    StandardTransform
Transform: ToTensor()

In [24]:
import ray.data

# One dataset item per test image, converted to a NumPy array; the labels are
# dropped. `parallelism=8` presumably sets the number of blocks (the progress
# bars below count 8); confirm against the Ray Data docs.
ds = ray.data.from_items([x.numpy() for x, y in test_data], parallelism=8)


Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode[0m


In [25]:
# Score the dataset with the batch predictor: batches of 32 items and
# `min_scoring_workers=2`.
results = batch_predictor.predict(ds, batch_size=32, min_scoring_workers=2)

In [26]:
# Print a sample of prediction records; each holds the 10 raw class scores of one image.
results.show()

2023-07-03 19:39:07,804	INFO dataset.py:2087 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2023-07-03 19:39:07,812	INFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[MapBatches(ScoringWrapper)]
2023-07-03 19:39:07,813	INFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-07-03 19:39:07,816	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
2023-07-03 19:39:07,888	INFO actor_pool_map_operator.py:114 -- MapBatches(ScoringWrapper): Waiting for 2 pool actors to start...


Running 0:   0%|          | 0/8 [00:00<?, ?it/s]

2023-07-03 19:39:11,979	INFO streaming_executor.py:149 -- Shutting down <StreamingExecutor(Thread-472, started daemon 140284872603200)>.


{'predictions': array([-1.4564618 , -1.6455319 , -0.5546917 , -1.4862297 , -0.67328656,
        1.3009992 , -0.76655257,  1.6294042 ,  1.2306256 ,  2.0893838 ],
      dtype=float32)}
{'predictions': array([ 0.529662  , -2.3841782 ,  2.6290684 , -1.1644136 ,  2.4346154 ,
       -1.7986703 ,  1.8237766 , -2.2549539 ,  0.91606885, -1.2686421 ],
      dtype=float32)}
{'predictions': array([ 1.6924237 ,  3.7445989 , -0.16596645,  2.7232094 ,  0.37961888,
       -1.8623419 ,  0.5047344 , -2.215086  , -1.457871  , -2.498712  ],
      dtype=float32)}
{'predictions': array([ 1.1999766 ,  2.8721836 , -0.22619268,  2.152885  ,  0.2031754 ,
       -1.3261802 ,  0.32827246, -1.6174845 , -1.1585081 , -1.8407434 ],
      dtype=float32)}
{'predictions': array([ 0.70811534, -0.9485282 ,  1.0355327 , -0.23830125,  0.9868727 ,
       -0.96967053,  0.9683471 , -1.3278077 ,  0.3286457 , -0.7888265 ],
      dtype=float32)}
{'predictions': array([ 1.6928362 ,  2.575294  ,  0.2620929 ,  2.0942364 ,  0.6812103

In [27]:
# Turn every score vector into a class name: argmax gives the index of the
# highest score, which is then looked up in `classes`.
predicted_classes = results.map_batches(
    lambda batch: {"pred": [classes[pred.argmax(0)] for pred in batch["predictions"]]}, 
    batch_size=32,
    batch_format="pandas")

In [28]:
# Predictions are paired with labels purely by position, so the dataset must
# come back in input order. The execution options logged by the earlier runs
# show preserve_order=False, so request ordered output before the pipeline
# is executed by take_batch().
ray.data.DataContext.get_current().execution_options.preserve_order = True

real_classes = [classes[y] for x, y in test_data]
# take_batch() returns only the first batch of rows; zip stops at its length.
for predicted, real in zip(predicted_classes.take_batch()["pred"], real_classes):
    print((predicted, real))

2023-07-03 19:43:26,920	INFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[MapBatches(ScoringWrapper)] -> TaskPoolMapOperator[MapBatches(<lambda>)]
2023-07-03 19:43:26,921	INFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-07-03 19:43:26,922	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
2023-07-03 19:43:26,970	INFO actor_pool_map_operator.py:114 -- MapBatches(ScoringWrapper): Waiting for 2 pool actors to start...


Running 0:   0%|          | 0/8 [00:00<?, ?it/s]

2023-07-03 19:43:32,080	INFO streaming_executor.py:149 -- Shutting down <StreamingExecutor(Thread-528, started daemon 140284855817792)>.


('Ankle boot', 'Ankle boot')
('Pullover', 'Pullover')
('Trouser', 'Trouser')
('Trouser', 'Trouser')
('Pullover', 'Shirt')
('Trouser', 'Trouser')
('Pullover', 'Coat')
('Coat', 'Shirt')
('Sneaker', 'Sandal')
('Sneaker', 'Sneaker')
('Pullover', 'Coat')
('Ankle boot', 'Sandal')
('Sneaker', 'Sneaker')
('Dress', 'Dress')
('Coat', 'Coat')
('Trouser', 'Trouser')
('Pullover', 'Pullover')
('Pullover', 'Coat')
('Bag', 'Bag')
('T-shirt/top', 'T-shirt/top')
