<a href="https://colab.research.google.com/github/secutron/Practice_Ignite/blob/main/2_1_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 기본 테스트

- osmu 테스트
- MNIST + Custom NN --> CIFAR10 + resnet18


In [None]:
import os

# Detect the accelerator from the environment variables this notebook inspects.
# COLAB_GPU is read as an integer and a value > 0 is treated as "GPU present".
# The variable may be missing (os.environ.get would return None, which int()
# rejects with a TypeError), so default to "0"; a non-numeric value is treated
# as "no GPU" instead of aborting the cell.
try:
    gpu_gtg = int(os.environ.get("COLAB_GPU", "0")) > 0
except ValueError:
    gpu_gtg = False

# The mere presence of COLAB_TPU_ADDR is treated as "TPU present".
tpu_gtg = "COLAB_TPU_ADDR" in os.environ

# Only when a TPU was detected: download and run the PyTorch/XLA environment
# setup script with a fixed version string.
if tpu_gtg: # tpu
    print("TPU")
    # Disabled alternative: VERSION = "nightly"

    # Version string handed to the setup script below; background:
    # https://github.com/pytorch/builder/pull/750
    VERSION = "20210304" # was "20200607"

    # IPython shell escapes: fetch the setup script, then run it with the
    # Python variable VERSION interpolated into the command line.
    !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
    !python pytorch-xla-env-setup.py --version $VERSION

TPU
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  5116  100  5116    0     0  38466      0 --:--:-- --:--:-- --:--:-- 38466
Updating... This may take around 2 minutes.
Updating TPU runtime to pytorch-dev20210304 ...
Collecting cloud-tpu-client
  Downloading https://files.pythonhosted.org/packages/56/9f/7b1958c2886db06feb5de5b2c191096f9e619914b6c31fdf93999fdbbd8b/cloud_tpu_client-0.10-py3-none-any.whl
Collecting google-api-python-client==1.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/9a/b4/a955f393b838bc47cbb6ae4643b9d0f90333d3b4db4dc1e819f36aad18cc/google_api_python_client-1.8.0-py3-none-any.whl (57kB)
[K     |████████████████████████████████| 61kB 3.6MB/s 
Uninstalling torch-1.9.0+cu102:
[31mERROR: earthengine-api 0.1.269 has requirement google-api-python-client<2,>=1.12.1, but you'll have google-api-python-client 1.8.0 which is incompatible.[0m
In

In [None]:
# Install PyTorch-Ignite, allowing pre-release builds (--pre). No version is
# pinned, so the installed build depends on when this cell is executed.
!pip install --pre pytorch-ignite

Collecting pytorch-ignite
[?25l  Downloading https://files.pythonhosted.org/packages/6d/f0/eb4def59742c097b10fa042eafe6134e54c9243116f8417dd2722944cbea/pytorch_ignite-0.5.0.dev20210705-py3-none-any.whl (227kB)
[K     |█▍                              | 10kB 14.2MB/s eta 0:00:01[K     |██▉                             | 20kB 20.1MB/s eta 0:00:01[K     |████▎                           | 30kB 10.5MB/s eta 0:00:01[K     |█████▊                          | 40kB 8.7MB/s eta 0:00:01[K     |███████▏                        | 51kB 5.6MB/s eta 0:00:01[K     |████████▋                       | 61kB 5.7MB/s eta 0:00:01[K     |██████████                      | 71kB 6.0MB/s eta 0:00:01[K     |███████████▌                    | 81kB 6.7MB/s eta 0:00:01[K     |█████████████                   | 92kB 6.7MB/s eta 0:00:01[K     |██████████████▍                 | 102kB 7.0MB/s eta 0:00:01[K     |███████████████▉                | 112kB 7.0MB/s eta 0:00:01[K     |█████████████████▎        

In [None]:
from datetime import datetime
from pathlib import Path

import torch
import torch.nn as nn
import torch.optim as optim

from torch.cuda.amp import GradScaler, autocast
from torchvision import datasets, models
from torchvision.transforms import Compose, Normalize, Pad, RandomCrop, RandomHorizontalFlip, ToTensor

from torch.utils.tensorboard import SummaryWriter

import ignite
import ignite.distributed as idist
from ignite.contrib.engines import common
from ignite.contrib.handlers import PiecewiseLinear
from ignite.engine import Engine, Events
from ignite.handlers import Checkpoint, DiskSaver, global_step_from_engine
from ignite.metrics import Accuracy, Loss
from ignite.utils import manual_seed, setup_logger

In [None]:
# Training pipeline: pad every side by 4 pixels, take a random 32-pixel crop,
# randomly flip horizontally, convert to a tensor and normalize per channel.
train_transform = Compose([
    Pad(padding=4),
    RandomCrop(size=32, fill=128),
    RandomHorizontalFlip(),
    ToTensor(),
    Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

# Evaluation pipeline: no augmentation, same tensor conversion and normalization.
test_transform = Compose([
    ToTensor(),
    Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])


def get_train_test_datasets(path):
    """Return the CIFAR10 train and test datasets rooted at ``path``.

    The directory is created when it does not exist. A download is requested
    only when the directory is empty; the test split never requests a download
    and therefore relies on the files made available for the train split.

    Args:
        path: directory used as the dataset root.

    Returns:
        Tuple ``(train_dataset, test_dataset)``; the train split uses
        ``train_transform`` and the test split uses ``test_transform``.
    """
    # exist_ok=True replaces the separate exists()/makedirs() pair, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(path, exist_ok=True)

    # An empty root directory means nothing has been fetched yet.
    download = not os.listdir(path)

    train_dataset = datasets.CIFAR10(root=path, train=True, download=download, transform=train_transform)
    test_dataset = datasets.CIFAR10(root=path, train=False, download=False, transform=test_transform)

    return train_dataset, test_dataset

def get_data_loaders(config):
    """Build the training and validation data loaders via ``idist.auto_dataloader``.

    Args:
        config: mapping read for ``"data_path"``, ``"batch_size"`` and
            ``"num_workers"``.

    Returns:
        Tuple ``(train_loader, val_loader)``. The training loader shuffles and
        drops the last incomplete batch; the validation loader uses twice the
        configured batch size and does not shuffle.
    """
    local_rank = idist.get_local_rank()

    if local_rank > 0:
        # Ensure that only local rank 0 downloads the dataset:
        # every other local rank waits here until rank 0 reaches its barrier.
        # Thus each node will download its own copy of the dataset.
        idist.barrier()

    train_dataset, test_dataset = get_train_test_datasets(config["data_path"])

    if local_rank == 0:
        # Local rank 0 has finished creating the datasets; release the waiting ranks.
        idist.barrier()

    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=True,
        drop_last=True,
    )
    val_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False,
    )

    return train_loader, val_loader

In [None]:
def get_model(name):
    """Instantiate the torchvision model registered under ``name`` with 10 output classes.

    Args:
        name: key looked up in the namespace of ``torchvision.models``.

    Returns:
        The object produced by calling the looked-up entry with ``num_classes=10``.

    Raises:
        RuntimeError: if ``name`` is not present in the ``models`` namespace.
    """
    registry = vars(models)
    if name not in registry:
        raise RuntimeError(f"Unknown model name {name}")

    factory = registry[name]
    return factory(num_classes=10)

In [None]:
def initialize(config):
    """Create the model, optimizer, loss and learning-rate schedule for a run.

    Args:
        config: mapping read for ``"model"``, ``"learning_rate"``,
            ``"momentum"``, ``"weight_decay"``, ``"num_iters_per_epoch"``,
            ``"num_warmup_epochs"`` and ``"num_epochs"``.

    Returns:
        Tuple ``(model, optimizer, criterion, lr_scheduler)``.
    """
    # Build the model, then let ignite adapt it to the current distributed setup.
    net = idist.auto_model(get_model(config["model"]))

    # Nesterov SGD, likewise wrapped for the current distributed configuration.
    sgd = idist.auto_optim(
        optim.SGD(
            net.parameters(),
            lr=config["learning_rate"],
            momentum=config["momentum"],
            weight_decay=config["weight_decay"],
            nesterov=True,
        )
    )
    loss_fn = nn.CrossEntropyLoss().to(idist.device())

    # Piecewise-linear schedule expressed in iterations:
    # 0 -> peak learning rate at the end of warm-up -> 0 at the end of training.
    iters_per_epoch = config["num_iters_per_epoch"]
    warmup_end = iters_per_epoch * config["num_warmup_epochs"]
    peak_lr = config["learning_rate"]
    training_end = iters_per_epoch * config["num_epochs"]
    schedule = PiecewiseLinear(
        sgd,
        param_name="lr",
        milestones_values=[(0, 0.0), (warmup_end, peak_lr), (training_end, 0.0)],
    )

    return net, sgd, loss_fn, schedule

In [None]:
def log_metrics(logger, epoch, elapsed, tag, metrics):
    """Log one evaluation summary as a single INFO message.

    Args:
        logger: object exposing ``info(message)``.
        epoch: epoch number shown in the header.
        elapsed: evaluation time in seconds, rendered with two decimals.
        tag: label for the evaluated split (e.g. "Train" or "Test").
        metrics: mapping of metric name to value; one tab-indented line each.
    """
    rendered = []
    for metric_name, metric_value in metrics.items():
        rendered.append(f"\t{metric_name}: {metric_value}")
    metrics_output = "\n".join(rendered)

    logger.info(f"\nEpoch {epoch} - Evaluation time (seconds): {elapsed:.2f} - {tag} metrics:\n {metrics_output}")


def log_basic_info(logger, config):
    """Log library versions, device details, the configuration and the distributed setup.

    Args:
        logger: logger whose ``info`` method receives every line.
        config: mapping; ``config['model']`` is shown in the header and every
            key/value pair is listed under "Configuration:".
    """
    emit = logger.info

    emit(f"Train {config['model']} on CIFAR10")
    emit(f"- PyTorch version: {torch.__version__}")
    emit(f"- Ignite version: {ignite.__version__}")

    if torch.cuda.is_available():
        # cudnn is imported explicitly and locally because
        # torch.backends.cudnn can not be pickled with hvd spawning procs
        from torch.backends import cudnn

        emit(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}")
        emit(f"- CUDA version: {torch.version.cuda}")
        emit(f"- CUDNN version: {cudnn.version()}")

    emit("\n")
    emit("Configuration:")
    for name, setting in config.items():
        emit(f"\t{name}: {setting}")
    emit("\n")

    # The distributed section is only printed when more than one process takes part.
    if idist.get_world_size() <= 1:
        return

    emit("\nDistributed setting:")
    emit(f"\tbackend: {idist.backend()}")
    emit(f"\tworld size: {idist.get_world_size()}")
    emit("\n")

In [None]:
def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, config, logger):
    """Build the training engine and attach the common training handlers.

    Args:
        model: network that is trained.
        optimizer: optimizer stepped once per iteration.
        criterion: loss applied to the model output and the targets.
        lr_scheduler: scheduler passed on to the common handlers and checkpointed.
        train_sampler: sampler of the training loader, passed on to the common handlers.
        config: mapping read for ``"with_amp"``, ``"checkpoint_every"``,
            ``"log_every_iters"`` and ``"resume_from"`` (and by ``get_save_handler``).
        logger: logger assigned to the engine and used for the resume message.

    Returns:
        The configured ``Engine``; when ``config["resume_from"]`` is set, the
        checkpointed objects have been restored from that file.
    """
    device = idist.device()

    # Mixed precision is optional; a disabled scaler/autocast is a pass-through.
    with_amp = config["with_amp"]
    scaler = GradScaler(enabled=with_amp)

    def train_step(engine, batch):
        """Run one optimization step on ``batch`` and report its loss."""
        inputs, targets = batch[0], batch[1]

        # Move the batch only if the loader has not already placed it on the device.
        if inputs.device != device:
            inputs = inputs.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)

        model.train()

        with autocast(enabled=with_amp):
            predictions = model(inputs)
            loss = criterion(predictions, targets)

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        return {"batch loss": loss.item()}

    trainer = Engine(train_step)
    trainer.logger = logger

    # Objects written to (and restored from) training checkpoints.
    to_save = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler,
    }
    metric_names = ["batch loss"]

    # Common handlers: periodic checkpointing of `to_save`, learning-rate
    # scheduling, and tracking of the `train_step` outputs named above when
    # iteration logging is enabled. Progress bars and CUDA cache clearing are off.
    save_every = config["checkpoint_every"]
    save_handler = get_save_handler(config)
    tracked_outputs = metric_names if config["log_every_iters"] > 0 else None
    common.setup_common_training_handlers(
        trainer=trainer,
        train_sampler=train_sampler,
        to_save=to_save,
        save_every_iters=save_every,
        save_handler=save_handler,
        lr_scheduler=lr_scheduler,
        output_names=tracked_outputs,
        with_pbars=False,
        clear_cuda_cache=False,
    )

    resume_from = config["resume_from"]
    if resume_from is None:
        return trainer

    # Restore every checkpointed object from the given file, loaded onto the CPU first.
    checkpoint_fp = Path(resume_from)
    assert checkpoint_fp.exists(), f"Checkpoint '{checkpoint_fp.as_posix()}' is not found"
    logger.info(f"Resume from a checkpoint: {checkpoint_fp.as_posix()}")
    checkpoint = torch.load(checkpoint_fp.as_posix(), map_location="cpu")
    Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    return trainer


def create_evaluator(model, metrics, config, tag="val"):
    """Build an evaluation engine with the given metrics attached.

    Args:
        model: network that is evaluated.
        metrics: mapping of metric name to metric object; each is attached
            to the engine under its name.
        config: mapping read for ``"with_amp"``.
        tag: label shown in the progress-bar description.

    Returns:
        The evaluation ``Engine``; on rank 0 it also carries a progress bar.
    """
    with_amp = config["with_amp"]
    device = idist.device()

    def evaluate_step(engine: Engine, batch):
        """Forward one batch without gradient tracking and return ``(output, targets)``."""
        with torch.no_grad():
            model.eval()
            inputs, targets = batch[0], batch[1]

            # Move the batch only if it is not already on the target device.
            if inputs.device != device:
                inputs = inputs.to(device, non_blocking=True)
                targets = targets.to(device, non_blocking=True)

            with autocast(enabled=with_amp):
                output = model(inputs)
            return output, targets

    evaluator = Engine(evaluate_step)

    for metric_name, metric in metrics.items():
        metric.attach(evaluator, metric_name)

    # Only the process with global rank 0 shows a progress bar.
    if idist.get_rank() == 0:
        progress = common.ProgressBar(desc=f"Evaluation ({tag})", persist=False)
        progress.attach(evaluator)

    return evaluator

In [None]:
def get_save_handler(config):
    """Return a ``DiskSaver`` writing to ``config["output_path"]``.

    The saver is created with ``require_empty=False``, so an output directory
    that already contains files is accepted.
    """
    output_dir = config["output_path"]
    return DiskSaver(dirname=output_dir, require_empty=False)

In [None]:
def training(local_rank, config, **kwargs):
    """Per-process entry point: train the configured model and evaluate it periodically.

    Args:
        local_rank: local rank passed in by the launcher; used for the logger
            and for the CUDA device name lookup.
        config: dict of run settings. It is mutated here: ``"num_iters_per_epoch"``
            is added, and on rank 0 ``"output_path"`` is replaced by a run-specific
            sub-folder and, on CUDA devices, ``"cuda device name"`` is added.
        **kwargs: accepted but not used by this function.

    Raises:
        Exception: any exception raised while running the trainer is logged
            and re-raised unchanged.
    """
    print(idist.get_rank(), ': run with config:', config, '- backend=', idist.backend())

    rank = idist.get_rank()
    # Seed each process differently: https://pytorch.org/docs/stable/notes/randomness.html
    manual_seed(config["seed"] + rank)
    # Further reproducibility options from the notes above (currently disabled):
    #   random.seed(random_seed)  # do not forget this one when using torchvision transforms
    #   torch.cuda.manual_seed(random_seed)
    #   torch.cuda.manual_seed_all(random_seed)
    #   torch.backends.cudnn.deterministic = True
    #   torch.backends.cudnn.benchmark = False
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        # Rank 0 creates a run-specific output folder and records it in the config.
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_data_loaders(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_evaluator(model, metrics=metrics, config=config)
    train_evaluator = create_evaluator(model, metrics=metrics, config=config)

    def run_validation(engine):
        """Evaluate on the training and test loaders and log both metric sets."""
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation)

    # Only rank 0 owns a TensorBoard logger; None on every other rank.
    tb_logger = None
    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        logger.exception("")
        # Bare raise re-raises the active exception with its original traceback.
        raise
    finally:
        # Close the TensorBoard logger even when training fails, so it is not leaked.
        if tb_logger is not None:
            tb_logger.close()

In [None]:
# Run settings consumed by the training functions of this notebook.
config = dict(
    # reproducibility and locations
    seed=543,
    data_path="./cifar10",
    output_path="./output-cifar10/",
    # model and optimization
    model="resnet18",
    batch_size=512,
    momentum=0.9,
    weight_decay=1e-4,
    num_workers=12,
    num_epochs=24,
    learning_rate=0.4,
    num_warmup_epochs=4,
    # evaluation and checkpointing cadence
    validate_every=3,
    checkpoint_every=1000,
    # distributed backend (None until chosen later in this notebook)
    backend=None,
    resume_from=None,
    log_every_iters=15,
    nproc_per_node=None,
    stop_iteration=None,
    with_amp=False,
)

# Re-detect GPU availability through PyTorch itself.
gpu_gtg = torch.cuda.is_available()

# Pick the distributed backend and the number of processes per node.
# Precedence is unchanged: a GPU wins over a TPU, and with neither the run
# falls back to CPU. The three cases are exhaustive, so no separate
# "unknown environment" branch is needed.
if gpu_gtg:  # gpu
    config["backend"] = "nccl"
    config["nproc_per_node"] = 1
elif tpu_gtg:  # tpu
    config["backend"] = "xla-tpu"
    config["nproc_per_node"] = 8
else:  # cpu
    config["backend"] = "gloo"
    config["nproc_per_node"] = 8

# Mixed precision is rejected up front for the XLA backend.
if config["backend"] == "xla-tpu" and config["with_amp"]:
    raise RuntimeError("The value of with_amp should be False if backend is xla")

# Launcher options: processes per node and the start method.
dist_configs = {"nproc_per_node": config["nproc_per_node"], "start_method": "fork"}

# Launch `training` through ignite's Parallel launcher; a=1 and b=2 are extra
# keyword arguments, which `training` accepts through **kwargs and ignores.
with idist.Parallel(backend=config["backend"], **dist_configs) as parallel:
    parallel.run(training, config, a=1, b=2)

2021-07-05 07:00:09,365 ignite.distributed.launcher.Parallel INFO: Initialized distributed launcher with backend: 'xla-tpu'
2021-07-05 07:00:09,370 ignite.distributed.launcher.Parallel INFO: - Parameters to spawn processes: 
	nproc_per_node: 8
	nnodes: 1
	node_rank: 0
	start_method: fork
2021-07-05 07:00:09,372 ignite.distributed.launcher.Parallel INFO: Spawn function '<function training at 0x7fc168a7f170>' in 8 processes


0 : run with config: {'seed': 543, 'data_path': './cifar10', 'output_path': './output-cifar10/', 'model': 'resnet18', 'batch_size': 512, 'momentum': 0.9, 'weight_decay': 0.0001, 'num_workers': 12, 'num_epochs': 24, 'learning_rate': 0.4, 'num_warmup_epochs': 4, 'validate_every': 3, 'checkpoint_every': 1000, 'backend': 'xla-tpu', 'resume_from': None, 'log_every_iters': 15, 'nproc_per_node': 8, 'stop_iteration': None, 'with_amp': False} - backend= xla-tpu
6 : run with config: {'seed': 543, 'data_path': './cifar10', 'output_path': './output-cifar10/', 'model': 'resnet18', 'batch_size': 512, 'momentum': 0.9, 'weight_decay': 0.0001, 'num_workers': 12, 'num_epochs': 24, 'learning_rate': 0.4, 'num_warmup_epochs': 4, 'validate_every': 3, 'checkpoint_every': 1000, 'backend': 'xla-tpu', 'resume_from': None, 'log_every_iters': 15, 'nproc_per_node': 8, 'stop_iteration': None, 'with_amp': False} - backend= xla-tpu
4 : run with config: {'seed': 543, 'data_path': './cifar10', 'output_path': './output-

2021-07-05 07:00:28,082 CIFAR10-Training INFO: Train resnet18 on CIFAR10
2021-07-05 07:00:28,117 CIFAR10-Training INFO: - PyTorch version: 1.9.0a0+gitc4c77e2
2021-07-05 07:00:28,130 CIFAR10-Training INFO: - Ignite version: 0.5.0.dev20210705
2021-07-05 07:00:28,137 CIFAR10-Training INFO: 

2021-07-05 07:00:28,140 CIFAR10-Training INFO: Configuration:
2021-07-05 07:00:28,144 CIFAR10-Training INFO: 	seed: 543
2021-07-05 07:00:28,147 CIFAR10-Training INFO: 	data_path: ./cifar10


5 : run with config: {'seed': 543, 'data_path': './cifar10', 'output_path': './output-cifar10/', 'model': 'resnet18', 'batch_size': 512, 'momentum': 0.9, 'weight_decay': 0.0001, 'num_workers': 12, 'num_epochs': 24, 'learning_rate': 0.4, 'num_warmup_epochs': 4, 'validate_every': 3, 'checkpoint_every': 1000, 'backend': 'xla-tpu', 'resume_from': None, 'log_every_iters': 15, 'nproc_per_node': 8, 'stop_iteration': None, 'with_amp': False} - backend= xla-tpu


2021-07-05 07:00:28,150 CIFAR10-Training INFO: 	output_path: ./output-cifar10/
2021-07-05 07:00:28,158 CIFAR10-Training INFO: 	model: resnet18
2021-07-05 07:00:28,171 CIFAR10-Training INFO: 	batch_size: 512
2021-07-05 07:00:28,173 CIFAR10-Training INFO: 	momentum: 0.9
2021-07-05 07:00:28,175 CIFAR10-Training INFO: 	weight_decay: 0.0001
2021-07-05 07:00:28,182 CIFAR10-Training INFO: 	num_workers: 12
2021-07-05 07:00:28,185 CIFAR10-Training INFO: 	num_epochs: 24
2021-07-05 07:00:28,188 CIFAR10-Training INFO: 	learning_rate: 0.4
2021-07-05 07:00:28,191 CIFAR10-Training INFO: 	num_warmup_epochs: 4
2021-07-05 07:00:28,194 CIFAR10-Training INFO: 	validate_every: 3
2021-07-05 07:00:28,197 CIFAR10-Training INFO: 	checkpoint_every: 1000
2021-07-05 07:00:28,202 CIFAR10-Training INFO: 	backend: xla-tpu
2021-07-05 07:00:28,205 CIFAR10-Training INFO: 	resume_from: None
2021-07-05 07:00:28,210 CIFAR10-Training INFO: 	log_every_iters: 15
2021-07-05 07:00:28,213 CIFAR10-Training INFO: 	nproc_per_node:

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar10/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=0.0, max=170498071.0), HTML(value='')))


Extracting ./cifar10/cifar-10-python.tar.gz to ./cifar10


2021-07-05 07:00:37,027 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset 'Dataset CIFAR10': 
	{'batch_size': 64, 'num_workers': 2, 'drop_last': True, 'sampler': <torch.utils.data.distributed.DistributedSampler object at 0x7fc168a80e90>, 'pin_memory': False}
2021-07-05 07:00:37,047 ignite.distributed.auto.auto_dataloader INFO: DataLoader is wrapped by `MpDeviceLoader` on XLA
2021-07-05 07:00:37,061 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset 'Dataset CIFAR10': 
	{'batch_size': 128, 'num_workers': 2, 'sampler': <torch.utils.data.distributed.DistributedSampler object at 0x7fc161f88b10>, 'pin_memory': False}
2021-07-05 07:00:37,076 ignite.distributed.auto.auto_dataloader INFO: DataLoader is wrapped by `MpDeviceLoader` on XLA
2021-07-05 07:00:54,947 CIFAR10-Training INFO: Engine run starting with max_epochs=24.
2021-07-05 07:01:34,136 CIFAR10-Training INFO: Epoch[1] Complete. Time taken: 00:00:39
2021-07-05 07:02:05,283 CI

HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=97.0, style=ProgressStyle(descript…

2021-07-05 07:03:00,600 CIFAR10-Training INFO: 
Epoch 3 - Evaluation time (seconds): 24.52 - Train metrics:
 	Accuracy: 0.4245932667525773
	Loss: 1.612633144732603




HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=10.0, style=ProgressStyle(descript…

2021-07-05 07:03:06,305 CIFAR10-Training INFO: 
Epoch 3 - Evaluation time (seconds): 5.68 - Test metrics:
 	Accuracy: 0.4315
	Loss: 1.673626953125
2021-07-05 07:03:06,308 CIFAR10-Training INFO: Epoch[3] Complete. Time taken: 00:01:01




2021-07-05 07:03:37,925 CIFAR10-Training INFO: Epoch[4] Complete. Time taken: 00:00:32
2021-07-05 07:04:08,819 CIFAR10-Training INFO: Epoch[5] Complete. Time taken: 00:00:31


HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=97.0, style=ProgressStyle(descript…

2021-07-05 07:05:03,093 CIFAR10-Training INFO: 
Epoch 6 - Evaluation time (seconds): 23.76 - Train metrics:
 	Accuracy: 0.6039586018041238
	Loss: 1.099551879253584




HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=10.0, style=ProgressStyle(descript…

2021-07-05 07:05:06,585 CIFAR10-Training INFO: 
Epoch 6 - Evaluation time (seconds): 3.48 - Test metrics:
 	Accuracy: 0.5965
	Loss: 1.1472626953125
2021-07-05 07:05:06,604 CIFAR10-Training INFO: Epoch[6] Complete. Time taken: 00:00:58




2021-07-05 07:05:41,926 CIFAR10-Training INFO: Epoch[7] Complete. Time taken: 00:00:35
2021-07-05 07:06:13,760 CIFAR10-Training INFO: Epoch[8] Complete. Time taken: 00:00:32


HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=97.0, style=ProgressStyle(descript…

2021-07-05 07:07:10,155 CIFAR10-Training INFO: 
Epoch 9 - Evaluation time (seconds): 24.69 - Train metrics:
 	Accuracy: 0.6692372744845361
	Loss: 0.9284247958783022




HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=10.0, style=ProgressStyle(descript…

2021-07-05 07:07:13,806 CIFAR10-Training INFO: 
Epoch 9 - Evaluation time (seconds): 3.62 - Test metrics:
 	Accuracy: 0.6683
	Loss: 0.94412060546875
2021-07-05 07:07:13,832 CIFAR10-Training INFO: Epoch[9] Complete. Time taken: 00:01:00




2021-07-05 07:07:45,500 CIFAR10-Training INFO: Epoch[10] Complete. Time taken: 00:00:32
2021-07-05 07:08:17,722 CIFAR10-Training INFO: Epoch[11] Complete. Time taken: 00:00:32


HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=97.0, style=ProgressStyle(descript…

2021-07-05 07:09:14,103 CIFAR10-Training INFO: 
Epoch 12 - Evaluation time (seconds): 24.77 - Train metrics:
 	Accuracy: 0.750865818298969
	Loss: 0.7104913770538015




HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=10.0, style=ProgressStyle(descript…

2021-07-05 07:09:17,677 CIFAR10-Training INFO: 
Epoch 12 - Evaluation time (seconds): 3.55 - Test metrics:
 	Accuracy: 0.7467
	Loss: 0.744965576171875
2021-07-05 07:09:17,695 CIFAR10-Training INFO: Epoch[12] Complete. Time taken: 00:00:60




2021-07-05 07:09:49,393 CIFAR10-Training INFO: Epoch[13] Complete. Time taken: 00:00:32
2021-07-05 07:10:21,500 CIFAR10-Training INFO: Epoch[14] Complete. Time taken: 00:00:32


HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=97.0, style=ProgressStyle(descript…

2021-07-05 07:11:18,168 CIFAR10-Training INFO: 
Epoch 15 - Evaluation time (seconds): 24.75 - Train metrics:
 	Accuracy: 0.7788136275773195
	Loss: 0.6301673810506604




HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=10.0, style=ProgressStyle(descript…

2021-07-05 07:11:22,294 CIFAR10-Training INFO: 
Epoch 15 - Evaluation time (seconds): 4.11 - Test metrics:
 	Accuracy: 0.76
	Loss: 0.700236865234375
2021-07-05 07:11:22,307 CIFAR10-Training INFO: Epoch[15] Complete. Time taken: 00:01:01




2021-07-05 07:11:53,894 CIFAR10-Training INFO: Epoch[16] Complete. Time taken: 00:00:32
2021-07-05 07:12:25,454 CIFAR10-Training INFO: Epoch[17] Complete. Time taken: 00:00:32


HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=97.0, style=ProgressStyle(descript…

2021-07-05 07:13:21,982 CIFAR10-Training INFO: 
Epoch 18 - Evaluation time (seconds): 24.74 - Train metrics:
 	Accuracy: 0.7959487757731959
	Loss: 0.5769782705405324




HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=10.0, style=ProgressStyle(descript…

2021-07-05 07:13:26,165 CIFAR10-Training INFO: 
Epoch 18 - Evaluation time (seconds): 4.17 - Test metrics:
 	Accuracy: 0.7775
	Loss: 0.65168720703125
2021-07-05 07:13:26,177 CIFAR10-Training INFO: Epoch[18] Complete. Time taken: 00:01:01




2021-07-05 07:13:57,786 CIFAR10-Training INFO: Epoch[19] Complete. Time taken: 00:00:32
2021-07-05 07:14:29,567 CIFAR10-Training INFO: Epoch[20] Complete. Time taken: 00:00:32


HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=97.0, style=ProgressStyle(descript…

2021-07-05 07:15:26,526 CIFAR10-Training INFO: 
Epoch 21 - Evaluation time (seconds): 24.76 - Train metrics:
 	Accuracy: 0.8257087628865979
	Loss: 0.49511549644863484




HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=10.0, style=ProgressStyle(descript…

2021-07-05 07:15:30,619 CIFAR10-Training INFO: 
Epoch 21 - Evaluation time (seconds): 4.07 - Test metrics:
 	Accuracy: 0.8022
	Loss: 0.580840234375
2021-07-05 07:15:30,635 CIFAR10-Training INFO: Epoch[21] Complete. Time taken: 00:01:01




2021-07-05 07:16:07,069 CIFAR10-Training INFO: Epoch[22] Complete. Time taken: 00:00:36
2021-07-05 07:16:39,078 CIFAR10-Training INFO: Epoch[23] Complete. Time taken: 00:00:32


HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=97.0, style=ProgressStyle(descript…

2021-07-05 07:17:35,813 CIFAR10-Training INFO: 
Epoch 24 - Evaluation time (seconds): 24.65 - Train metrics:
 	Accuracy: 0.8399041559278351
	Loss: 0.45360384282377575




HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=10.0, style=ProgressStyle(descript…

2021-07-05 07:17:39,857 CIFAR10-Training INFO: 
Epoch 24 - Evaluation time (seconds): 4.03 - Test metrics:
 	Accuracy: 0.8112
	Loss: 0.54418720703125
2021-07-05 07:17:39,873 CIFAR10-Training INFO: Epoch[24] Complete. Time taken: 00:01:01




HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=97.0, style=ProgressStyle(descript…

2021-07-05 07:18:04,713 CIFAR10-Training INFO: 
Epoch 24 - Evaluation time (seconds): 24.82 - Train metrics:
 	Accuracy: 0.8413740335051546
	Loss: 0.45592994296673645




HBox(children=(FloatProgress(value=0.0, description='Evaluation (val)', max=10.0, style=ProgressStyle(descript…

2021-07-05 07:18:08,705 CIFAR10-Training INFO: 
Epoch 24 - Evaluation time (seconds): 3.97 - Test metrics:
 	Accuracy: 0.8112
	Loss: 0.54418720703125
2021-07-05 07:18:08,730 CIFAR10-Training INFO: Engine run complete. Time taken: 00:17:14




2021-07-05 07:18:08,843 ignite.distributed.launcher.Parallel INFO: End of run


In [None]:
# Print the CPU description of the machine the notebook is running on (Linux /proc interface).
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 85
model name	: Intel(R) Xeon(R) CPU @ 2.00GHz
stepping	: 3
microcode	: 0x1
cpu MHz		: 2000.206
cache size	: 39424 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa
bogomips	: