<a href="https://colab.research.google.com/github/secutron/Practice_Ignite/blob/main/A_2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os

gpu_gtg = False
if int(os.environ.get("COLAB_GPU")) > 0:
    gpu_gtg = "COLAB_GPU" in os.environ

tpu_gtg = "COLAB_TPU_ADDR" in os.environ

if tpu_gtg: # tpu
    print("TPU")
    #VERSION = "nightly"

    # https://github.com/pytorch/builder/pull/750
    VERSION = "20210304" # was 20200607" 

    !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
    !python pytorch-xla-env-setup.py --version $VERSION

TPU
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  5116  100  5116    0     0   106k      0 --:--:-- --:--:-- --:--:--  106k
Updating... This may take around 2 minutes.
Updating TPU runtime to pytorch-dev20210304 ...
Found existing installation: torch 1.9.0a0+gitc4c77e2
Uninstalling torch-1.9.0a0+gitc4c77e2:
  Successfully uninstalled torch-1.9.0a0+gitc4c77e2
Found existing installation: torchvision 0.9.0a0+7d41547
Uninstalling torchvision-0.9.0a0+7d41547:
  Successfully uninstalled torchvision-0.9.0a0+7d41547
Copying gs://tpu-pytorch/wheels/torch-nightly+20210304-cp37-cp37m-linux_x86_64.whl...
\ [1 files][126.5 MiB/126.5 MiB]                                                
Operation completed over 1 objects/126.5 M

In [6]:
!pip install --pre pytorch-ignite



In [7]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, models

import torchsummary

import ignite
import ignite.distributed as idist
from ignite.engine import Engine, Events, create_supervised_evaluator, create_supervised_trainer
from ignite.metrics import Accuracy, Loss, RunningAverage, ConfusionMatrix
from ignite.handlers import ModelCheckpoint, EarlyStopping
from ignite.utils import setup_logger

In [8]:
from ignite.handlers import Timer

In [9]:
def training(local_rank, config, **kwargs):
    print("local rank: ", local_rank)

    ###########################################################
    # 데이터 준비
    train_transform = transforms.Compose(
        [
            transforms.Pad(4),
            transforms.RandomCrop(32, fill=128),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ]
    )

    test_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),])

    if idist.get_local_rank() > 0:
        idist.barrier()

    trainset = torchvision.datasets.CIFAR10(root=config["data_path"], train=True, download=True, transform=train_transform)
    testset = torchvision.datasets.CIFAR10(root=config["data_path"], train=False, download=True, transform=test_transform)

    if idist.get_local_rank() == 0:
        idist.barrier()

    trainloader = idist.auto_dataloader(trainset, batch_size=config["batch_size"], shuffle=True, num_workers=config["num_workers"], drop_last=True)
    testloader = idist.auto_dataloader(testset, batch_size=config["batch_size"], shuffle=False, num_workers=config["num_workers"],)


    ###########################################################
    # 모델, 옵티마이저, 로스, 트레이너, 이밸류에이터
    num_classes = 10
    model = models.resnet18(num_classes = num_classes)
       
    model = idist.auto_model(model)
    optimizer = idist.auto_optim(optim.Adam(model.parameters(), lr=0.001))

    criterion = nn.CrossEntropyLoss().to(idist.device())

    trainer = create_supervised_trainer(model, optimizer, criterion, device=idist.device())
    trainer.logger = setup_logger("hkim-trainer")



    timer = Timer(average=True)
    timer.attach(trainer, step=Events.EPOCH_COMPLETED)




    metrics = {
        'accuracy':Accuracy(),
        'ce':Loss(criterion),
    }

    val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=idist.device())
    val_evaluator.logger = setup_logger("hkim-val_evaluator")

    # track a running average of the scalar loss output for each batch.
    RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')

    ###########################################################
    # 이벤트

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
        state = val_evaluator.run(testloader)
        metrics = val_evaluator.state.metrics
        accuracy = metrics['accuracy']*100
        loss = metrics['ce']
        log_metrics(val_evaluator.logger, state.epoch, state.times["COMPLETED"], "validation evaluator", state.metrics)

    @trainer.on(Events.COMPLETED)
    def log_profile_results(trainer):
        print(f"- Mean elapsed time for 1 epoch: {timer.value()}")

    trainer.run(trainloader, max_epochs=config["num_epochs"])    

In [10]:
config = {
    "seed": 543,
    "data_path" : "./cifar10",
    "output_path" : "./output-cifar10/",
    "model" : "resnet18",
    "batch_size" : 512,
    "momentum" : 0.9,
    "weight_decay" : 1e-4,
    "num_workers" : 2,
    "num_epochs" : 5,
    "learning_rate" : 0.4,
    "num_warmup_epochs" : 4,
    "validate_every" : 3, 
    "checkpoint_every" : 1000,
    "backend" : None, 
    "resume_from" : None, 
    "log_every_iters" : 15,
    "nproc_per_node" : None, 
    "stop_iteration" : None, 
    "with_amp" : False,
    "log_interval" : 10,
    "verbose_set" : False,
    "verbose_set2" : False,
    "verbose_loader" : False

}

if not (tpu_gtg or gpu_gtg): # cpu
    config["backend"] = 'gloo'
    config["nproc_per_node"] = 8
elif gpu_gtg: # gpu
    config["backend"] = 'nccl'
    config["nproc_per_node"] = 1
elif tpu_gtg: # tpu
    config["backend"] = 'xla-tpu'
    config["nproc_per_node"] = 8
else: # error
    raise RuntimeError("Unknown environment: tpu_gtg {}, gpu_gtg {}".format(tpu_gtg, gpu_gtg))

if config["backend"] == "xla-tpu" and config["with_amp"]:
    raise RuntimeError("The value of with_amp should be False if backend is xla")


dist_configs = {'nproc_per_node': config["nproc_per_node"], "start_method": "fork"}  

def log_metrics(logger, epoch, elapsed, tag, metrics):
    metrics_output = "\n".join([f"\t{k}: {v}" for k, v in metrics.items()])
    logger.info(f"\nEpoch {epoch} - Evaluation time (seconds): {elapsed:.2f} - {tag} metrics:\n {metrics_output}")

with idist.Parallel(backend=config["backend"], **dist_configs) as parallel:
    parallel.run(training, config, a=1, b=1)

2021-09-14 04:02:51,099 ignite.distributed.launcher.Parallel INFO: Initialized distributed launcher with backend: 'xla-tpu'
2021-09-14 04:02:51,101 ignite.distributed.launcher.Parallel INFO: - Parameters to spawn processes: 
	nproc_per_node: 8
	nnodes: 1
	node_rank: 0
	start_method: fork
2021-09-14 04:02:51,102 ignite.distributed.launcher.Parallel INFO: Spawn function '<function training at 0x7f1de98a20e0>' in 8 processes


local rank:  2
local rank:  7
local rank:  4
local rank:  1
local rank:  3
local rank:  5
local rank:  6
local rank:  0
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar10/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./cifar10/cifar-10-python.tar.gz to ./cifar10
Files already downloaded and verified


2021-09-14 04:03:19,271 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset 'Dataset CIFAR10': 
	{'batch_size': 64, 'num_workers': 2, 'drop_last': True, 'sampler': <torch.utils.data.distributed.DistributedSampler object at 0x7f1de4648190>, 'pin_memory': False}
2021-09-14 04:03:19,286 ignite.distributed.auto.auto_dataloader INFO: DataLoader is wrapped by `MpDeviceLoader` on XLA
2021-09-14 04:03:19,307 ignite.distributed.auto.auto_dataloader INFO: Use data loader kwargs for dataset 'Dataset CIFAR10': 
	{'batch_size': 64, 'num_workers': 2, 'sampler': <torch.utils.data.distributed.DistributedSampler object at 0x7f1de4616090>, 'pin_memory': False}
2021-09-14 04:03:19,328 ignite.distributed.auto.auto_dataloader INFO: DataLoader is wrapped by `MpDeviceLoader` on XLA


Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


2021-09-14 04:03:21,444 hkim-trainer INFO: Engine run starting with max_epochs=5.


Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


2021-09-14 04:04:18,426 hkim-val_evaluator INFO: Engine run starting with max_epochs=1.
2021-09-14 04:04:24,208 hkim-val_evaluator INFO: Epoch[1] Complete. Time taken: 00:00:06
2021-09-14 04:04:24,227 hkim-val_evaluator INFO: Engine run complete. Time taken: 00:00:06
2021-09-14 04:04:24,240 hkim-val_evaluator INFO: 
Epoch 1 - Evaluation time (seconds): 5.79 - validation evaluator metrics:
 	accuracy: 0.4814
	ce: 1.40567431640625
2021-09-14 04:04:24,249 hkim-trainer INFO: Epoch[1] Complete. Time taken: 00:01:03
2021-09-14 04:04:59,893 hkim-val_evaluator INFO: Engine run starting with max_epochs=1.
2021-09-14 04:05:03,887 hkim-val_evaluator INFO: Epoch[1] Complete. Time taken: 00:00:04
2021-09-14 04:05:03,902 hkim-val_evaluator INFO: Engine run complete. Time taken: 00:00:04
2021-09-14 04:05:03,915 hkim-val_evaluator INFO: 
Epoch 1 - Evaluation time (seconds): 3.98 - validation evaluator metrics:
 	accuracy: 0.586
	ce: 1.16845517578125
2021-09-14 04:05:03,936 hkim-trainer INFO: Epoch[2] 

- Mean elapsed time for 1 epoch: 41.51933895520001


2021-09-14 04:07:00,951 hkim-val_evaluator INFO: Epoch[1] Complete. Time taken: 00:00:04


- Mean elapsed time for 1 epoch: 41.158890847399995
- Mean elapsed time for 1 epoch: 41.08939463800002
- Mean elapsed time for 1 epoch: 41.36137215320002
- Mean elapsed time for 1 epoch: 41.01508864619998


2021-09-14 04:07:00,960 hkim-val_evaluator INFO: Engine run complete. Time taken: 00:00:04
2021-09-14 04:07:00,982 hkim-val_evaluator INFO: 
Epoch 1 - Evaluation time (seconds): 3.97 - validation evaluator metrics:
 	accuracy: 0.6797
	ce: 0.92623974609375
2021-09-14 04:07:00,985 hkim-trainer INFO: Epoch[5] Complete. Time taken: 00:00:39


- Mean elapsed time for 1 epoch: 43.9040625654


2021-09-14 04:07:00,991 hkim-trainer INFO: Engine run complete. Time taken: 00:03:40


- Mean elapsed time for 1 epoch: 41.01801286100001
- Mean elapsed time for 1 epoch: 41.09010485139997


2021-09-14 04:07:01,161 ignite.distributed.launcher.Parallel INFO: End of run


## License


---


Note: This is not an official [LG AI Research](https://www.lgresearch.ai/) product but sample code provided for an educational purpose

<br/>
author: John H. Kim
<br/>  
email: john.kim@lgresearch.ai / secutron@naver.com  


---