<a href="https://colab.research.google.com/github/secutron/TesTime/blob/main/ig_RC_osmu_v001.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RC (Randomly Controlled) 컨셉 테스트



In [1]:
from datetime import datetime
# Record when this cell ran so the saved output shows the time of the run.
run_started_at = datetime.now()
print("Current Date/Time: ", run_started_at)

Current Date/Time:  2021-05-07 07:45:47.472965


## Ignite 준비

In [None]:
!pip install pytorch-ignite

Collecting pytorch-ignite
[?25l  Downloading https://files.pythonhosted.org/packages/f8/d3/640f70d69393b415e6a29b27c735047ad86267921ad62682d1d756556d48/pytorch_ignite-0.4.4-py3-none-any.whl (200kB)
[K     |█▋                              | 10kB 14.0MB/s eta 0:00:01[K     |███▎                            | 20kB 11.8MB/s eta 0:00:01[K     |█████                           | 30kB 9.3MB/s eta 0:00:01[K     |██████▌                         | 40kB 7.9MB/s eta 0:00:01[K     |████████▏                       | 51kB 5.2MB/s eta 0:00:01[K     |█████████▉                      | 61kB 5.8MB/s eta 0:00:01[K     |███████████▌                    | 71kB 5.7MB/s eta 0:00:01[K     |█████████████                   | 81kB 6.2MB/s eta 0:00:01[K     |██████████████▊                 | 92kB 5.8MB/s eta 0:00:01[K     |████████████████▍               | 102kB 6.2MB/s eta 0:00:01[K     |██████████████████              | 112kB 6.2MB/s eta 0:00:01[K     |███████████████████▋            | 122kB

In [None]:
import os

in_colab = "COLAB_TPU_ADDR" in os.environ
with_torch_launch = "WORLD_SIZE" in os.environ

if in_colab:
    # https://github.com/pytorch/builder/pull/750
    VERSION = "20200607" 
    #VERSION = "nightly"
    !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
    !python pytorch-xla-env-setup.py --version $VERSION

In [None]:
from pathlib import Path

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, models
from torchvision.transforms import Compose, Normalize, Pad, RandomCrop, RandomHorizontalFlip, ToTensor

import ignite.distributed as idist
from ignite.contrib.engines import common
from ignite.contrib.handlers import ProgressBar
from ignite.engine import Engine, Events, create_supervised_evaluator
from ignite.metrics import Accuracy

In [None]:
# Per-channel (mean, std) pair shared by the training and test pipelines.
# NOTE(review): the middle std entry is 0.23, whereas the commonly quoted ImageNet
# value is 0.224 — confirm whether this is intentional before changing it.
_norm_mean = (0.485, 0.456, 0.406)
_norm_std = (0.229, 0.23, 0.225)

# Training pipeline: pad by 4, take a random 32x32 crop, flip horizontally at random,
# then convert to a tensor and normalize.
_train_steps = [
    Pad(4),
    RandomCrop(32, fill=128),
    RandomHorizontalFlip(),
    ToTensor(),
    Normalize(_norm_mean, _norm_std),
]
train_transform = Compose(_train_steps)

# Test pipeline: no augmentation, only tensor conversion and the same normalization.
_test_steps = [ToTensor(), Normalize(_norm_mean, _norm_std)]
test_transform = Compose(_test_steps)

In [None]:
def get_train_test_datasets(path):
    """Create the CIFAR10 training and test datasets rooted at ``path``.

    The training split is built first with ``download=True``. The test split is
    opened afterwards with ``download=False``, so it relies on the files already
    being present under ``path``.

    Args:
        path: Root directory passed to ``datasets.CIFAR10``.

    Returns:
        A ``(train_ds, test_ds)`` tuple; the training split uses ``train_transform``
        and the test split uses ``test_transform``.
    """
    training_split = datasets.CIFAR10(
        root=path,
        train=True,
        download=True,
        transform=train_transform,
    )
    held_out_split = datasets.CIFAR10(
        root=path,
        train=False,
        download=False,
        transform=test_transform,
    )
    return training_split, held_out_split

In [None]:
def get_model(name):
    """Instantiate a torchvision model by name with a 10-class output head.

    Args:
        name: Key looked up in the ``torchvision.models`` module namespace.

    Returns:
        The object produced by calling the looked-up entry with ``num_classes=10``.

    Raises:
        RuntimeError: If ``name`` is not present in the ``models`` namespace.
    """
    registry = vars(models)
    if name not in registry:
        raise RuntimeError(f"Unknown model name {name}")

    factory = registry[name]
    return factory(num_classes=10)

In [None]:
def get_dataflow(config):
    """Build the training and test data loaders via ``idist.auto_dataloader``.

    Rank 0 creates (and therefore downloads) the datasets first while all other
    ranks wait on a barrier; the remaining ranks create theirs afterwards.

    Side effect: stores ``len(train_loader)`` in ``config["num_iters_per_epoch"]``.

    Args:
        config: Dict read for ``data_path`` (default ``"."``), ``batch_size``
            (default 512) and ``num_workers`` (default 8).

    Returns:
        A ``(train_loader, test_loader)`` tuple. The test loader uses twice the
        training batch size and does not shuffle.
    """
    # NOTE(review): the gate uses the global rank, so on a multi-node run only one
    # process overall downloads first — confirm that is intended (vs. one per node).
    is_rank_zero = idist.get_rank() == 0

    if not is_rank_zero:
        # Every other rank waits here until rank 0 has fetched the data.
        idist.barrier()

    data_root = config.get("data_path", ".")
    train_dataset, test_dataset = get_train_test_datasets(data_root)

    if is_rank_zero:
        # Rank 0 is done: release the ranks waiting above.
        idist.barrier()

    batch_size = config.get("batch_size", 512)
    num_workers = config.get("num_workers", 8)

    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
        drop_last=True,
    )
    # Recorded for later consumers of `config` (read back as config["num_iters_per_epoch"]).
    config["num_iters_per_epoch"] = len(train_loader)

    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * batch_size,
        num_workers=num_workers,
        shuffle=False,
    )
    return train_loader, test_loader

In [None]:
def initialize(config):
    """Create the model, optimizer, loss and learning-rate scheduler.

    Args:
        config: Dict providing ``model`` and ``num_iters_per_epoch`` (both required)
            and optionally ``learning_rate`` (default 0.1), ``momentum`` (default 0.9)
            and ``weight_decay`` (default 1e-5).

    Returns:
        A ``(model, optimizer, criterion, lr_scheduler)`` tuple. The model and the
        optimizer are wrapped with ``idist.auto_model`` / ``idist.auto_optim``, and
        the loss is moved to ``idist.device()``.

    Raises:
        KeyError: If ``model`` or ``num_iters_per_epoch`` is missing from ``config``.
    """
    net = idist.auto_model(get_model(config["model"]))

    sgd_options = {
        "lr": config.get("learning_rate", 0.1),
        "momentum": config.get("momentum", 0.9),
        "weight_decay": config.get("weight_decay", 1e-5),
        "nesterov": True,
    }
    sgd = idist.auto_optim(optim.SGD(net.parameters(), **sgd_options))

    loss_fn = nn.CrossEntropyLoss().to(idist.device())

    # The step size equals the number of iterations in one epoch, so a scheduler
    # stepped once per iteration multiplies the learning rate by 0.9 once per epoch.
    iters_per_epoch = config["num_iters_per_epoch"]
    scheduler = StepLR(sgd, step_size=iters_per_epoch, gamma=0.9)

    return net, sgd, loss_fn, scheduler

In [None]:
def create_trainer(model, optimizer, criterion, lr_scheduler, config):
    """Build the training ``Engine`` and attach the rank-0 handlers.

    Args:
        model: Network to optimize; switched to train mode on every step.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable invoked as ``criterion(y_pred, y)``.
        lr_scheduler: Scheduler stepped once per batch, right after the optimizer.
        config: Dict read for ``output_path`` (default ``"output"``), the directory
            that receives ``checkpoint.pt``.

    Returns:
        The configured ``Engine``. On rank 0 it also saves the model state dict
        every 200 iterations and shows a progress bar with the batch loss.
    """

    def train_step(engine, batch):
        # Move the (inputs, targets) pair to the device of the current process.
        x, y = batch[0].to(idist.device()), batch[1].to(idist.device())

        model.train()
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # The scalar loss becomes the engine output consumed by the progress bar.
        return loss.item()

    trainer = Engine(train_step)

    if idist.get_rank() == 0:
        @trainer.on(Events.ITERATION_COMPLETED(every=200))
        def save_checkpoint():
            output_dir = Path(config.get("output_path", "output"))
            # Create the directory here instead of relying on some other component
            # (e.g. a logger) having created it before the first checkpoint is due.
            output_dir.mkdir(parents=True, exist_ok=True)
            torch.save(model.state_dict(), output_dir / "checkpoint.pt")

        # Progress display
        ProgressBar().attach(trainer, output_transform=lambda x: {"batch loss": x})

    return trainer

In [None]:
def training(local_rank, config):
    """Per-process training entry point, run through ``idist.Parallel.run``.

    Args:
        local_rank: Local rank supplied by the launcher; not used in the body.
        config: Dict of settings shared with ``get_dataflow``, ``initialize`` and
            ``create_trainer``; also read for ``output_path`` (default ``"output"``)
            and ``max_epochs`` (default 3).
    """
    # Data loaders, then model / optimizer / loss / scheduler.
    train_loader, val_loader = get_dataflow(config)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Trainer and evaluator.
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, config)
    evaluator = create_supervised_evaluator(model, metrics={"accuracy": Accuracy()}, device=idist.device())

    # Epoch-level event: every process runs validation every 3 epochs; only rank 0 prints.
    @trainer.on(Events.EPOCH_COMPLETED(every=3))
    def evaluate_model():
        state = evaluator.run(val_loader)
        if idist.get_rank() == 0:
            print(state.metrics)

    # Logging: the TensorBoard logger exists on rank 0 only.
    tb_logger = None
    if idist.get_rank() == 0:
        tb_logger = common.setup_tb_logging(
            config.get("output_path", "output"), trainer, optimizer, evaluators={"validation": evaluator},
        )

    # Run the trainer; close the logger even when the run raises so it is not leaked.
    try:
        trainer.run(train_loader, max_epochs=config.get("max_epochs", 3))
    finally:
        if tb_logger is not None:
            tb_logger.close()

In [None]:

# Launch plans, each a (backend, nproc_per_node) pair. The conditions are checked
# independently and every plan whose condition holds is executed, in the order listed.
is_main = __name__ == "__main__"
launch_plans = []

# Plain run: neither a Colab TPU runtime nor a torch launcher was detected.
# backend may be None or "nccl", "gloo", "xla-tpu", ...; nproc_per_node may be None
# or N to spawn N processes.
if is_main and not (in_colab or with_torch_launch):
    launch_plans.append((None, None))

# GPU: started through a torch launcher (WORLD_SIZE is set), so no processes are spawned here.
if is_main and with_torch_launch:
    launch_plans.append(("nccl", None))

# TPU: Colab TPU runtime, 8 processes.
# (Original author's note: 8 TPU — 4 cores per chip, up to 2 chips supported.)
if in_colab:
    launch_plans.append(("xla-tpu", 8))

for backend, nproc_per_node in launch_plans:
    # A fresh config per run: the training code writes additional keys into it.
    config = {
        "model": "resnet18",
        "dataset": "cifar10",
    }

    with idist.Parallel(backend=backend, nproc_per_node=nproc_per_node) as parallel:
        parallel.run(training, config)

In [None]:
# Post-mortem debugging helper left over from development. Kept commented out so that
# a top-to-bottom run does not invoke the interactive debugger; uncomment and run this
# cell manually right after a cell has raised.
# %debug

In [None]:
# Load the TensorBoard notebook extension, then open the dashboard on the "output"
# directory (the default `output_path` used by the training code in this notebook).
%load_ext tensorboard
%tensorboard --logdir=output