# 노트북에서 Training Operator를 활용하여 병렬 학습 모델 구현하기

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from kakaocloud_kbm.training import TrainingClient
from kakaocloud_kbm.training.utils.utils import get_default_target_namespace

## Fashion MNIST CNN 모델 학습 함수 선언

- Training Job에 전달할 간단한 CNN 모델 학습 함수를 선언합니다.
- 이 함수에는 torchvision 패키지를 통해 Fashion MNIST 데이터를 다운로드하는 코드가 포함되어 있습니다.

In [2]:
def train_pytorch_model():
    """Train a small CNN on Fashion MNIST with DistributedDataParallel.

    The function takes no arguments and returns nothing. It reads the
    ``RANK`` and ``WORLD_SIZE`` environment variables, joins a ``gloo``
    process group, downloads Fashion MNIST into ``./data`` and trains for a
    single epoch, logging the loss every 10 batches.

    Every import and helper lives inside the function body on purpose: the
    function object is handed to the training client as ``func=...``, so it
    is presumably shipped to and executed inside the job's pods. Keep it
    self-contained (no references to notebook-level names).

    Raises:
        KeyError: if ``RANK`` or ``WORLD_SIZE`` is not set in the environment.
        ValueError: if either variable is not an integer.
    """
    import logging
    import os

    import torch
    import torch.distributed as dist
    import torch.nn.functional as F
    from torch import nn
    from torch.utils.data import DataLoader, DistributedSampler
    from torchvision import datasets, transforms

    logging.basicConfig(
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%SZ",
        level=logging.DEBUG,
    )

    # Create PyTorch CNN Model.
    class Net(nn.Module):
        """Two conv layers followed by two fully connected layers (10 classes)."""

        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(1, 20, 5, 1)
            self.conv2 = nn.Conv2d(20, 50, 5, 1)
            self.fc1 = nn.Linear(4 * 4 * 50, 500)
            self.fc2 = nn.Linear(500, 10)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            x = F.max_pool2d(x, 2, 2)
            x = F.relu(self.conv2(x))
            x = F.max_pool2d(x, 2, 2)
            # Flatten to match the input width of fc1.
            x = x.view(-1, 4 * 4 * 50)
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return F.log_softmax(x, dim=1)

    # Get dist parameters.
    # Kubeflow Training Operator automatically set appropriate RANK and
    # WORLD_SIZE based on the configuration.
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    dist.init_process_group(backend="gloo", rank=rank, world_size=world_size)
    try:
        # Attach model to DistributedDataParallel strategy.
        model = nn.parallel.DistributedDataParallel(Net())

        # Split the global batch of 128 across workers; never go below 1,
        # because DataLoader rejects a batch size of 0.
        batch_size = max(1, 128 // world_size)

        # Get Fashion MNIST DataSet.
        dataset = datasets.FashionMNIST(
            "./data",
            train=True,
            download=True,
            transform=transforms.Compose([transforms.ToTensor()]),
        )
        # Give each rank its own shard of the data. Without a distributed
        # sampler every worker would iterate the complete dataset and the
        # workers would simply repeat each other's work.
        sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
        train_loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)

        # Build the optimizer once so momentum state is kept across epochs.
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

        # Start Training.
        logging.info(f"Start training for RANK: {rank}. WORLD_SIZE: {world_size}")
        for epoch in range(1):
            # Re-seed the sampler's shuffle so each epoch uses a new order.
            sampler.set_epoch(epoch)
            model.train()

            for batch_idx, (data, target) in enumerate(train_loader):
                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()
                if batch_idx % 10 == 0:
                    # Progress is reported against this rank's shard size.
                    logging.info(
                        "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
                            epoch,
                            batch_idx * len(data),
                            len(sampler),
                            100.0 * batch_idx / len(train_loader),
                            loss.item(),
                        )
                    )
    finally:
        # Release the process group even when training fails.
        dist.destroy_process_group()

## Training Job 실행

In [3]:
# VARIABLES
# Namespace and name used for the PyTorchJob in the cells below.
my_namespace = get_default_target_namespace()
pytorchjob_name = "parallel-train-pytorch"

# Resource limits handed to the job as limit_resources:
# one MIG 1g.10gb GPU slice, 1 CPU and 2G of memory.
gpu_mig_for_1ea = {"nvidia.com/mig-1g.10gb": "1", "cpu": "1", "memory": "2G"}

# Set this to (number of free MIG instances - 1): one instance is used by
# the master and the remaining n by the workers.
num_workers = 2

In [4]:
# Client used by the following cells to create, inspect and delete the job.
training_client = TrainingClient()

# Submit train_pytorch_model as a PyTorchJob using the variables set above.
job_options = {
    "name": pytorchjob_name,
    "namespace": my_namespace,
    "func": train_pytorch_model,
    "num_worker_replicas": num_workers,
    "limit_resources": gpu_mig_for_1ea,
}
training_client.create_pytorchjob_from_func(**job_options)

[I 240115 15:00:25 utils:97] PyTorchJob kbm-u-jkson/parallel-train-pytorch has been created


## Training Job 상태 확인

In [12]:
# STATUS DETAILS
# Print the list of condition records reported for the job.
job_conditions = training_client.get_job_conditions(
    name=pytorchjob_name, job_kind='PyTorchJob'
)
print(job_conditions)

# RUN CHECK
# Print whether the job is currently in the running state.
job_is_running = training_client.is_job_running(
    name=pytorchjob_name, job_kind='PyTorchJob'
)
print(f"Is job running: {job_is_running}")

[{'last_transition_time': datetime.datetime(2024, 1, 15, 6, 0, 25, tzinfo=tzutc()),
 'last_update_time': datetime.datetime(2024, 1, 15, 6, 0, 25, tzinfo=tzutc()),
 'message': 'PyTorchJob parallel-train-pytorch is created.',
 'reason': 'PyTorchJobCreated',
 'status': 'True',
 'type': 'Created'}, {'last_transition_time': datetime.datetime(2024, 1, 15, 6, 2, 17, tzinfo=tzutc()),
 'last_update_time': datetime.datetime(2024, 1, 15, 6, 2, 17, tzinfo=tzutc()),
 'message': 'PyTorchJob parallel-train-pytorch is running.',
 'reason': 'JobRunning',
 'status': 'True',
 'type': 'Running'}]
Is job running: True


## 학습 Pod 확인

In [11]:
# List the names of the pods that belong to the job; the notebook displays
# the returned value as the cell output.
training_client.get_job_pod_names(pytorchjob_name)

['parallel-train-pytorch-master-0',
 'parallel-train-pytorch-worker-0',
 'parallel-train-pytorch-worker-1']

## 로그 출력

In [16]:
# Fetch the logs of the job's "pytorch" container.
training_client.get_job_logs(pytorchjob_name, container="pytorch")

[I 230615 18:53:48 training_client:577] The logs of pod parallel-train-pytorch-master-0:
     2023-06-15T09:26:13Z INFO     Added key: store_based_barrier_key:1 to store for rank: 0
    2023-06-15T09:26:13Z INFO     Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 3 nodes.
    Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
    Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz
100%|██████████| 26421880/26421880 [00:03<00:00, 6743589.41it/s] 
    Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw
    
    Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
    Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz
100%|██████████| 2

## Training Job 삭제

In [17]:
# Delete the PyTorchJob once it is no longer needed.
training_client.delete_pytorchjob(pytorchjob_name)

[I 230615 18:53:54 utils:187] PyTorchJob kbm-your-namespace/parallel-train-pytorch has been deleted
