# Benchmarking basic CNN training

In [1]:
import time
import random
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import models

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Benchmark configuration shared by the cells below.
device = 0        # passed to `.to(device)`; an integer selects that CUDA device index
num_iters = 10    # batches per epoch (dataset length is batch_size * num_iters)
batch_size = 128  # samples per batch


class SyntheticDataset(Dataset):
    """Map-style dataset that fabricates a random image/label pair per lookup.

    Nothing is stored: each access draws a new sample, so the index is ignored
    and repeated lookups of the same index return different data.
    """

    def __len__(self):
        """Return the epoch size, ``batch_size * num_iters`` (notebook globals)."""
        return batch_size * num_iters

    def __getitem__(self, idx):
        """Return ``(image, label)``: a 3x224x224 normal tensor and an int in [0, 999]."""
        image = torch.randn(3, 224, 224)
        label = random.randint(0, 999)
        return image, label
    

# Input pipeline: synthetic samples served in fixed order by 12 worker processes.
train_set = SyntheticDataset()
train_loader = DataLoader(
    train_set, batch_size=batch_size, num_workers=12, shuffle=False
)

In [3]:
# ResNet-50 built with torchvision's defaults, moved to the benchmark device.
# `Module.to` returns the module itself, so the calls can be chained.
model = models.resnet50().to(device)

In [4]:
# Plain SGD over all model parameters; `benchmark_step` reads this global.
optimizer = optim.SGD(
    params=model.parameters(),
    lr=0.01,
)

In [5]:
def benchmark_step(model, imgs, labels):
    """Run one training step: forward, cross-entropy loss, backward, update.

    Uses the notebook-level ``optimizer`` rather than taking it as an argument,
    so that global must already exist when this is called.

    Args:
        model: callable applied to ``imgs`` to produce the predictions.
        imgs: input batch passed to ``model``.
        labels: targets passed to ``F.cross_entropy`` alongside the predictions.

    Returns:
        None. The loss is not returned.
    """
    optimizer.zero_grad()
    loss = F.cross_entropy(model(imgs), labels)
    loss.backward()
    optimizer.step()

In [6]:
# Time `num_epochs` passes over the synthetic loader and report throughput.
num_epochs = 5
imgs_sec = []  # images/sec measured for each epoch
for epoch in range(num_epochs):
    # CUDA kernels are launched asynchronously, so the host clock only bounds
    # the GPU work if the device is drained before each timestamp is taken.
    torch.cuda.synchronize(device)
    t0 = time.perf_counter()  # monotonic clock, suited to interval timing
    for imgs, labels in train_loader:
        imgs = imgs.to(device)
        labels = labels.to(device)
        benchmark_step(model, imgs, labels)

    # Wait for the last step's kernels to finish before stopping the timer.
    torch.cuda.synchronize(device)
    dt = time.perf_counter() - t0
    # One epoch processes batch_size * num_iters images (the dataset length).
    imgs_sec.append(batch_size * num_iters / dt)

    print(f' * Epoch {epoch:2d}: '
          f'{imgs_sec[epoch]:.2f} images/sec per GPU')

 * Epoch  0: 167.32 images/sec per GPU
 * Epoch  1: 220.69 images/sec per GPU
 * Epoch  2: 220.48 images/sec per GPU
 * Epoch  3: 220.67 images/sec per GPU
 * Epoch  4: 220.88 images/sec per GPU


## Now convert the above script to use DistributedDataParallel

In [1]:
import ipcmagic

In [2]:
# Start two IPython-parallel engines; the `%%px` cells below run on each of them.
%ipcluster start -n 2

100%|██████████| 2/2 [00:24<00:00, 12.21s/engine]


In [3]:
# NOTE(review): -1 presumably disables the progress bar for `%%px` cells — confirm in the ipyparallel magics docs.
%pxconfig --progress-after -1

In [4]:
%%px
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, Dataset, DistributedSampler
from pt_distr_env import DistributedEnviron

  from .autonotebook import tqdm as notebook_tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
%%px
# Join the distributed process group on every engine.
# NOTE(review): DistributedEnviron comes from the project-local `pt_distr_env`
# module; it presumably prepares the rendezvous settings that
# `init_process_group` reads from the environment — confirm.
distr_env = DistributedEnviron()
dist.init_process_group(backend="nccl")  # NCCL backend for GPU collectives
world_size = dist.get_world_size()  # number of processes in the group
rank = dist.get_rank()  # this process's index within the group
device = distr_env.local_rank  # local rank reported by DistributedEnviron, used as the device index

[stderr:0] [W socket.cpp:401] [c10d] The server socket cannot be initialized on [::]:39591 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [nid03172]:39591 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [nid03172]:39591 (errno: 97 - Address family not supported by protocol).


[stderr:1] [W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [nid03172]:39591 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:558] [c10d] The client socket cannot be initialized to connect to [nid03172]:39591 (errno: 97 - Address family not supported by protocol).


In [7]:
%%px

# Per-rank benchmark configuration.
batch_size_per_gpu = 128  # samples per batch on each rank
num_iters = 10            # batches per epoch on each rank
# `device` is deliberately NOT reassigned here: the process-group setup cell
# already bound it to `distr_env.local_rank`. Hard-coding 0 would overwrite
# that and make every rank sharing a node target GPU 0.


class SyntheticDataset(Dataset):
    """Map-style dataset that fabricates a random image/label pair per lookup.

    Nothing is stored: each access draws a new sample, so the index is ignored.
    The length covers all ranks, so that a sampler splitting it evenly leaves
    ``batch_size_per_gpu * num_iters`` samples per rank.
    """

    def __len__(self):
        """Return ``batch_size_per_gpu * num_iters * world_size`` (globals)."""
        return batch_size_per_gpu * num_iters * world_size

    def __getitem__(self, idx):
        """Return ``(image, label)``: a 3x224x224 normal tensor and an int in [0, 999]."""
        image = torch.randn(3, 224, 224)
        label = random.randint(0, 999)
        return image, label
    

# Global dataset; its length is printed on every rank as a sanity check.
train_set = SyntheticDataset()
print(len(train_set))

# The sampler hands each rank its own slice of the indices, in fixed order.
train_sampler = DistributedSampler(
    train_set, num_replicas=world_size, rank=rank, seed=42, shuffle=False
)

# Per-rank loader: batches of `batch_size_per_gpu` drawn through the sampler.
train_loader = DataLoader(
    train_set,
    sampler=train_sampler,
    batch_size=batch_size_per_gpu,
    num_workers=12,
    shuffle=False,
)

[stdout:1] 2560


[stdout:0] 2560


In [8]:
%%px
import time
import random
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import models

In [9]:
%%px

# Build ResNet-50 on this rank's device, then wrap it for data-parallel training.
# `Module.to` returns the module itself, so the calls can be chained.
_model = models.resnet50().to(device)
ddp_model = DistributedDataParallel(_model, device_ids=[device])

[stderr:0] libibverbs: Could not locate libibgni (/usr/lib64/libibgni.so.1: undefined symbol: verbs_uninit_context)


[stderr:1] libibverbs: Could not locate libibgni (/usr/lib64/libibgni.so.1: undefined symbol: verbs_uninit_context)


In [10]:
%%px
# Plain SGD over the wrapped model's parameters; `benchmark_step` reads this global.
optimizer = optim.SGD(
    params=ddp_model.parameters(),
    lr=0.01,
)

In [11]:
%%px
def benchmark_step(model, imgs, labels):
    """Run one training step: forward, cross-entropy loss, backward, update.

    Uses the engine-level ``optimizer`` rather than taking it as an argument,
    so that global must already exist when this is called.

    Args:
        model: callable applied to ``imgs`` to produce the predictions.
        imgs: input batch passed to ``model``.
        labels: targets passed to ``F.cross_entropy`` alongside the predictions.

    Returns:
        None. The loss is not returned.
    """
    optimizer.zero_grad()
    loss = F.cross_entropy(model(imgs), labels)
    loss.backward()
    optimizer.step()

In [12]:
%%px

# Time `num_epochs` passes over this rank's loader and report per-GPU throughput.
num_epochs = 5
imgs_sec = []  # images/sec measured on this rank for each epoch
for epoch in range(num_epochs):
    # CUDA kernels are launched asynchronously, so the host clock only bounds
    # the GPU work if the device is drained before each timestamp is taken.
    torch.cuda.synchronize(device)
    t0 = time.perf_counter()  # monotonic clock, suited to interval timing
    for imgs, labels in train_loader:
        imgs = imgs.to(device)
        labels = labels.to(device)
        benchmark_step(ddp_model, imgs, labels)

    # Wait for the last step's kernels to finish before stopping the timer.
    torch.cuda.synchronize(device)
    dt = time.perf_counter() - t0
    # Each rank processes batch_size_per_gpu * num_iters images per epoch.
    imgs_sec.append(batch_size_per_gpu * num_iters / dt)

    print(f' * Epoch {epoch:2d}: '
          f'{imgs_sec[epoch]:.2f} images/sec per GPU')

[stdout:0]  * Epoch  0: 160.79 images/sec per GPU
 * Epoch  1: 212.97 images/sec per GPU
 * Epoch  2: 212.14 images/sec per GPU
 * Epoch  3: 212.06 images/sec per GPU
 * Epoch  4: 212.28 images/sec per GPU


[stdout:1]  * Epoch  0: 160.74 images/sec per GPU
 * Epoch  1: 213.02 images/sec per GPU
 * Epoch  2: 212.23 images/sec per GPU
 * Epoch  3: 211.92 images/sec per GPU
 * Epoch  4: 212.26 images/sec per GPU


In [13]:
# Shut down the engines launched by `%ipcluster start`.
%ipcluster stop

IPCluster stopped.
