In [1]:
import os
import random
import shutil
import time
import warnings

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim

import torch.utils.data
from torch.utils.tensorboard import SummaryWriter
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torch.distributed as dist

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Utility functions.
class AverageMeter(object):
    """Track the latest value of a metric together with its running mean.

    ``val`` is the most recent value, ``sum`` the weighted total, ``count``
    the total weight seen so far and ``avg`` is ``sum / count``.
    """

    def __init__(self, name, fmt=':f'):
        # ``name`` labels the meter when printed; ``fmt`` is the format-spec
        # suffix (e.g. ':6.3f') applied to both ``val`` and ``avg``.
        self.name, self.fmt = name, fmt
        self.reset()

    def reset(self):
        """Zero every running statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        """Render as ``"<name> <val> (<avg>)"`` using the configured spec."""
        template = ''.join(['{name} {val', self.fmt, '} ({avg', self.fmt, '})'])
        return template.format(**vars(self))
class ProgressMeter(object):
    """Print one tab-separated status line for a group of meters."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the prefix, the ``[batch/total]`` counter and every meter."""
        header = self.prefix + self.batch_fmtstr.format(batch)
        print('\t'.join([header, *map(str, self.meters)]))

    def _get_batch_fmtstr(self, num_batches):
        """Build the ``[{:Nd}/total]`` template; N is the digit count of the total."""
        width = len(str(num_batches // 1))
        slot = '{:%dd}' % width
        return '[{}/{}]'.format(slot, slot.format(num_batches))
def accuracy(output, target, topk=(1,)):
    """Return the top-k accuracy, as a percentage, for each k in ``topk``.

    ``output`` is ranked along dimension 1 and compared against ``target``
    (one label per row). The result is a list holding one single-element
    tensor per requested k, in the same order as ``topk``.
    """
    with torch.no_grad():
        largest_k = max(topk)
        n_samples = target.size(0)

        # Indices of the best ``largest_k`` entries per row, transposed so that
        # row r holds every sample's (r+1)-th ranked guess.
        ranked = output.topk(largest_k, 1, True, True)[1].t()
        hits = ranked.eq(target.view(1, -1).expand_as(ranked))

        # Fraction of samples -> percentage.
        scale = 100.0 / n_samples
        return [
            hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(scale)
            for k in topk
        ]

In [3]:
# Seed the RNGs this notebook uses so that runs are repeatable.
SEED = 1
random.seed(SEED)        # Python's built-in RNG.
torch.manual_seed(SEED)  # PyTorch RNGs.
# Ask cuDNN for deterministic kernels and keep its autotuner off: with
# benchmarking enabled cuDNN may pick different algorithms from run to run.
cudnn.deterministic = True
cudnn.benchmark = False

In [4]:
# Setup TensorBoard
# Event files are written under /data/logs. The train() and validate()
# functions defined later log through this module-level `writer`.
writer = SummaryWriter(log_dir="/data/logs")

In [5]:
# Pick the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device  # Bare expression so the notebook echoes the selected device.

device(type='cuda')

In [6]:
# Set up torch DistributedDataParallel (DDP).
WORLD_SIZE = 2  # Number of nodes in cluster.
BACKEND = "nccl"  # DDP backend.
URL = os.environ["MASTER_URL"]  # Address of master node (KeyError if unset).
RANK = 0  # Rank of this node (master).
# Initialize the distributed process group. The default group can only be
# created once per process, so skip the call when this cell is re-run in a
# live kernel instead of raising.
if not dist.is_initialized():
    dist.init_process_group(init_method=URL, backend=BACKEND, world_size=WORLD_SIZE, rank=RANK)

In [7]:
# Per-channel statistics passed to transforms.Normalize, and the edge length
# (in pixels) that images are resized to before being fed to the model.
# NOTE(review): STD_RGB equals the commonly quoted ImageNet std, but MEAN_RGB
# is not the usual ImageNet mean (0.485, 0.456, 0.406) - presumably measured
# on this copy of the data; confirm.
MEAN_RGB = [0.47889522, 0.47227842, 0.43047404]
STD_RGB = [0.229, 0.224, 0.225]
IMG_SIZE = 128

In [8]:
# Dataset locations and loader settings.
TRAINDIR = "/data/train" # ImageNet train.
VALDIR = "/data/val" # ImageNet val.
BATCH_SIZE = 64
TRAIN_WORKERS = 6
VAL_WORKERS = 0
# Forwarded to DataLoader(shuffle=...). Ordering is delegated to the
# DistributedSampler, and DataLoader does not accept shuffle=True together
# with an explicit sampler, so this has to stay False.
SHUFFLE = False


def _resize_and_normalize():
    """Build the preprocessing pipeline: resize, convert to tensor, normalize."""
    return transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(MEAN_RGB, STD_RGB),
    ])


def _distributed_loader(dataset, workers):
    """Wrap ``dataset`` in a DataLoader fed by its own DistributedSampler."""
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        num_workers=workers,
        shuffle=SHUFFLE,
        pin_memory=True,
        # Sample data across nodes.
        sampler=torch.utils.data.distributed.DistributedSampler(dataset),
    )


# Training and validation use the same preprocessing, built as two separate
# pipeline objects.
transform_train = _resize_and_normalize()
transform_val = _resize_and_normalize()

# Training data.
train_dataset = datasets.ImageFolder(TRAINDIR, transform=transform_train)
train_loader = _distributed_loader(train_dataset, TRAIN_WORKERS)

# Validation data.
val_dataset = datasets.ImageFolder(VALDIR, transform=transform_val)
val_loader = _distributed_loader(val_dataset, VAL_WORKERS)

In [9]:
# Model and optimisation hyper-parameters.
NUM_CLASSES = 1000
ARCH = 'resnet18'
LR = 1e-3
MOMENTUM = 0.9
WEIGHT_DECAY = 1e-4

# Look the architecture up by name in torchvision.models and instantiate it
# with its default arguments.
model = vars(models)[ARCH]()
# Replace the final fully connected layer with a fresh one that has
# NUM_CLASSES outputs, then move the whole network to the selected device.
inf = model.fc.in_features
model.fc = nn.Linear(in_features=inf, out_features=NUM_CLASSES)
model.to(device)

# Loss function and SGD optimizer over all model parameters.
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=LR,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY,
)

In [10]:
# Set CUDA device.
# Makes GPU 0 the current CUDA device for this process; the same id is passed
# to DistributedDataParallel as `device_ids` when the model is wrapped.
GPU_DEVICE_ID = 0
torch.cuda.set_device(GPU_DEVICE_ID)

In [11]:
# Wrap model in DDP.
# `model` is rebound to the DistributedDataParallel wrapper, restricted to the
# single GPU selected by GPU_DEVICE_ID.
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[GPU_DEVICE_ID])

In [12]:
# Print a progress line every PRINT_FREQ batches. The value is derived from
# BATCH_SIZE and clamped to at least 1 so that `i % PRINT_FREQ` in the
# train/validate loops cannot divide by zero when BATCH_SIZE is 1.
PRINT_FREQ = max(1, BATCH_SIZE - 1)
# Running count of training batches; used as the TensorBoard x-axis.
global_step = 0

In [13]:
# Define train step.
def train(train_loader, model, criterion, optimizer, epoch):
    """Run one training epoch over ``train_loader``.

    For every batch: forward pass, loss, top-1/top-5 accuracy, TensorBoard
    logging, then backward pass and optimizer step. The module-level
    ``global_step`` counter advances once per batch, and a progress line is
    printed every ``PRINT_FREQ`` batches. ``epoch`` is only used to label
    the progress line. Returns nothing.
    """
    global global_step

    # Running statistics shown in the progress line.
    step_timer = AverageMeter('Time', ':6.3f')
    load_timer = AverageMeter('Data', ':6.3f')
    loss_meter = AverageMeter('Loss', ':.4e')
    acc1_meter = AverageMeter('Acc@1', ':6.2f')
    acc5_meter = AverageMeter('Acc@5', ':6.2f')
    reporter = ProgressMeter(
        len(train_loader),
        [step_timer, load_timer, loss_meter, acc1_meter, acc5_meter],
        prefix="Epoch: [{}]".format(epoch))

    # Put the model in training mode.
    model.train()

    tick = time.time()
    for step, (images, target) in enumerate(train_loader):
        # Time spent waiting for the loader to hand over this batch.
        load_timer.update(time.time() - tick)

        # Move the batch to the selected device.
        images = images.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)
        n_images = images.size(0)

        # Forward pass and loss.
        output = model(images)
        loss = criterion(output, target)

        # Update the running loss / accuracy, weighted by batch size.
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        loss_meter.update(loss.item(), n_images)
        acc1_meter.update(acc1[0], n_images)
        acc5_meter.update(acc5[0], n_images)

        # TensorBoard: this batch's loss plus the running accuracy averages.
        for tag, value in (
            ("Loss/train", loss),
            ("Acc1/train", acc1_meter.avg),
            ("Acc5/train", acc5_meter.avg),
        ):
            writer.add_scalar(tag, value, global_step=global_step)
        global_step += 1

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Wall-clock time for the whole step, data loading included.
        step_timer.update(time.time() - tick)
        tick = time.time()

        if step % PRINT_FREQ == 0:
            reporter.display(step)

In [14]:
# Define validation step.
def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and log the results.

    Runs the model in eval mode without gradient tracking, accumulating the
    loss and top-1/top-5 accuracy over all batches. A progress line is
    printed every ``PRINT_FREQ`` batches and a summary line at the end. The
    averaged loss and accuracies are written to TensorBoard at the current
    ``global_step``, which this function reads but does not advance.

    NOTE(review): no cross-process reduction happens here, so the numbers
    describe only the batches this process's loader yields.

    Returns:
        The average top-1 accuracy (``top1.avg``) over the batches seen.
    """
    global global_step
    # Keep progress of metrics.
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # Switch to evaluate mode.
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):

            # Move data to GPU if CUDA device is available.
            images = images.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)

            # Compute output.
            output = model(images)
            loss = criterion(output, target)

            # Measure accuracy and record loss, weighted by batch size.
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # Measure elapsed time.
            batch_time.update(time.time() - end)
            end = time.time()

            if i % PRINT_FREQ == 0:
                progress.display(i)

        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    # Write tensorboard logs. Log the average loss over the whole pass rather
    # than the last batch's loss: it is comparable with the averaged
    # accuracies and is still defined when the loader yields no batches.
    writer.add_scalar("Loss/val", losses.avg, global_step=global_step)
    writer.add_scalar("Acc1/val", top1.avg, global_step=global_step)
    writer.add_scalar("Acc5/val", top5.avg, global_step=global_step)

    return top1.avg

In [15]:
EPOCHS = 1  # Number of passes over the training set.
for epoch in range(EPOCHS):
    # Tell the distributed sampler which epoch this is, so that its shuffle
    # differs from one epoch to the next instead of replaying the same order.
    train_sampler = getattr(train_loader, "sampler", None)
    if hasattr(train_sampler, "set_epoch"):
        train_sampler.set_epoch(epoch)

    # train for one epoch
    train(train_loader, model, criterion, optimizer, epoch)

    # evaluate on validation set
    validate(val_loader, model, criterion)
    

Epoch: [0][    0/10010]	Time  1.297 ( 1.297)	Data  0.781 ( 0.781)	Loss 6.9688e+00 (6.9688e+00)	Acc@1   0.00 (  0.00)	Acc@5   1.56 (  1.56)
Epoch: [0][   63/10010]	Time  0.070 ( 0.117)	Data  0.000 ( 0.049)	Loss 7.0033e+00 (7.0129e+00)	Acc@1   0.00 (  0.10)	Acc@5   0.00 (  0.54)
Epoch: [0][  126/10010]	Time  0.283 ( 0.109)	Data  0.260 ( 0.047)	Loss 6.9017e+00 (6.9989e+00)	Acc@1   0.00 (  0.15)	Acc@5   0.00 (  0.64)
Epoch: [0][  189/10010]	Time  0.062 ( 0.104)	Data  0.000 ( 0.044)	Loss 6.9363e+00 (6.9832e+00)	Acc@1   1.56 (  0.16)	Acc@5   1.56 (  0.65)
Epoch: [0][  252/10010]	Time  0.060 ( 0.103)	Data  0.000 ( 0.043)	Loss 6.9364e+00 (6.9719e+00)	Acc@1   0.00 (  0.16)	Acc@5   0.00 (  0.66)
Epoch: [0][  315/10010]	Time  0.211 ( 0.103)	Data  0.180 ( 0.044)	Loss 6.8801e+00 (6.9616e+00)	Acc@1   0.00 (  0.15)	Acc@5   3.12 (  0.73)
Epoch: [0][  378/10010]	Time  0.062 ( 0.103)	Data  0.000 ( 0.046)	Loss 6.8673e+00 (6.9527e+00)	Acc@1   0.00 (  0.19)	Acc@5   0.00 (  0.79)
Epoch: [0][  441/10010]	Tim

Epoch: [0][ 3717/10010]	Time  0.137 ( 0.096)	Data  0.112 ( 0.038)	Loss 6.2300e+00 (6.5915e+00)	Acc@1   1.56 (  0.99)	Acc@5   6.25 (  3.52)
Epoch: [0][ 3780/10010]	Time  0.076 ( 0.096)	Data  0.000 ( 0.038)	Loss 6.1139e+00 (6.5843e+00)	Acc@1   1.56 (  1.01)	Acc@5   6.25 (  3.59)
Epoch: [0][ 3843/10010]	Time  0.226 ( 0.096)	Data  0.204 ( 0.038)	Loss 5.9648e+00 (6.5780e+00)	Acc@1   4.69 (  1.02)	Acc@5   4.69 (  3.64)
Epoch: [0][ 3906/10010]	Time  0.061 ( 0.096)	Data  0.000 ( 0.038)	Loss 6.1400e+00 (6.5715e+00)	Acc@1   4.69 (  1.04)	Acc@5  10.94 (  3.69)
Epoch: [0][ 3969/10010]	Time  0.149 ( 0.096)	Data  0.129 ( 0.038)	Loss 6.0843e+00 (6.5645e+00)	Acc@1   3.12 (  1.06)	Acc@5   7.81 (  3.75)
Epoch: [0][ 4032/10010]	Time  0.084 ( 0.096)	Data  0.000 ( 0.038)	Loss 6.0619e+00 (6.5578e+00)	Acc@1   3.12 (  1.07)	Acc@5   7.81 (  3.80)
Epoch: [0][ 4095/10010]	Time  0.065 ( 0.096)	Data  0.000 ( 0.038)	Loss 6.0535e+00 (6.5512e+00)	Acc@1   4.69 (  1.09)	Acc@5   6.25 (  3.85)
Epoch: [0][ 4158/10010]	Tim

Epoch: [0][ 7434/10010]	Time  0.071 ( 0.095)	Data  0.000 ( 0.036)	Loss 5.6031e+00 (6.2440e+00)	Acc@1   0.00 (  2.07)	Acc@5  12.50 (  6.80)
Epoch: [0][ 7497/10010]	Time  0.206 ( 0.095)	Data  0.160 ( 0.036)	Loss 5.5132e+00 (6.2389e+00)	Acc@1   9.38 (  2.09)	Acc@5  17.19 (  6.86)
Epoch: [0][ 7560/10010]	Time  0.065 ( 0.095)	Data  0.000 ( 0.036)	Loss 6.0552e+00 (6.2341e+00)	Acc@1   0.00 (  2.11)	Acc@5   4.69 (  6.91)
Epoch: [0][ 7623/10010]	Time  0.244 ( 0.095)	Data  0.222 ( 0.036)	Loss 5.6354e+00 (6.2290e+00)	Acc@1   1.56 (  2.13)	Acc@5   9.38 (  6.97)
Epoch: [0][ 7686/10010]	Time  0.213 ( 0.095)	Data  0.187 ( 0.036)	Loss 6.3825e+00 (6.2244e+00)	Acc@1   4.69 (  2.14)	Acc@5   9.38 (  7.02)
Epoch: [0][ 7749/10010]	Time  0.068 ( 0.095)	Data  0.001 ( 0.036)	Loss 5.7323e+00 (6.2196e+00)	Acc@1   4.69 (  2.16)	Acc@5  15.62 (  7.08)
Epoch: [0][ 7812/10010]	Time  0.247 ( 0.095)	Data  0.222 ( 0.036)	Loss 5.6324e+00 (6.2147e+00)	Acc@1   6.25 (  2.18)	Acc@5  15.62 (  7.13)
Epoch: [0][ 7875/10010]	Tim