# Template Code for PyTorch

1. Normal Template Code
2. Distributed/Parallel Template Code

<div class="alert alert-block alert-info">
1 -> 2 Transformation Methods
</div>

## Load Libraries

### Load Basic Modules

In [None]:
from __future__ import print_function

### Load PyTorch Modules

In [48]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

### Load Module to get arguments

In [None]:
import argparse

### Load PyTorch Modules for distributed/parallel

In [None]:
import torch.multiprocessing as mp
import torch.utils.data.distributed
import horovod.torch as hvd

### Make Network Class

In [None]:
class Net(nn.Module):
    """Small convolutional classifier with 10 output classes.

    Two 5x5 convolutions (each followed by 2x2 max-pooling and ReLU) feed two
    fully connected layers. The flatten step expects 320 features per sample,
    which is what a 1x28x28 input produces (20 channels * 4 * 4).
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10)."""
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        # Functional dropout must be told explicitly whether we are training.
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        # Normalise over the class axis; omitting `dim` relies on a deprecated
        # implicit choice and emits a warning.
        return F.log_softmax(x, dim=1)

### Hyperparameters Settings

In [None]:
##### TODO: This is arguments set #####
def set_hyperparameters():
    """Define the run configuration and publish it as module-level names.

    The rest of this file reads `cuda`, `seed`, `batch_size`, `use_horovod`,
    etc. as bare global names, so the settings are written into the module
    namespace instead of being left as locals that vanish on return.

    Returns:
        dict: the published settings, keyed by variable name.
    """
    ##### CUDA SETTING #####
    no_cuda = False
    settings = {
        'no_cuda': no_cuda,
        'cuda': not no_cuda and torch.cuda.is_available(),
        #########################
        'batch_size': 64,
        'test_batch_size': 128,
        'epochs': 10,
        'momentum': 0.5,
        'lr': 0.01,  # learning rate
        'log_interval': 10,
        # For Parallel/Distributed
        'seed': 42,
        'use_adasum': False,
        ##########################
        'use_horovod': False,
    }
    globals().update(settings)
    return settings

### Horovod Settings

In [None]:
# Horovod: initialize the library, then seed the CPU random number generator.
# NOTE(review): `seed` and `cuda` are read as module-level names here, so they
# must already be defined when this cell runs.
##### HOROVOD #####
hvd.init()
torch.manual_seed(seed)

if cuda:
    # Horovod: pin GPU to local rank.
    ##### HOROVOD #####
    torch.cuda.set_device(hvd.local_rank())
    ##### TODO:Need argument #####
    # Seed the CUDA generator with the same value as the CPU generator.
    torch.cuda.manual_seed(seed)

# Horovod: limit # of CPU threads to be used per worker.
torch.set_num_threads(1)

### Define Fit Method

In [None]:
def train(epoch):
    """Run one Horovod training epoch and log progress periodically.

    Reads `model`, `optimizer`, `train_loader`, `train_sampler`, `cuda` and
    `log_interval` from module scope.

    Args:
        epoch: epoch number; passed to the sampler and shown in the log line.
    """
    model.train()
    ##### HOROVOD ##### --- train_sampler, optimizer wrapper
    # Horovod: the sampler is told the epoch so its shuffling can vary.
    train_sampler.set_epoch(epoch)
    for step, (inputs, labels) in enumerate(train_loader):
        if cuda:
            inputs = inputs.cuda()
            labels = labels.cuda()
        optimizer.zero_grad()
        batch_loss = F.nll_loss(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()
        if step % log_interval != 0:
            continue
        ##### HOROVOD ##### --- train_sampler
        # Horovod: len(train_sampler) is the number of examples in this
        # worker's partition, not the size of the full dataset.
        seen = step * len(inputs)
        percent = 100. * step / len(train_loader)
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, seen, len(train_sampler), percent, batch_loss.item()))

### Normal Version

In [None]:
def fit(epochs, model, loss_func, optimizer, train_loader):
    """Train `model` for `epochs` passes over `train_loader` (single process).

    Args:
        epochs: number of full passes over the data.
        model: network to optimise; switched to training mode each epoch.
        loss_func: callable `(output, target) -> scalar loss tensor`.
        optimizer: optimizer bound to the model's parameters.
        train_loader: iterable of `(data, target)` mini-batches that exposes
            `.dataset` (used for the progress line).

    The logging period is read from the module-level `log_interval`.
    """
    for epoch in range(epochs):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(data)
            loss = loss_func(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                # The non-distributed path has no sampler partition, so report
                # progress against the full dataset; epochs are shown 1-based.
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch + 1, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))

### Metric Average

In [None]:
def metric_average(val, name):
    """Combine a scalar metric across Horovod workers and return it as a number.

    Args:
        val: this worker's metric value; wrapped in a tensor before reduction.
        name: name passed to the allreduce operation.
    """
    ##### HOROVOD ##### -- allreduce (tensor average)
    reduced = hvd.allreduce(torch.tensor(val), name=name)
    return reduced.item()

### Test Function (Horovod)

In [None]:
def test():
    """Evaluate `model` on this worker's test partition and report averages.

    Reads `model`, `test_loader`, `test_sampler` and `cuda` from module scope.
    Loss and accuracy are normalised by the size of this worker's partition,
    combined across workers with `metric_average`, and printed on rank 0 only.
    """
    model.eval()
    test_loss = 0.
    test_accuracy = 0.
    # No gradients are needed for evaluation; this avoids building graphs.
    with torch.no_grad():
        for data, target in test_loader:
            if cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # sum up batch loss (`reduction='sum'` replaces the deprecated
            # `size_average=False`)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.max(1, keepdim=True)[1]
            # Accumulate as a Python float so the metric is not a tensor.
            test_accuracy += pred.eq(target.view_as(pred)).cpu().float().sum().item()

    ##### HOROVOD #####
    # Horovod: use test_sampler to determine the number of examples in
    # this worker's partition.
    test_loss /= len(test_sampler)
    test_accuracy /= len(test_sampler)

    ##### HOROVOD #####
    # Horovod: average metric values across workers.
    test_loss = metric_average(test_loss, 'avg_loss')
    test_accuracy = metric_average(test_accuracy, 'avg_accuracy')

    # Horovod: print output only on first rank.
    ##### HOROVOD #####
    if hvd.rank() == 0:
        print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
            test_loss, 100. * test_accuracy))

### Test Function(Normal)

In [None]:
def test():
    """Evaluate `model` on `test_loader` in a single process and print results.

    Reads `model` and `test_loader` from module scope. Unlike the Horovod
    variant, this version needs no sampler, no cross-worker reduction and no
    rank check: metrics are normalised by the full test set size.
    """
    model.eval()
    test_loss = 0.
    correct = 0.
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            # sum up batch loss (`reduction='sum'` replaces the deprecated
            # `size_average=False`)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).float().sum().item()

    num_samples = len(test_loader.dataset)
    test_loss /= num_samples
    test_accuracy = correct / num_samples

    print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
        test_loss, 100. * test_accuracy))

### main function

In [None]:
if __name__ == '__main__':
    # Entry point. The settings (`use_horovod`, `cuda`, `batch_size`, ...) are
    # read below as module-level names, so set_hyperparameters() is expected
    # to make them available at module scope.
    set_hyperparameters()
    kwargs = {}
    if use_horovod:
        kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
        # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
        # issues with Infiniband implementations that are not fork-safe
        if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and
                mp._supports_context and 'forkserver' in mp.get_all_start_methods()):
            kwargs['multiprocessing_context'] = 'forkserver'

        ##### HOROVOD #####
        train_dataset = \
            datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))
        ##### HOROVOD #####
        # Horovod: use DistributedSampler to partition the training data.
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
        ##### TODO:Need argument #####
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs)
        ##### HOROVOD #####
        test_dataset = \
            datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,))
            ]))
        ##### HOROVOD #####
        # Horovod: use DistributedSampler to partition the test data.
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
        ##### TODO:Need argument #####
        test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size,
                                                  sampler=test_sampler, **kwargs)
    else:
        # Single-process path: build everything this branch needs locally
        # instead of relying on names defined in later cells.
        download_root = 'data'
        mnist_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])
        train_dataset = datasets.MNIST(download_root, transform=mnist_transform,
                                       train=True, download=True)
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True)
        test_dataset = datasets.MNIST(download_root, transform=mnist_transform,
                                      train=False, download=True)
        valid_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True)
        test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False)

    model = Net()

    # The learning rate is only rescaled for distributed runs.
    lr_scaler = 1
    if use_horovod:
        ##### HOROVOD #####
        # By default, Adasum doesn't need scaling up learning rate.
        lr_scaler = hvd.size() if not use_adasum else 1

        # Only the Horovod train()/test() move batches to the GPU, so the model
        # is moved only on this path; the single-process fit()/test() feed
        # batches straight from the loader.
        if cuda:
            # Move model to GPU.
            model.cuda()
            # If using GPU Adasum allreduce, scale learning rate by local_size.
            ##### TODO:Need argument #####
            if use_adasum and hvd.nccl_built():
                lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    ##### TODO:Need argument #####
    optimizer = optim.SGD(model.parameters(), lr=lr * lr_scaler,
                          momentum=momentum)

    if use_horovod:
        ##### HOROVOD #####
        # Horovod: broadcast parameters & optimizer state.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        ##### HOROVOD #####
        # Horovod: (optional) compression algorithm.
        # compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
        compression = hvd.Compression.none

        ##### HOROVOD #####
        # Horovod: wrap optimizer with DistributedOptimizer.
        optimizer = hvd.DistributedOptimizer(optimizer,
                                             named_parameters=model.named_parameters(),
                                             compression=compression,
                                             op=hvd.Adasum if use_adasum else hvd.Average)

        for epoch in range(1, epochs + 1):
            train(epoch)
            test()
    else:
        # train() depends on the distributed sampler, so the single-process
        # path uses fit() instead and evaluates once at the end.
        # NOTE(review): `test` resolves to whichever test() definition was
        # executed last; confirm it is the non-distributed one here.
        fit(epochs, model, F.nll_loss, optimizer, train_loader)
        test()

# Test Codes Below

### Data Preprocessing

A PyTorch dataset pairs each input x with its label y (the expected output), so that samples can be accessed in a uniform way.

In [70]:
from torchvision import datasets, transforms

# Directory the MNIST files are downloaded into / read from.
download_root = 'test-data'

# Convert images to tensors, then normalise with the given mean and std.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Training split, downloaded on first use.
train_dataset = datasets.MNIST(
    download_root, train=True, transform=mnist_transform, download=True)

In [71]:
train_dataset

Dataset MNIST
    Number of datapoints: 60000
    Root location: test-data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )

In [91]:
# `train_data` is a deprecated alias; the image tensor is exposed as `data`.
type(train_dataset.data)



torch.Tensor

### DataLoader
A PyTorch DataLoader makes it easy to iterate over a dataset.
Each iteration yields one mini-batch of `batch_size` samples.

In [72]:
from torch.utils.data import DataLoader

batch_size = 64

# Held-out split; it backs both the validation and the test loader below.
test_dataset = datasets.MNIST(
    download_root, transform=mnist_transform, train=False, download=True)

# Training and validation batches are reshuffled; test batches keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [73]:
# For each loader show: the object, its batch count, batch count * batch size
# (an upper bound, since the last batch may be partial), and the sample count.
for loader in (train_loader, test_loader):
    print(loader)
    print(len(loader))
    print(batch_size * len(loader))
    print(len(loader.dataset))

<torch.utils.data.dataloader.DataLoader object at 0x7f0bbc06ef90>
938
60032
60000
<torch.utils.data.dataloader.DataLoader object at 0x7f0bbc06e8d0>
157
10048
10000


In [74]:
# Print the shapes of the first three and the last two batches of each loader,
# which shows that the final batch is smaller than the rest.
for loader, tail_after in ((train_loader, 935), (test_loader, 154)):
    for idx, (xb, yb) in enumerate(loader):
        if idx < 3 or idx > tail_after:
            print(idx)
            print(xb.shape)
            print(yb.shape)

0
torch.Size([64, 1, 28, 28])
torch.Size([64])
1
torch.Size([64, 1, 28, 28])
torch.Size([64])
2
torch.Size([64, 1, 28, 28])
torch.Size([64])
936
torch.Size([64, 1, 28, 28])
torch.Size([64])
937
torch.Size([32, 1, 28, 28])
torch.Size([32])
0
torch.Size([64, 1, 28, 28])
torch.Size([64])
1
torch.Size([64, 1, 28, 28])
torch.Size([64])
2
torch.Size([64, 1, 28, 28])
torch.Size([64])
155
torch.Size([64, 1, 28, 28])
torch.Size([64])
156
torch.Size([16, 1, 28, 28])
torch.Size([16])


### Make Network

In [75]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Two-convolution, two-linear-layer classifier producing 10 log-probabilities."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Map a batch of single-channel images to per-class log-probabilities."""
        # First stage: conv -> 2x2 max-pool -> ReLU.
        stage1 = F.relu(F.max_pool2d(self.conv1(x), 2))
        # Second stage adds channel dropout between the conv and the pooling.
        dropped = self.conv2_drop(self.conv2(stage1))
        stage2 = F.relu(F.max_pool2d(dropped, 2))
        # Flatten to the 320 features the first linear layer expects.
        flat = stage2.view(-1, 320)
        hidden = F.dropout(F.relu(self.fc1(flat)), training=self.training)
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

### Define a model factory function, because the optimizer needs the model's parameters
optimizer --- SGD, AdaGrad, Momentum, Adam, RMSProp, ...

The optimizer uses the gradients computed by backpropagation to update the weights.

In [76]:
from torch import optim

lr = 0.01  # learning rate


def get_model():
    """Create a new `Net` and an SGD optimizer over its parameters.

    The learning rate is taken from the module-level `lr`.

    Returns:
        tuple: `(model, optimizer)`.
    """
    net = Net()
    return net, optim.SGD(net.parameters(), lr=lr)

In [77]:
# Build the network together with the SGD optimizer bound to its parameters.
model, optimizer = get_model()

### Define Loss Function

In [78]:
# Negative log-likelihood loss; it expects log-probabilities, which is what
# Net.forward returns via log_softmax.
loss_func = F.nll_loss

In [79]:
loss_func

<function torch.nn.functional.nll_loss(input, target, weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='mean')>

### Fit Method

check trainloader length

In [80]:
print(len(train_loader))
print(len(train_loader.dataset))

938
60000


In [81]:
def fit(epochs, model, loss_func, optimizer, train_loader):
    """Run `epochs` training passes over `train_loader`, logging periodically.

    Args:
        epochs: number of passes over the training data.
        model: network to train.
        loss_func: callable `(output, target) -> scalar loss tensor`.
        optimizer: optimizer that updates the model's parameters.
        train_loader: iterable of `(data, target)` batches exposing `.dataset`.

    A progress line is printed every `log_interval` batches; `log_interval`
    is read from module scope.
    """
    for epoch_idx in range(epochs):
        model.train()
        for step, (inputs, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            batch_loss = loss_func(model(inputs), labels)
            batch_loss.backward()
            optimizer.step()
            if step % log_interval != 0:
                continue
            # Epochs are displayed 1-based; progress is relative to the full dataset.
            seen = step * len(inputs)
            total = len(train_loader.dataset)
            percent = 100. * step / len(train_loader)
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch_idx + 1, seen, total, percent, batch_loss.item()))

### Test Function

In [85]:
def accuracy(out, yb):
    """Return the fraction of rows in `out` whose arg-max equals the label.

    Args:
        out: 2-D tensor of per-class scores, one row per sample.
        yb: 1-D tensor of integer class labels.

    Returns:
        0-dim float tensor with the mean accuracy of the batch.
    """
    hits = out.argmax(dim=1).eq(yb)
    return hits.float().mean()

def test():
    """Evaluate `model` on `test_loader` and print average loss and accuracy.

    Reads `model`, `test_loader`, `loss_func` and `accuracy` from module
    scope. Per-batch means are weighted by batch size so that a smaller final
    batch does not skew the dataset-level averages.
    """
    model.eval()
    test_loss = 0.
    test_accuracy = 0.
    # Evaluate without autograd and accumulate plain floats; otherwise each
    # batch's computation graph stays referenced by the running sum.
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            local_loss = loss_func(output, target)
            local_accuracy = accuracy(output, target)
            test_loss += local_loss.item() * len(data)
            test_accuracy += local_accuracy.item() * len(data)
    test_loss /= len(test_loader.dataset)
    test_accuracy /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'
          .format(test_loss, 100. * test_accuracy))

In [86]:
len(test_loader)
#test()

157

In [87]:
# Run configuration for this experiment; fit() reads `log_interval` from
# module scope.
epochs = 5
log_interval = 50

# NOTE(review): `loss_log` is not written to by any code visible here.
loss_log = []
# Train for `epochs` passes, then evaluate once on the test set.
fit(epochs, model, loss_func, optimizer, train_loader)
test()


Test set: Average loss: 0.0713, Accuracy: 97.74%



### Data Load for Horovod

In [None]:
# Extra DataLoader options are only used when running on the GPU.
kwargs = {}
if cuda:
    kwargs = {'num_workers': 1, 'pin_memory': True}

# When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
# issues with Infiniband implementations that are not fork-safe
if kwargs.get('num_workers', 0) > 0:
    if getattr(mp, '_supports_context', False) and 'forkserver' in mp.get_all_start_methods():
        kwargs['multiprocessing_context'] = 'forkserver'

##### HOROVOD #####
# Each rank gets its own data directory.
train_dataset = datasets.MNIST(
    'data-%d' % hvd.rank(),
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]),
)
##### HOROVOD #####
# Horovod: use DistributedSampler to partition the training data.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
##### TODO:Need argument #####
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs)

##### HOROVOD #####
test_dataset = datasets.MNIST(
    'data-%d' % hvd.rank(),
    train=False,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]),
)
##### HOROVOD #####
# Horovod: use DistributedSampler to partition the test data.
test_sampler = torch.utils.data.distributed.DistributedSampler(
    test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
##### TODO:Need argument #####
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=test_batch_size, sampler=test_sampler, **kwargs)

### Free Test Codes Below

In [None]:
# Collect one (loss, batch_size) pair per test batch.
ziplist = [loss_batch(model, loss_func, xb, yb) for xb, yb in test_loader]
# NOTE(review): this value is not shown because it is not the cell's last expression.
len(ziplist)
# print(*ziplist)
# print(ziplist)
# Transpose the list of pairs into a tuple of losses and a tuple of batch sizes.
losses, nums = zip(*ziplist)
print(losses)
print(nums)

In [44]:
import numpy as np

# F.nll_loss is the negative log-likelihood loss only: it expects
# log-probabilities as input (the model's forward() ends with log_softmax).
loss_func = F.nll_loss

# for xb, yb in test_loader:
#     print("Output Size:{:d}, Label Size:{:d}".format(len(model(xb)),len(yb)))

def loss_batch(model, loss_func, xb, yb, opt=None):
    """Compute the loss for one batch and optionally take an optimizer step.

    Args:
        model: callable mapping `xb` to predictions.
        loss_func: callable `(predictions, yb) -> scalar loss tensor`.
        xb: input batch.
        yb: target batch.
        opt: optional optimizer; when given, the loss is back-propagated,
            one step is taken, and the gradients are cleared afterwards.

    Returns:
        tuple: `(loss value as a Python number, number of items in xb)`.
    """
    batch_loss = loss_func(model(xb), yb)

    if opt is None:
        return batch_loss.item(), len(xb)

    batch_loss.backward()
    opt.step()
    opt.zero_grad()
    return batch_loss.item(), len(xb)

# One (mean batch loss, batch size) pair per test batch.
lb = [loss_batch(model, loss_func, xb, yb) for xb, yb in test_loader]
losses, num = zip(*lb)
# print(lb)
print('-----losses-----')
print(losses)
print('-----num-----')
print(num)

# Size-weighted mean of the per-batch losses, i.e. the mean loss per sample.
weighted = np.multiply(losses, num)
test_loss = np.sum(weighted) / np.sum(num)
# Plain sum of the per-batch means, printed for comparison.
test_loss2 = np.sum(losses)

print('-----sum of num-----')
print(np.sum(num))
print("Test Loss: {:f}".format(test_loss))
print("Test Loss: {:f}".format(test_loss2))



-----losses-----
(2.319767475128174, 2.309004068374634, 2.3203701972961426, 2.2952890396118164, 2.3022823333740234, 2.317312479019165, 2.2929506301879883, 2.3257343769073486, 2.316655158996582, 2.3386337757110596, 2.3223235607147217, 2.292008876800537, 2.3388988971710205, 2.2959821224212646, 2.3093717098236084, 2.2928993701934814, 2.3463830947875977, 2.3816003799438477, 2.3068344593048096, 2.3595120906829834, 2.3470187187194824, 2.319748878479004, 2.2877745628356934, 2.31534743309021, 2.3542819023132324, 2.3139638900756836, 2.3232228755950928, 2.348968744277954, 2.320908308029175, 2.3423118591308594, 2.310112953186035, 2.2821013927459717, 2.32356595993042, 2.3086283206939697, 2.2934176921844482, 2.33429217338562, 2.283425807952881, 2.321873426437378, 2.282600164413452, 2.360349178314209, 2.316889524459839, 2.3511197566986084, 2.269629955291748, 2.317993402481079, 2.3207175731658936, 2.319084405899048, 2.3116507530212402, 2.3352508544921875, 2.3578150272369385, 2.3142197132110596, 2.301

In [50]:
import torch

aaa = np.arange(0,10)
bbb = torch.tensor(aaa)

In [52]:
torch.argmax(bbb,dim=1)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [61]:
# Inspect the predictions for the first test batch only. Pulling a single
# batch from the iterator avoids looping over the whole loader just to use
# batch 0.
xb, yb = next(iter(test_loader))
preds = model(xb)
argmax = torch.argmax(preds, dim=1)
print(argmax)
print(preds)
print(preds.shape)

tensor([6, 0, 1, 0, 0, 3, 1, 0, 0, 0, 8, 6, 1, 1, 0, 3, 0, 0, 0, 2, 7, 0, 0, 3,
        1, 0, 2, 3, 4, 9, 3, 0, 0, 0, 0, 1, 2, 1, 0, 0, 9, 1, 1, 0, 2, 0, 2, 0,
        2, 0, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 9, 0, 3, 0],
       grad_fn=<NotImplemented>)
tensor([[-2.2407, -2.2118, -2.3539, -2.2382, -2.4272, -2.4221, -2.1456, -2.4964,
         -2.2202, -2.3283],
        [-1.9271, -2.0202, -2.1700, -2.0995, -2.5577, -2.3716, -2.7399, -2.8670,
         -2.1804, -2.5277],
        [-2.1736, -2.1110, -2.2403, -2.4793, -2.4560, -2.5213, -2.1798, -2.2833,
         -2.3858, -2.2854],
        [-2.0224, -2.3911, -2.2722, -2.2848, -2.3098, -2.6901, -2.1472, -2.1753,
         -2.3258, -2.5770],
        [-2.1855, -2.2223, -2.2133, -2.2657, -2.4246, -2.3712, -2.2962, -2.4516,
         -2.2840, -2.3479],
        [-2.0782, -2.0930, -2.2328, -2.0661, -2.3032, -2.3943, -2.7454, -2.4158,
         -2.5766, -2.3342],
        [-2.3819, -2.1083, -2.2407, -2.2821, -2.3821, -2.4858, -2.1300, -2.2365,
         -2.474