# Tutorial 10

### Today's Topic:
* Batch Normalization
* Residual Neural Network
* Pytorch utilizing GPU speedup 
* MGCF cluster resources for running gpu calculations



## Batch Normalization
Batch normalization (also known as batch norm) is a method used to make artificial neural networks faster and more stable through normalization of the layers' inputs by re-centering and re-scalin
Documentation: https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html <br>
expected input of size (N, C, H, W) <br>
the Batch Normalization is done over the C dimension, computing statistics on (N, H, W) slices

In [1]:
import torch
from torch import nn
inp = torch.randn(20, 100, 35, 45)
bn = nn.BatchNorm2d(100)
output=bn(inp)
output.shape

torch.Size([20, 100, 35, 45])

In [4]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv = nn.ModuleList([nn.Conv2d(1,6,kernel_size=3,padding=1), #before pooling (B,6,32,32)
                                  nn.Conv2d(6,24,kernel_size=3,padding=1), # (B,24,16,16)
                                  nn.Conv2d(24,12,kernel_size=5)]) # (B,12,4,4)
        self.pooling = nn.MaxPool2d(kernel_size=2)
        self.fc = nn.ModuleList([nn.Linear(192,192),nn.Linear(192,10)])
        self.activation = nn.ReLU()
        self.bn = [nn.BatchNorm2d(6),nn.BatchNorm2d(24),nn.BatchNorm2d(12)]
        
    def forward(self, x):
        for i in range(2):
            x = self.pooling(self.activation(self.bn[i](self.conv[i](x))))
        x = nn.Flatten()(self.activation(self.bn[2](self.conv[2](x))))
        x = self.activation(self.fc[0](x))
        x = nn.Softmax(dim=-1)(self.fc[1](x))
        return x
    

In [5]:
cnn = CNN()
print(cnn)

CNN(
  (conv): ModuleList(
    (0): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): Conv2d(6, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (2): Conv2d(24, 12, kernel_size=(5, 5), stride=(1, 1))
  )
  (pooling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): ModuleList(
    (0): Linear(in_features=192, out_features=192, bias=True)
    (1): Linear(in_features=192, out_features=10, bias=True)
  )
  (activation): ReLU()
)


In [7]:
cnn(torch.randn(20,1,32,32)).shape

torch.Size([20, 10])

## Residual Neural Network

### Additive vs concatenative skip connections

![](Additive-skip-connections-vs-concatenative-skip-connections-Rectangles-represent-data.png)


In [8]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv = nn.ModuleList([nn.Conv2d(1,6,kernel_size=3,padding=1), #before pooling (B,6,32,32)
                                  nn.Conv2d(6,24,kernel_size=3,padding=1), # (B,24,16,16)
                                  nn.Conv2d(24,12,kernel_size=5)]) # (B,12,4,4)
        self.pooling = nn.MaxPool2d(kernel_size=2)
        self.fc = nn.ModuleList([nn.Linear(192,192),nn.Linear(192,10)])
        self.activation = nn.ReLU()
        self.bn = nn.ModuleList([nn.BatchNorm2d(6),nn.BatchNorm2d(24),nn.BatchNorm2d(12)])
        
    def forward(self, inp):
        residual = inp
        x = self.bn[0](self.conv[0](inp))
        x = x+residual
        x = self.pooling(self.activation(x))
        x = self.pooling(self.activation(self.bn[1](self.conv[1](x))))
        x = nn.Flatten()(self.activation(self.bn[2](self.conv[2](x))))
        res2 = x
        y = self.fc[0](x)
        y = y+res2
        y = self.activation(y)
        y = nn.Softmax(dim=-1)(self.fc[1](y))
        return y

In [9]:
net = CNN()
print(net)
net(torch.randn(20, 1, 32, 32))

CNN(
  (conv): ModuleList(
    (0): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): Conv2d(6, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (2): Conv2d(24, 12, kernel_size=(5, 5), stride=(1, 1))
  )
  (pooling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): ModuleList(
    (0): Linear(in_features=192, out_features=192, bias=True)
    (1): Linear(in_features=192, out_features=10, bias=True)
  )
  (activation): ReLU()
)


tensor([[0.0631, 0.1010, 0.1147, 0.0946, 0.1355, 0.1112, 0.0405, 0.0585, 0.2029,
         0.0780],
        [0.0475, 0.0992, 0.0883, 0.0634, 0.1551, 0.0577, 0.0239, 0.0528, 0.1920,
         0.2201],
        [0.1259, 0.1470, 0.1011, 0.1420, 0.1098, 0.0932, 0.0559, 0.0546, 0.0928,
         0.0776],
        [0.0746, 0.1018, 0.1461, 0.0732, 0.0977, 0.1013, 0.0859, 0.0697, 0.1884,
         0.0614],
        [0.0670, 0.0988, 0.0753, 0.1209, 0.1242, 0.1377, 0.0501, 0.0726, 0.1303,
         0.1231],
        [0.0525, 0.1411, 0.1190, 0.0990, 0.1273, 0.1060, 0.0420, 0.0347, 0.1723,
         0.1063],
        [0.0653, 0.1611, 0.1133, 0.1292, 0.1492, 0.1292, 0.0426, 0.0285, 0.1235,
         0.0581],
        [0.0657, 0.1586, 0.0416, 0.1459, 0.1279, 0.0620, 0.0569, 0.1083, 0.1045,
         0.1286],
        [0.0398, 0.0838, 0.1870, 0.1211, 0.1329, 0.0888, 0.0661, 0.0714, 0.1463,
         0.0629],
        [0.1144, 0.1105, 0.0934, 0.0829, 0.0905, 0.0736, 0.0770, 0.0598, 0.1404,
         0.1575],
        [0

## using GPU resources

### checking available resources

In [10]:
torch.cuda.is_available()

True

To get the number of GPUs available.

In [11]:
torch.cuda.device_count()

4

In [12]:
torch.cuda.get_device_name(0)

'A100-SXM4-40GB'

### Move tensors to gpu

By default, the tensors are generated on the CPU. Even the model is initialized on the CPU. Thus one has to manually ensure that the operations are done using GPU. 


In [13]:
X_train = torch.FloatTensor([0., 1., 2.])
X_train.is_cuda

False

In [15]:
X_train.get_device()

-1

It's a common PyTorch practice to initialize a variable, usually named device that will hold the device we’re training on (CPU or GPU). 

In [19]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [20]:
X_train = X_train.to(device)
X_train.is_cuda

True

In [21]:
X_train.get_device()

0

The same logic applies to the model. 


In [22]:
model = CNN()
model.to(device)

CNN(
  (conv): ModuleList(
    (0): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): Conv2d(6, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (2): Conv2d(24, 12, kernel_size=(5, 5), stride=(1, 1))
  )
  (pooling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): ModuleList(
    (0): Linear(in_features=192, out_features=192, bias=True)
    (1): Linear(in_features=192, out_features=10, bias=True)
  )
  (activation): ReLU()
)

### Move tensors back to CPU

In [23]:
X_train = X_train.cpu()
X_train.get_device()

-1

In [24]:
X = torch.FloatTensor([0., 1., 2.])
Y = torch.FloatTensor([0., 1., 2.])
X = X.cuda()

In [25]:
X+Y

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [26]:
Y = Y.to(device)

In [28]:
(X+Y).cpu().numpy()

array([0., 2., 4.], dtype=float32)

## MGCF cluster resources
https://docs.google.com/document/d/1lIkJ6g772Ss5e-4CJ_xGjlVRfOVUq6gYnyGiEhtBc-Q/edit?usp=sharing