# [Single-Machine Model Parallel Best Practices](https://pytorch.org/tutorials/intermediate/model_parallel_tutorial.html)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchsummary import summary 

In [2]:
# Sizing note: models require space for 2 x features, covering the forward
# and the backward pass.

# Alternative sizing for a K620: features = int(1.515E4)

# Sizing for an RTX 2080 Ti.
features = 37_500  # width (in and out) of every Linear layer
gpu_memory = 11 * 1024  # presumably per-GPU memory in MB -- confirm for your card

In [3]:
class ToyModel(nn.Module):
    """Four stacked square ``Linear`` layers, each placed on its own device.

    The layers are applied in order with a ReLU after the first three; the
    activation tensor is moved to the next layer's device before each layer
    runs, so the output lives on the last layer's device.

    Args:
        num_features: Input and output width of every layer. ``None`` (the
            default) falls back to the notebook-level ``features`` constant.
        devices: Exactly four device specifiers, one per layer in order.
            Defaults to ``cuda:0`` .. ``cuda:3``.

    Raises:
        ValueError: If ``devices`` does not contain exactly four entries.
    """

    def __init__(self, num_features=None,
                 devices=('cuda:0', 'cuda:1', 'cuda:2', 'cuda:3')):
        super().__init__()
        if num_features is None:
            # Preserve the original zero-argument behaviour.
            num_features = features
        devices = tuple(devices)
        if len(devices) != 4:
            raise ValueError(
                'ToyModel needs exactly 4 devices, got {}'.format(len(devices))
            )
        # Attribute names are kept as net1..net4 so state_dict keys are unchanged.
        self.net1 = nn.Linear(num_features, num_features).to(devices[0])
        self.net2 = nn.Linear(num_features, num_features).to(devices[1])
        self.net3 = nn.Linear(num_features, num_features).to(devices[2])
        self.net4 = nn.Linear(num_features, num_features).to(devices[3])
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through the four layers and return the last layer's output."""
        # Route by where each layer's weights actually live rather than by a
        # hard-coded device name, so the hops follow the constructor arguments.
        x = self.relu(self.net1(x.to(self.net1.weight.device)))
        x = self.relu(self.net2(x.to(self.net2.weight.device)))
        x = self.relu(self.net3(x.to(self.net3.weight.device)))
        return self.net4(x.to(self.net4.weight.device))

In [4]:
# Build the multi-GPU model, then draw one CPU batch of 20 random samples;
# ToyModel.forward moves the batch onto the first layer's device itself.
model = ToyModel()
input = torch.randn((20, features))

In [5]:
# Bytes held by the model: learnable parameters plus registered buffers.
param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())

# Convert bytes to MB and compare against the combined memory of the four
# GPUs that ToyModel spreads its layers across.
size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))
print(f"total gpu space utilization: {size_all_mb/(gpu_memory*4)}")
summary(model, input_size=input.shape)

model size: 21458.244MB
total gpu space utilization: 0.4762571982362054
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1            [-1, 20, 37500]   1,406,287,500
              ReLU-2            [-1, 20, 37500]               0
            Linear-3            [-1, 20, 37500]   1,406,287,500
              ReLU-4            [-1, 20, 37500]               0
            Linear-5            [-1, 20, 37500]   1,406,287,500
              ReLU-6            [-1, 20, 37500]               0
            Linear-7            [-1, 20, 37500]   1,406,287,500
Total params: 5,625,150,000
Trainable params: 5,625,150,000
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 2.86
Forward/backward pass size (MB): 40.05
Params size (MB): 21458.24
Estimated Total Size (MB): 21501.16
----------------------------------------------------------------


In [6]:
# One SGD step against random regression targets, to exercise the forward
# and backward passes across all devices.
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)
optimizer.zero_grad()
outputs = model(input)
# Draw the targets with the outputs' own shape, dtype and device, so they
# stay in sync with the batch size, the layer width and the model's last
# device, and are created on that device instead of being copied from the CPU.
labels = torch.randn_like(outputs)
loss_fn(outputs, labels).backward()
optimizer.step()