In [3]:
import time

import torch
import torch.nn as nn

In [4]:
torch.cuda.is_available()

True

### Create Tensors Directly on the Target device

In [5]:
# Creating on CPU -> copying to GPU
start_time = time.time()
for _ in range(100):
    cpu_tensor = torch.ones((1000, 64, 64))
    gpu_tensor = cpu_tensor.cuda()

print(gpu_tensor.device)
print(f"Total time: {time.time() - start_time:.3f}s")

cuda:0
Total time: 0.982s


In [6]:
# Creating directly on GPU
start_time = time.time()
for _ in range(100):
    gpu_tensor = torch.ones((1000, 64, 64), device='cuda')

print(gpu_tensor.device)
print(f"Total time: {time.time() - start_time:.3f}s")

cuda:0
Total time: 0.101s


### Use Sequential Layers when possible

In [7]:
# without Sequential 
class ExampleModel(nn.Module):
    def __init__(self):
        super().__init__()

        input_size = 2
        output_size = 3
        hidden_size = 16

        self.input_layer = nn.Linear(input_size, hidden_size)
        self.input_activation = nn.ReLU()

        self.mid_layer = nn.Linear(hidden_size, hidden_size)
        self.mid_activation = nn.ReLU()

        self.output_layer = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        z = self.input_layer(x)
        z = self.input_activation(z)

        z = self.mid_layer(z)
        z = self.mid_activation(z)

        out = self.output_layer(z)
        return out

In [8]:
example_model = ExampleModel()
print(example_model)
print(f"Output Shape: {example_model(torch.ones([100, 2])).shape}")

ExampleModel(
  (input_layer): Linear(in_features=2, out_features=16, bias=True)
  (input_activation): ReLU()
  (mid_layer): Linear(in_features=16, out_features=16, bias=True)
  (mid_activation): ReLU()
  (output_layer): Linear(in_features=16, out_features=3, bias=True)
)
Output Shape: torch.Size([100, 3])


In [9]:
# with Sequential
class ExampleModel(nn.Module):
    def __init__(self):
        super().__init__()

        input_size = 2
        output_size = 3
        hidden_size = 16

        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        out = self.layers(x)
        return out


In [10]:
example_model = ExampleModel()
print(example_model)
print(f"Output Shape: {example_model(torch.ones([100, 2])).shape}")

ExampleModel(
  (layers): Sequential(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=3, bias=True)
  )
)
Output Shape: torch.Size([100, 3])


### Don't Make Lists of Layers

In [11]:
class BadListModel(nn.Module):
    def __init__(self):
        super().__init__()

        input_size = 2
        output_size = 3
        hidden_size = 16

        self.input_layer = nn.Linear(input_size, hidden_size)
        self.input_activation = nn.ReLU()

        # Generally used in residual layers
        self.mid_layers = []
        for _ in range(5):
            self.mid_layers.append(nn.Linear(hidden_size, hidden_size))
            self.mid_layers.append(nn.ReLU())

        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        z = self.input_layer(x)
        z = self.input_activation(z)

        for layer in self.mid_layers:
            z = layer(z)
        
        out = self.output_layer(z)
        return out

In [12]:
example_model = BadListModel()
print(example_model)
print(f"Output Shape: {example_model(torch.ones([100, 2])).shape}")

BadListModel(
  (input_layer): Linear(in_features=2, out_features=16, bias=True)
  (input_activation): ReLU()
  (output_layer): Linear(in_features=16, out_features=3, bias=True)
)
Output Shape: torch.Size([100, 3])


Looks fine but `ERROR` occurs when moving model and input to GPU!

In [13]:
gpu_input = torch.ones([100, 2], device='cuda')
gpu_example_model = example_model.cuda()
print(f"Output Shape: {gpu_example_model(gpu_input).shape}")

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)

`ERROR` occurs because the internal hidden layers in the list are not tracked and thus not moved to GPU.

In [14]:
class CorrectListModel(nn.Module):
    def __init__(self):
        super().__init__()

        input_size = 2
        output_size = 3
        hidden_size = 16

        self.input_layer = nn.Linear(input_size, hidden_size)
        self.input_activation = nn.ReLU()

        # Generally used in residual layers
        self.mid_layers = []
        for _ in range(5):
            self.mid_layers.append(nn.Linear(hidden_size, hidden_size))
            self.mid_layers.append(nn.ReLU())

        # pass the list through a Sequential layer to keep track and move to GPU automatically
        self.mid_layers = nn.Sequential(*self.mid_layers)
        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        z = self.input_layer(x)
        z = self.input_activation(z)

        for layer in self.mid_layers:
            z = layer(z)
        
        out = self.output_layer(z)
        return out

In [15]:
example_model = CorrectListModel()
print(example_model)
print(f"Output Shape: {example_model(torch.ones([100, 2])).shape}")

gpu_input = torch.ones([100, 2], device='cuda')
gpu_example_model = example_model.cuda()
print(f"Output Shape: {gpu_example_model(gpu_input).shape}")

CorrectListModel(
  (input_layer): Linear(in_features=2, out_features=16, bias=True)
  (input_activation): ReLU()
  (mid_layers): Sequential(
    (0): Linear(in_features=16, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=16, bias=True)
    (5): ReLU()
    (6): Linear(in_features=16, out_features=16, bias=True)
    (7): ReLU()
    (8): Linear(in_features=16, out_features=16, bias=True)
    (9): ReLU()
  )
  (output_layer): Linear(in_features=16, out_features=3, bias=True)
)
Output Shape: torch.Size([100, 3])
Output Shape: torch.Size([100, 3])


### Make Use of Distributions

In [16]:
# Setup
example_model = ExampleModel()
input_tensor = torch.rand(5, 2)
output = example_model(input_tensor)
print(output)

tensor([[ 0.1695,  0.1188, -0.1798],
        [ 0.1945,  0.1045, -0.1553],
        [ 0.1858,  0.1026, -0.1594],
        [ 0.2201,  0.0296, -0.1103],
        [ 0.2055,  0.0368, -0.1366]], grad_fn=<AddmmBackward0>)


In [17]:
from torch.distributions import Categorical
from torch.distributions.kl import kl_divergence

In [18]:
# without using softmax
dist = Categorical(logits=output)
dist

Categorical(logits: torch.Size([5, 3]))

In [19]:
dist.probs

tensor([[0.3765, 0.3579, 0.2655],
        [0.3819, 0.3490, 0.2691],
        [0.3805, 0.3501, 0.2694],
        [0.3929, 0.3248, 0.2824],
        [0.3914, 0.3306, 0.2780]], grad_fn=<SoftmaxBackward0>)

In [20]:
# same as softmax
e_x = torch.exp(output) 
e_x / torch.sum(e_x, dim=1, keepdim=True) 

tensor([[0.3765, 0.3579, 0.2655],
        [0.3819, 0.3490, 0.2691],
        [0.3805, 0.3501, 0.2694],
        [0.3929, 0.3248, 0.2823],
        [0.3914, 0.3306, 0.2780]], grad_fn=<DivBackward0>)

In [21]:
# sampling using the prob dist
dist.sample()

tensor([1, 2, 1, 1, 1])

In [22]:
# calcuate KL-Divergence
dist_1 = Categorical(logits=output[0])
dist_2 = Categorical(logits=output[1])
kl_divergence(dist_1, dist_2)

tensor(0.0002, grad_fn=<SumBackward1>)

### Use `detach()` on long-term Metrics

In [23]:
# Setup
example_model = ExampleModel()
data_batches = [torch.rand(10, 2) for _ in range(5)]
criterion = nn.MSELoss(reduction='mean')

In [24]:
# Bad Example
losses = []
# Training loop
for batch in data_batches:
    output = example_model(batch)
    target = torch.rand((10, 3))
    loss = criterion(output, target)
    losses.append(loss)
    # Optimization code here
print(losses)


[tensor(0.5188, grad_fn=<MseLossBackward0>), tensor(0.4238, grad_fn=<MseLossBackward0>), tensor(0.4574, grad_fn=<MseLossBackward0>), tensor(0.3861, grad_fn=<MseLossBackward0>), tensor(0.4921, grad_fn=<MseLossBackward0>)]


All the gradients for each loss is tracked and unnecessary. Could cause memory leaks!

Use:
- detach() - "detaches" the tensor from the computation graph and stores only the `tensor`
- item() - extracts the scalar value from the tensor and stores it (not the tensor)

In [25]:
# Better way
losses_detach = []
losses_item = []

# Training loop
for batch in data_batches:
    output = example_model(batch)
    target = torch.rand((10, 3))
    loss = criterion(output, target)
    losses_detach.append(loss.detach())
    losses_item.append(loss.item())
    # Optimization code here
print(losses_detach)
print(losses_item)

[tensor(0.4588), tensor(0.4431), tensor(0.3487), tensor(0.4100), tensor(0.4622)]
[0.458821564912796, 0.44308847188949585, 0.3487340211868286, 0.4100143015384674, 0.4622393548488617]


### Memory summary

In [26]:
# Built-in torch funcs
print(torch.cuda.memory_summary())
print(f"Memory allocated: {torch.cuda.memory_allocated() // 1024} KB")
print(f"Memory cached: {torch.cuda.memory_cached() // 1024} KB")
print(f"Memory cached: {torch.cuda.memory_reserved() // 1024} KB") # memory_cached renamed

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  24724 KiB |  32768 KiB |   3216 MiB |   3192 MiB |
|       from large pool |  24704 KiB |  32768 KiB |   3208 MiB |   3184 MiB |
|       from small pool |     20 KiB |   1084 KiB |      8 MiB |      8 MiB |
|---------------------------------------------------------------------------|
| Active memory         |  24724 KiB |  32768 KiB |   3216 MiB |   3192 MiB |
|       from large pool |  24704 KiB |  32768 KiB |   3208 MiB |   3184 MiB |
|       from small pool |     20 KiB |   1084 KiB |      8 MiB |      8 MiB |
|---------------------------------------------------------------

  print(f"Memory cached: {torch.cuda.memory_cached() // 1024} KB")


### Trick to Delete a Model from GPU

In [27]:
def print_stats():
    print(f"Memory allocated: {torch.cuda.memory_allocated() // 1024} KB")
    print(f"Memory cached: {torch.cuda.memory_reserved() // 1024} KB") # pt rename memory_cached -> memory_reserved

In [28]:
example_model = ExampleModel().cuda()
print_stats()
del example_model
print_stats()

Memory allocated: 24727 KB
Memory cached: 34816 KB
Memory allocated: 24724 KB
Memory cached: 34816 KB


Cache is not deleted!

In [29]:
import gc

example_model = ExampleModel().cuda()
print("Model initialized")
print_stats()

del example_model
print("\nModel deleted")
print_stats()

gc.collect() # good practice
print("\nGC")
print_stats()

torch.cuda.empty_cache()
print("\nEmptied cache!")
print_stats()

Model initialized
Memory allocated: 24727 KB
Memory cached: 34816 KB

Model deleted
Memory allocated: 24724 KB
Memory cached: 34816 KB

GC
Memory allocated: 24724 KB
Memory cached: 34816 KB

Emptied cache!
Memory allocated: 24724 KB
Memory cached: 34816 KB


### Call `eval()` before Testing

In [30]:
example_model = ExampleModel()

# Training mode
example_model.train()

# Testing mode
example_model.eval() # OR example_model.train(mode=False)

ExampleModel(
  (layers): Sequential(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=3, bias=True)
  )
)

**NOTE**: 

`model.eval()` notifies all layers - affects layers like InstanceNorm, BatchNorm, Dropout (since they behave differently in training and testing phases)

`with torch.no_grad():` affects only the autograd engine - deactivates it and speeds up computation!

### Overfit a single batch before actual training run

### Call `.zero_grad()` before `.backward()` (but only once before gradient accumalation)

### Don't use `Softmax` when using `CrossEntropy`

Already uses it inside the PyTorch loss function implementation.

### Bias term not need with BatchNorm (just unneccessary, nothing breaks!)

### `View` vs `Permute`

### Careful when applying Data Augmentation

- Flipping and rotating (180 degs) in digit dataset - check `6` vs `9` confusion matrix

### Shuffle the Data!

### Normalize the Data!

### Gradient (Norm) Clipping